From 5769982bbc402cef0639bfdf11beae836fa622c5 Mon Sep 17 00:00:00 2001 From: Face <69168154+face-hh@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:56:29 +0300 Subject: [PATCH] fix text-related tag events, remove warnings, make search engine not fetch /index.html separately but / --- flumi/Scripts/B9/HTMLParser.gd | 23 ++++++++-------- flumi/Scripts/B9/Lua.gd | 10 +++---- flumi/Scripts/Network.gd | 3 +-- flumi/Scripts/Tags/audio.gd | 2 +- flumi/Scripts/Tags/p.gd | 6 +++++ flumi/Scripts/Utils/BackgroundUtils.gd | 16 ++++++++++-- flumi/Scripts/Utils/Lua/Crumbs.gd | 1 - flumi/Scripts/Utils/Lua/DOM.gd | 24 ++++++++--------- flumi/Scripts/Utils/Lua/Event.gd | 4 +-- flumi/Scripts/Utils/Lua/JSON.gd | 3 +-- flumi/Scripts/Utils/Lua/ThreadedVM.gd | 1 - flumi/Scripts/Utils/Lua/WebSocket.gd | 28 ++++++++++---------- search-engine/frontend/search.lua | 35 +++++++++++++++++++++---- search-engine/src/crawler.rs | 36 +++++++++++++++++++------- tests/signal.html | 2 +- 15 files changed, 123 insertions(+), 71 deletions(-) diff --git a/flumi/Scripts/B9/HTMLParser.gd b/flumi/Scripts/B9/HTMLParser.gd index 5f689f7..645fbb6 100644 --- a/flumi/Scripts/B9/HTMLParser.gd +++ b/flumi/Scripts/B9/HTMLParser.gd @@ -80,12 +80,11 @@ static func unescape_html_entities(text: String) -> String: static func preprocess_html_entities(html: String) -> String: var result = "" var i = 0 - var in_tag = false while i < html.length(): - var char = html[i] + var character = html[i] - if char == "<": + if character == "<": # Check if this starts a valid HTML tag var tag_end = html.find(">", i) if tag_end != -1: @@ -97,11 +96,11 @@ static func preprocess_html_entities(html: String) -> String: continue # If not a valid tag, escape it result += "<" - elif char == ">": + elif character == ">": # Escape standalone > that's not part of a tag result += ">" else: - result += char + result += character i += 1 @@ -431,7 +430,7 @@ func get_all_images() -> Array[String]: func get_all_scripts() -> Array[String]: return get_attribute_values("script", "src") -func process_scripts(lua_api: LuaAPI, lua_vm) -> void: +func process_scripts(lua_api: LuaAPI, _lua_vm) -> void: if not lua_api: print("Warning: Lua API not available for script processing") return @@ -447,9 +446,9 @@ func process_scripts(lua_api: LuaAPI, lua_vm) -> void: parse_result.external_scripts = [] parse_result.external_scripts.append(src) elif not inline_code.is_empty(): - lua_api.execute_lua_script(inline_code, lua_vm) + lua_api.execute_lua_script(inline_code) -func process_external_scripts(lua_api: LuaAPI, lua_vm, base_url: String = "") -> void: +func process_external_scripts(lua_api: LuaAPI, _lua_vm, base_url: String = "") -> void: if not lua_api or not parse_result.external_scripts or parse_result.external_scripts.is_empty(): return @@ -458,7 +457,7 @@ func process_external_scripts(lua_api: LuaAPI, lua_vm, base_url: String = "") -> for script_url in parse_result.external_scripts: var script_content = await Network.fetch_external_resource(script_url, base_url) if not script_content.is_empty(): - lua_api.execute_lua_script(script_content, lua_vm) + lua_api.execute_lua_script(script_content) func get_all_stylesheets() -> Array[String]: return get_attribute_values("style", "src") @@ -470,7 +469,7 @@ func apply_element_styles(node: Control, element: HTMLElement, parser: HTMLParse var text = HTMLParser.get_bbcode_with_styles(element, styles, parser, []) label.text = text -static func apply_element_bbcode_formatting(element: HTMLElement, styles: Dictionary, content: String, parser: HTMLParser = null) -> String: +static func apply_element_bbcode_formatting(element: HTMLElement, styles: Dictionary, content: String) -> String: # Apply general styling first (color, font-weight) for all elements var formatted_content = content @@ -553,10 +552,10 @@ static func get_bbcode_with_styles(element: HTMLElement, styles: Dictionary, par if parser != null: child_styles = parser.get_element_styles_with_inheritance(child, "", new_visited) var child_content = HTMLParser.get_bbcode_with_styles(child, child_styles, parser, new_visited) - child_content = apply_element_bbcode_formatting(child, child_styles, child_content, parser) + child_content = apply_element_bbcode_formatting(child, child_styles, child_content) text += child_content # Apply formatting to the current element itself - text = apply_element_bbcode_formatting(element, styles, text, parser) + text = apply_element_bbcode_formatting(element, styles, text) return text diff --git a/flumi/Scripts/B9/Lua.gd b/flumi/Scripts/B9/Lua.gd index 249fab1..884bf75 100644 --- a/flumi/Scripts/B9/Lua.gd +++ b/flumi/Scripts/B9/Lua.gd @@ -155,7 +155,7 @@ func _gurt_clear_interval_handler(vm: LuauVM) -> int: return timeout_manager.clear_interval_handler(vm) # Location API handlers -func _gurt_location_reload_handler(vm: LuauVM) -> int: +func _gurt_location_reload_handler(_vm: LuauVM) -> int: call_deferred("_reload_current_page") return 0 @@ -634,7 +634,7 @@ func get_dom_node(node: Node, purpose: String = "general") -> Node: return node # Main execution function -func execute_lua_script(code: String, vm: LuauVM): +func execute_lua_script(code: String): if not threaded_vm.lua_thread or not threaded_vm.lua_thread.is_alive(): # Start the thread if it's not running threaded_vm.start_lua_thread(dom_parser, self) @@ -642,8 +642,8 @@ func execute_lua_script(code: String, vm: LuauVM): script_start_time = Time.get_ticks_msec() / 1000.0 threaded_vm.execute_script_async(code) -func _on_threaded_script_completed(result: Dictionary): - var execution_time = (Time.get_ticks_msec() / 1000.0) - script_start_time +func _on_threaded_script_completed(_result: Dictionary): + pass func _on_print_output(message: String): LuaPrintUtils.lua_print_direct(message) @@ -684,7 +684,7 @@ func _handle_dom_operation(operation: Dictionary): "insert_before": LuaDOMUtils.handle_insert_before(operation, dom_parser, self) "insert_after": - LuaDOMUtils.handle_insert_after(operation, dom_parser, self) + LuaDOMUtils.handle_insert_after(operation, dom_parser) "replace_child": LuaDOMUtils.handle_replace_child(operation, dom_parser, self) "focus_element": diff --git a/flumi/Scripts/Network.gd b/flumi/Scripts/Network.gd index 4f5bc66..267db0f 100644 --- a/flumi/Scripts/Network.gd +++ b/flumi/Scripts/Network.gd @@ -87,7 +87,6 @@ func fetch_text(url: String) -> String: var result = response[0] # HTTPClient.Result var response_code = response[1] # int - var headers = response[2] # PackedStringArray var body = response[3] # PackedByteArray http_request.queue_free() @@ -104,7 +103,7 @@ func fetch_external_resource(url: String, base_url: String = "") -> String: if resolved_url.begins_with("http://") or resolved_url.begins_with("https://"): return await fetch_text(resolved_url) elif resolved_url.begins_with("gurt://"): - return await fetch_gurt_resource(resolved_url) + return fetch_gurt_resource(resolved_url) else: return "" diff --git a/flumi/Scripts/Tags/audio.gd b/flumi/Scripts/Tags/audio.gd index 385950a..3786ee8 100644 --- a/flumi/Scripts/Tags/audio.gd +++ b/flumi/Scripts/Tags/audio.gd @@ -118,7 +118,7 @@ func load_audio_async(src: String) -> void: http_request.queue_free() return -func _on_audio_download_completed(result: int, response_code: int, headers: PackedStringArray, body: PackedByteArray): +func _on_audio_download_completed(_result: int, response_code: int, headers: PackedStringArray, body: PackedByteArray): var http_request = get_children().filter(func(child): return child is HTTPRequest)[0] http_request.queue_free() diff --git a/flumi/Scripts/Tags/p.gd b/flumi/Scripts/Tags/p.gd index d5aa743..34f40b0 100644 --- a/flumi/Scripts/Tags/p.gd +++ b/flumi/Scripts/Tags/p.gd @@ -62,6 +62,8 @@ func create_styled_label(text: String, element, parser: HTMLParser) -> RichTextL add_child(label) + parser.register_dom_node(element, label) + var styles = parser.get_element_styles_with_inheritance(element, "", []) StyleManager.apply_styles_to_label(label, styles, element, parser, text) @@ -169,5 +171,9 @@ func create_label(text: String) -> RichTextLabel: label.size_flags_vertical = Control.SIZE_SHRINK_CENTER add_child(label) + + if _element and _parser: + _parser.register_dom_node(_element, label) + call_deferred("_apply_auto_resize_to_label", label) return label diff --git a/flumi/Scripts/Utils/BackgroundUtils.gd b/flumi/Scripts/Utils/BackgroundUtils.gd index 0dc5b14..6aa5a1e 100644 --- a/flumi/Scripts/Utils/BackgroundUtils.gd +++ b/flumi/Scripts/Utils/BackgroundUtils.gd @@ -227,7 +227,7 @@ static func _on_child_mouse_entered(panel: PanelContainer): _on_panel_mouse_entered(panel) static func _on_child_mouse_exited(panel: PanelContainer): - panel.get_tree().create_timer(0.01).timeout.connect(func(): _check_panel_hover(panel)) + _create_panel_check_timer(panel) static func _on_panel_mouse_entered(panel: PanelContainer): panel.set_meta("is_hovering", true) @@ -241,7 +241,19 @@ static func _on_panel_mouse_entered(panel: PanelContainer): StyleManager.apply_transform_properties_direct(transform_target, hover_styles) static func _on_panel_mouse_exited_with_delay(panel: PanelContainer): - panel.get_tree().create_timer(0.01).timeout.connect(func(): _check_panel_hover(panel)) + _create_panel_check_timer(panel) + +static func _create_panel_check_timer(panel: PanelContainer): + if not is_instance_valid(panel): + return + var timer = panel.get_tree().create_timer(0.01) + var panel_ref = weakref(panel) + timer.timeout.connect(func(): _check_panel_hover_safe(panel_ref)) + +static func _check_panel_hover_safe(panel_ref: WeakRef): + var panel = panel_ref.get_ref() + if panel: + _check_panel_hover(panel) static func _check_panel_hover(panel: PanelContainer): if not panel or not is_instance_valid(panel): diff --git a/flumi/Scripts/Utils/Lua/Crumbs.gd b/flumi/Scripts/Utils/Lua/Crumbs.gd index eda9f05..7ba9fab 100644 --- a/flumi/Scripts/Utils/Lua/Crumbs.gd +++ b/flumi/Scripts/Utils/Lua/Crumbs.gd @@ -205,7 +205,6 @@ static func load_all_crumbs(domain: String) -> Dictionary: return {} var crumbs = {} - var current_time = Time.get_ticks_msec() / 1000.0 var changed = false for crumb_name in crumbs_data: diff --git a/flumi/Scripts/Utils/Lua/DOM.gd b/flumi/Scripts/Utils/Lua/DOM.gd index 49d3106..7013211 100644 --- a/flumi/Scripts/Utils/Lua/DOM.gd +++ b/flumi/Scripts/Utils/Lua/DOM.gd @@ -127,7 +127,7 @@ static func handle_element_append(operation: Dictionary, dom_parser: HTMLParser, if parent_dom_node: # Render the appended element - render_new_element.call_deferred(child_element, parent_dom_node, dom_parser, lua_api) + render_new_element.call_deferred(child_element, parent_dom_node, dom_parser) static func handle_element_remove(operation: Dictionary, dom_parser: HTMLParser) -> void: var element_id: String = operation.element_id @@ -190,9 +190,9 @@ static func handle_insert_before(operation: Dictionary, dom_parser: HTMLParser, parent_dom_node = dom_parser.parse_result.dom_nodes.get(parent_id, null) if parent_dom_node: - handle_visual_insertion_by_reference(parent_id, new_child_element, reference_child_id, true, dom_parser, lua_api) + handle_visual_insertion_by_reference(parent_id, new_child_element, reference_child_id, true, dom_parser) -static func handle_insert_after(operation: Dictionary, dom_parser: HTMLParser, lua_api) -> void: +static func handle_insert_after(operation: Dictionary, dom_parser: HTMLParser) -> void: var parent_id: String = operation.parent_id var new_child_id: String = operation.new_child_id var reference_child_id: String = operation.reference_child_id @@ -229,7 +229,7 @@ static func handle_insert_after(operation: Dictionary, dom_parser: HTMLParser, l parent_dom_node = dom_parser.parse_result.dom_nodes.get(parent_id, null) if parent_dom_node: - handle_visual_insertion_by_reference(parent_id, new_child_element, reference_child_id, false, dom_parser, lua_api) + handle_visual_insertion_by_reference(parent_id, new_child_element, reference_child_id, false, dom_parser) static func handle_replace_child(operation: Dictionary, dom_parser: HTMLParser, lua_api) -> void: var parent_id: String = operation.parent_id @@ -262,13 +262,12 @@ static func handle_replace_child(operation: Dictionary, dom_parser: HTMLParser, # Handle visual rendering handle_visual_replacement(old_child_id, new_child_element, parent_id, dom_parser, lua_api) -static func render_new_element(element: HTMLParser.HTMLElement, parent_node: Node, dom_parser: HTMLParser, lua_api) -> void: +static func render_new_element(element: HTMLParser.HTMLElement, parent_node: Node, dom_parser: HTMLParser) -> void: # Get reference to main scene for rendering var main_scene = Engine.get_main_loop().current_scene if not main_scene: return - var element_id = element.get_attribute("id") # Create the visual node for the element var element_node = await main_scene.create_element_node(element, dom_parser) @@ -341,7 +340,7 @@ static func _find_input_control_with_file_info(node: Node) -> Node: return null -static func _get_select_value(element: HTMLParser.HTMLElement, dom_node: Node) -> String: +static func _get_select_value(_element: HTMLParser.HTMLElement, dom_node: Node) -> String: if dom_node is OptionButton: var option_button = dom_node as OptionButton var selected_index = option_button.selected @@ -353,7 +352,7 @@ static func _get_select_value(element: HTMLParser.HTMLElement, dom_node: Node) - return option_button.get_item_text(selected_index) return "" -static func _set_select_value(element: HTMLParser.HTMLElement, dom_node: Node, value: Variant) -> void: +static func _set_select_value(_element: HTMLParser.HTMLElement, dom_node: Node, value: Variant) -> void: if dom_node is OptionButton: var option_button = dom_node as OptionButton var target_value = str(value) @@ -433,7 +432,7 @@ static func clone_element(element: HTMLParser.HTMLElement, deep: bool) -> HTMLPa return cloned -static func handle_visual_insertion_by_reference(parent_element_id: String, new_child_element: HTMLParser.HTMLElement, reference_element_id: String, insert_before: bool, dom_parser: HTMLParser, lua_api) -> void: +static func handle_visual_insertion_by_reference(parent_element_id: String, new_child_element: HTMLParser.HTMLElement, reference_element_id: String, insert_before: bool, dom_parser: HTMLParser) -> void: var parent_dom_node: Node = null if parent_element_id == "body": var main_scene = Engine.get_main_loop().current_scene @@ -646,7 +645,7 @@ static func add_element_methods(vm: LuauVM, lua_api: LuaAPI) -> void: vm.lua_pushcallable(LuaDOMUtils._element_unfocus_wrapper, "element.unfocus") vm.lua_setfield(-2, "unfocus") - _add_classlist_support(vm, lua_api) + add_classlist_support(vm) vm.lua_newtable() vm.lua_pushcallable(LuaDOMUtils._element_index_wrapper, "element.__index") @@ -876,7 +875,7 @@ static func _element_clone_wrapper(vm: LuauVM) -> int: var cloned_element = clone_element(element, deep) # Assign new ID to cloned element - var new_id = lua_api.get_or_assign_element_id(cloned_element) + lua_api.get_or_assign_element_id(cloned_element) # Add to parser's element collection lua_api.dom_parser.parse_result.all_elements.append(cloned_element) @@ -1074,7 +1073,7 @@ static func _element_index_wrapper(vm: LuauVM) -> int: vm.lua_remove(-2) return 1 -static func _add_classlist_support(vm: LuauVM, lua_api: LuaAPI) -> void: +static func add_classlist_support(vm: LuauVM) -> void: vm.lua_newtable() vm.lua_getfield(-2, "_element_id") @@ -1173,7 +1172,6 @@ static func _classlist_toggle_wrapper(vm: LuauVM) -> int: return 0 static func _classlist_contains_wrapper(vm: LuauVM) -> int: - var start_time = Time.get_ticks_msec() var lua_api = vm.get_meta("lua_api") as LuaAPI if not lua_api: diff --git a/flumi/Scripts/Utils/Lua/Event.gd b/flumi/Scripts/Utils/Lua/Event.gd index 7dcf80c..73a77c3 100644 --- a/flumi/Scripts/Utils/Lua/Event.gd +++ b/flumi/Scripts/Utils/Lua/Event.gd @@ -220,13 +220,13 @@ static func disconnect_subscription(subscription, lua_api) -> void: match subscription.connected_signal: "pressed": if target_node.has_signal("pressed"): - if subscription.has("wrapper_func") and subscription.wrapper_func: + if subscription.wrapper_func: target_node.pressed.disconnect(subscription.wrapper_func) else: target_node.pressed.disconnect(lua_api._on_event_triggered.bind(subscription)) "gui_input": if target_node.has_signal("gui_input"): - if subscription.has("wrapper_func") and subscription.wrapper_func: + if subscription.wrapper_func: target_node.gui_input.disconnect(subscription.wrapper_func) else: target_node.gui_input.disconnect(lua_api._on_gui_input_click.bind(subscription)) diff --git a/flumi/Scripts/Utils/Lua/JSON.gd b/flumi/Scripts/Utils/Lua/JSON.gd index 017cf53..ba2dea3 100644 --- a/flumi/Scripts/Utils/Lua/JSON.gd +++ b/flumi/Scripts/Utils/Lua/JSON.gd @@ -30,8 +30,7 @@ static func _lua_json_parse_handler(vm: LuauVM) -> int: static func _lua_json_stringify_handler(vm: LuauVM) -> int: var value = vm.lua_tovariant(1) - var json = JSON.new() - var json_string = json.stringify(value) + var json_string = JSON.stringify(value) vm.lua_pushstring(json_string) return 1 diff --git a/flumi/Scripts/Utils/Lua/ThreadedVM.gd b/flumi/Scripts/Utils/Lua/ThreadedVM.gd index 76f7e88..81e9197 100644 --- a/flumi/Scripts/Utils/Lua/ThreadedVM.gd +++ b/flumi/Scripts/Utils/Lua/ThreadedVM.gd @@ -253,7 +253,6 @@ func _print_handler(vm: LuauVM) -> int: message_parts.append(arg_str) var final_message = "\t".join(message_parts) - var current_time = Time.get_ticks_msec() / 1000.0 call_deferred("_emit_print_output", final_message) diff --git a/flumi/Scripts/Utils/Lua/WebSocket.gd b/flumi/Scripts/Utils/Lua/WebSocket.gd index 4182d2f..fef2425 100644 --- a/flumi/Scripts/Utils/Lua/WebSocket.gd +++ b/flumi/Scripts/Utils/Lua/WebSocket.gd @@ -11,7 +11,7 @@ class WebSocketWrapper: var vm: LuauVM var url: String var websocket: WebSocketPeer - var is_connected: bool = false + var connection_status: bool = false var event_handlers: Dictionary = {} var timer: Timer var last_state: int = -1 @@ -20,7 +20,7 @@ class WebSocketWrapper: websocket = WebSocketPeer.new() func connect_to_url(): - if is_connected: + if connection_status: return var error = websocket.connect_to_url(url) @@ -54,8 +54,8 @@ class WebSocketWrapper: match state: WebSocketPeer.STATE_OPEN: - if not is_connected: - is_connected = true + if not connection_status: + connection_status = true trigger_event("open", {}) # Check for messages @@ -65,8 +65,8 @@ class WebSocketWrapper: trigger_event("message", {"data": message}) WebSocketPeer.STATE_CLOSED: - if is_connected: - is_connected = false + if connection_status: + connection_status = false trigger_event("close", {}) # Clean up timer @@ -80,26 +80,26 @@ class WebSocketWrapper: WebSocketPeer.STATE_CLOSING: # Connection is closing - if is_connected: - is_connected = false + if connection_status: + connection_status = false _: # Unknown state or connection failed - if is_connected: - is_connected = false + if connection_status: + connection_status = false trigger_event("close", {}) - elif not is_connected: + elif not connection_status: # This might be a connection failure trigger_event("error", {"message": "Connection failed or was rejected by server"}) func send_message(message: String): - if is_connected and websocket: + if connection_status and websocket: websocket.send_text(message) func close_connection(): if websocket: websocket.close() - is_connected = false + connection_status = false if timer: timer.queue_free() @@ -222,7 +222,7 @@ static func _websocket_send(vm: LuauVM) -> int: # Get wrapper instance var wrapper: WebSocketWrapper = websocket_instances.get(instance_id) - if wrapper and wrapper.is_connected: + if wrapper and wrapper.connection_status: wrapper.send_message(message) else: vm.luaL_error("WebSocket is not connected") diff --git a/search-engine/frontend/search.lua b/search-engine/frontend/search.lua index 76e3231..9bb174d 100644 --- a/search-engine/frontend/search.lua +++ b/search-engine/frontend/search.lua @@ -7,13 +7,22 @@ local stats = gurt.select('#stats') local function showLoading() loading.classList:remove('hidden') - results.text = '' + + local children = results.children + for i = #children, 1, -1 do + children[i]:remove() + end + stats.text = '' end local function displayResults(data) loading.classList:add('hidden') - results.text = '' + + local children = results.children + for i = #children, 1, -1 do + children[i]:remove() + end if not data.results or #data.results == 0 then local noResultsItem = gurt.create('div', { @@ -94,7 +103,13 @@ local function performSearch(query) displayResults(data) else loading.classList:add('hidden') - results.text = '' + + -- Clear all existing children from results + local children = results.children + for i = #children, 1, -1 do + children[i]:remove() + end + stats.text = 'Search failed: ' .. response.status .. ' ' .. response.statusText end end @@ -117,12 +132,22 @@ local function performLuckySearch() gurt.location.goto(randomResult.url) else loading.classList:add('hidden') - results.text = '' + + local children = results.children + for i = #children, 1, -1 do + children[i]:remove() + end + stats.text = 'No sites available for lucky search' end else loading.classList:add('hidden') - results.text = '' + + local children = results.children + for i = #children, 1, -1 do + children[i]:remove() + end + stats.text = 'Lucky search failed' end end diff --git a/search-engine/src/crawler.rs b/search-engine/src/crawler.rs index 46fd86d..b2e47dc 100644 --- a/search-engine/src/crawler.rs +++ b/search-engine/src/crawler.rs @@ -185,7 +185,7 @@ impl DomainCrawler { } else if let Some(path_value) = line.to_lowercase().strip_prefix("allow:") { let path = path_value.trim(); if !path.is_empty() { - let full_url = format!("{}{}", base_url, path); + let full_url = Self::normalize_url(format!("{}{}", base_url, path)); debug!("Added allowed URL from clanker.txt: {}", full_url); allowed_urls.push(full_url); } @@ -222,19 +222,21 @@ impl DomainCrawler { } // Start with the root URL + let normalized_base_url = Self::normalize_url(base_url.clone()); queue.push_back(CrawlItem { - url: base_url.clone(), + url: normalized_base_url, depth: 0, }); // Add all URLs from clanker.txt to the queue for url in clanker_urls { - if !visited_urls.contains(&url) { + let normalized_url = Self::normalize_url(url); + if !visited_urls.contains(&normalized_url) { queue.push_back(CrawlItem { - url: url.clone(), + url: normalized_url.clone(), depth: 0, // Treat clanker.txt URLs as root level }); - debug!("Added clanker.txt URL to queue: {}", url); + debug!("Added clanker.txt URL to queue: {}", normalized_url); } } @@ -268,10 +270,11 @@ impl DomainCrawler { if let Ok(links) = self.extract_links(&page_with_html.original_html, &base_url).await { debug!("Found {} links on {}", links.len(), item.url); for link in links { - if self.should_crawl_url(&link, domain) { - debug!("Adding link to crawl queue: {}", link); + let normalized_link = Self::normalize_url(link); + if self.should_crawl_url(&normalized_link, domain) && !visited_urls.contains(&normalized_link) { + debug!("Adding link to crawl queue: {}", normalized_link); queue.push_back(CrawlItem { - url: link, + url: normalized_link, depth: item.depth + 1, }); } @@ -358,7 +361,7 @@ impl DomainCrawler { let page = CrawledPageWithHtml { crawled_page: CrawledPage { - url: url.to_string(), + url: Self::normalize_url(url.to_string()), domain: domain.full_domain(), title, content: cleaned_content.clone(), @@ -398,7 +401,7 @@ impl DomainCrawler { // Resolve relative URLs match base.join(href) { Ok(absolute_url) => { - let url_str = absolute_url.to_string(); + let url_str = Self::normalize_url(absolute_url.to_string()); // Only include GURT protocol URLs for the same domain if url_str.starts_with("gurt://") { @@ -601,6 +604,19 @@ impl DomainCrawler { false } + fn normalize_url(url: String) -> String { + if url.ends_with("/index.html") { + let without_index = &url[..url.len() - 11]; // Remove "/index.html" (11 chars) + if without_index.ends_with('/') { + without_index.to_string() + } else { + format!("{}/", without_index) + } + } else { + url + } + } + fn calculate_content_hash(content: &str) -> String { use sha2::{Sha256, Digest}; let mut hasher = Sha256::new(); diff --git a/tests/signal.html b/tests/signal.html index ea71c2d..22c8e38 100644 --- a/tests/signal.html +++ b/tests/signal.html @@ -50,7 +50,7 @@ if #logMessages > 20 then table.remove(logMessages, 1) end - logArea.text = table.concat(logMessages, '\\n') + logArea.text = table.concat(logMessages, '\n') end -- Function to update status