add search engine - ringle
This commit is contained in:
6
.gitignore
vendored
6
.gitignore
vendored
@@ -1,4 +1,6 @@
|
||||
*target*
|
||||
target/
|
||||
*.pem
|
||||
gurty.toml
|
||||
certs
|
||||
certs
|
||||
search_indexes
|
||||
config.toml
|
||||
@@ -74,7 +74,7 @@ end
|
||||
|
||||
local function loadDomains()
|
||||
print('Loading domains...')
|
||||
local response = fetch('gurt://localhost:8877/auth/domains?page=1&size=100', {
|
||||
local response = fetch('gurt://localhost:8877/auth/domains?page=1&limit=100', {
|
||||
headers = {
|
||||
Authorization = 'Bearer ' .. authToken
|
||||
}
|
||||
|
||||
@@ -325,7 +325,7 @@ gurt.select('#add-record-btn'):on('click', function()
|
||||
local recordType = gurt.select('#record-type').value
|
||||
local recordName = gurt.select('#record-name').value
|
||||
local recordValue = gurt.select('#record-value').value
|
||||
local recordTTL = tonumber(gurt.select('#record-ttl').value) or ''
|
||||
local recordTTL = tonumber(gurt.select('#record-ttl').value) or 'none'
|
||||
|
||||
if not recordValue or recordValue == '' then
|
||||
showError('record-error', 'Record value is required')
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
}
|
||||
|
||||
input {
|
||||
w-full p-3 border border-gray-600 rounded-md bg-[#374151] text-white mb-4 placeholder:text-[#999999] outline-none active:border-red-500
|
||||
text-xs w-full p-3 border border-gray-600 rounded-md bg-[#374151] text-white mb-4 placeholder:text-[#999999] outline-none active:border-red-500
|
||||
}
|
||||
|
||||
button {
|
||||
@@ -51,4 +51,4 @@
|
||||
|
||||
<p id="log-output" style="min-h-24"></p>
|
||||
</div>
|
||||
</body>
|
||||
</body>
|
||||
|
||||
@@ -39,9 +39,9 @@ local function renderTLDSelector()
|
||||
local total = #tlds
|
||||
local intervalId
|
||||
|
||||
intervalId = gurt.setInterval(function()
|
||||
intervalId = setInterval(function()
|
||||
if i > total then
|
||||
gurt.clearInterval(intervalId)
|
||||
clearInterval(intervalId)
|
||||
return
|
||||
end
|
||||
|
||||
|
||||
28
dns/migrations/010_add_search_crawl_status.sql
Normal file
28
dns/migrations/010_add_search_crawl_status.sql
Normal file
@@ -0,0 +1,28 @@
|
||||
-- Search engine domain crawl status tracking
|
||||
CREATE TABLE IF NOT EXISTS domain_crawl_status (
|
||||
domain_id INTEGER PRIMARY KEY REFERENCES domains(id) ON DELETE CASCADE,
|
||||
last_crawled_at TIMESTAMPTZ,
|
||||
next_crawl_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
|
||||
crawl_status VARCHAR(20) DEFAULT 'pending' CHECK (crawl_status IN ('pending', 'crawling', 'completed', 'failed', 'disabled')),
|
||||
error_message TEXT,
|
||||
pages_found INTEGER DEFAULT 0,
|
||||
updated_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_domain_crawl_status_next_crawl ON domain_crawl_status(next_crawl_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_domain_crawl_status_status ON domain_crawl_status(crawl_status);
|
||||
|
||||
-- Function to update the updated_at column
|
||||
CREATE OR REPLACE FUNCTION update_updated_at_column()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
NEW.updated_at = CURRENT_TIMESTAMP;
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Trigger for updated_at
|
||||
DROP TRIGGER IF EXISTS update_domain_crawl_status_updated_at ON domain_crawl_status;
|
||||
CREATE TRIGGER update_domain_crawl_status_updated_at
|
||||
BEFORE UPDATE ON domain_crawl_status
|
||||
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
|
||||
@@ -141,7 +141,7 @@ const config: Config = {
|
||||
prism: {
|
||||
theme: prismThemes.github,
|
||||
darkTheme: prismThemes.dracula,
|
||||
additionalLanguages: ['lua', 'bash', 'http'],
|
||||
additionalLanguages: ['lua', 'bash', 'http', "toml"],
|
||||
},
|
||||
} satisfies Preset.ThemeConfig,
|
||||
};
|
||||
|
||||
@@ -31,6 +31,7 @@ const sidebars: SidebarsConfig = {
|
||||
items: [
|
||||
'dns-system',
|
||||
'flumi-browser',
|
||||
'search-engine'
|
||||
],
|
||||
},
|
||||
{
|
||||
|
||||
@@ -5,6 +5,6 @@
|
||||
[node name="TextureRect" type="TextureRect"]
|
||||
size_flags_horizontal = 0
|
||||
size_flags_vertical = 0
|
||||
expand_mode = 1
|
||||
expand_mode = 2
|
||||
stretch_mode = 2
|
||||
script = ExtResource("1_clm6l")
|
||||
|
||||
@@ -12,8 +12,11 @@ func _resort() -> void:
|
||||
if has_meta("should_fill_horizontal"):
|
||||
size_flags_horizontal = Control.SIZE_FILL
|
||||
else:
|
||||
if not has_meta("size_flags_set_by_style_manager"):
|
||||
size_flags_horizontal = Control.SIZE_SHRINK_BEGIN
|
||||
if size_flags_horizontal == Control.SIZE_EXPAND_FILL and has_meta("size_flags_set_by_style_manager"):
|
||||
pass
|
||||
else:
|
||||
if not has_meta("size_flags_set_by_style_manager"):
|
||||
size_flags_horizontal = Control.SIZE_SHRINK_BEGIN
|
||||
|
||||
# Check if we should fill vertically (for h-full)
|
||||
if has_meta("should_fill_vertical"):
|
||||
|
||||
@@ -6,8 +6,9 @@ class CSSRule:
|
||||
var event_prefix: String = ""
|
||||
var properties: Dictionary = {}
|
||||
var specificity: int = 0
|
||||
var selector_type: String = "simple" # simple, descendant, child, adjacent_sibling, general_sibling, attribute
|
||||
var selector_parts: Array = [] # For complex selectors
|
||||
var selector_type: String = "simple" # simple, descendant, child, adjacent_sibling, general_sibling, attribute
|
||||
var selector_parts: Array = [] # For complex selectors
|
||||
var is_user_css: bool = false
|
||||
|
||||
func init(sel: String = ""):
|
||||
selector = sel
|
||||
@@ -52,9 +53,9 @@ class CSSRule:
|
||||
func calculate_specificity():
|
||||
specificity = 1
|
||||
if selector.begins_with("."):
|
||||
specificity += 10
|
||||
specificity += 20
|
||||
if selector.contains("["):
|
||||
specificity += 10 # Attribute selectors
|
||||
specificity += 10
|
||||
match selector_type:
|
||||
"child":
|
||||
specificity += 8
|
||||
@@ -68,6 +69,10 @@ class CSSRule:
|
||||
specificity += 4
|
||||
if event_prefix.length() > 0:
|
||||
specificity += 10
|
||||
|
||||
if is_user_css:
|
||||
specificity += 100
|
||||
|
||||
|
||||
class CSSStylesheet:
|
||||
var rules: Array[CSSRule] = []
|
||||
@@ -287,7 +292,7 @@ func init(css_content: String = ""):
|
||||
stylesheet = CSSStylesheet.new()
|
||||
css_text = css_content
|
||||
|
||||
func parse() -> void:
|
||||
func parse(is_user_css: bool = false) -> void:
|
||||
if css_text.is_empty():
|
||||
return
|
||||
|
||||
@@ -295,7 +300,7 @@ func parse() -> void:
|
||||
var rules = extract_rules(cleaned_css)
|
||||
|
||||
for rule_data in rules:
|
||||
var rule = parse_rule(rule_data)
|
||||
var rule = parse_rule(rule_data, is_user_css)
|
||||
if rule:
|
||||
stylesheet.add_rule(rule)
|
||||
|
||||
@@ -355,9 +360,10 @@ func find_matching_brace(css: String, start_pos: int) -> int:
|
||||
|
||||
return -1
|
||||
|
||||
func parse_rule(rule_data: Dictionary) -> CSSRule:
|
||||
func parse_rule(rule_data: Dictionary, is_user_css: bool = false) -> CSSRule:
|
||||
var rule = CSSRule.new()
|
||||
rule.selector = rule_data.selector
|
||||
rule.is_user_css = is_user_css
|
||||
rule.init(rule.selector)
|
||||
var properties_text = rule_data.properties
|
||||
|
||||
@@ -397,6 +403,7 @@ func parse_utility_class(rule: CSSRule, utility_name: String) -> void:
|
||||
pseudo_rule.event_prefix = pseudo
|
||||
pseudo_rule.selector_type = rule.selector_type
|
||||
pseudo_rule.selector_parts = rule.selector_parts.duplicate()
|
||||
pseudo_rule.is_user_css = rule.is_user_css
|
||||
pseudo_rule.calculate_specificity()
|
||||
pseudo_rule.specificity += 100
|
||||
|
||||
|
||||
@@ -25,20 +25,20 @@ class HTMLElement:
|
||||
return get_attribute("id")
|
||||
|
||||
func get_collapsed_text() -> String:
|
||||
var collapsed = text_content.strip_edges()
|
||||
var collapsed = HTMLParser.unescape_html_entities(text_content).strip_edges()
|
||||
# Replace multiple whitespace characters with single space
|
||||
var regex = RegEx.new()
|
||||
regex.compile("\\s+")
|
||||
return regex.sub(collapsed, " ", true)
|
||||
|
||||
func get_preserved_text() -> String:
|
||||
return text_content
|
||||
return HTMLParser.unescape_html_entities(text_content)
|
||||
|
||||
func get_bbcode_formatted_text(parser: HTMLParser) -> String:
|
||||
var styles = {}
|
||||
if parser != null:
|
||||
styles = parser.get_element_styles_with_inheritance(self, "", [])
|
||||
return HTMLParser.get_bbcode_with_styles(self, styles, parser)
|
||||
return HTMLParser.get_bbcode_with_styles(self, styles, parser, [])
|
||||
|
||||
func is_inline_element() -> bool:
|
||||
return tag_name in ["b", "i", "u", "small", "mark", "code", "span", "a", "input"]
|
||||
@@ -68,10 +68,69 @@ var bitcode: PackedByteArray
|
||||
var parse_result: ParseResult
|
||||
|
||||
func _init(data: PackedByteArray):
|
||||
bitcode = data
|
||||
var html_string = data.get_string_from_utf8()
|
||||
html_string = preprocess_html_entities(html_string)
|
||||
bitcode = html_string.to_utf8_buffer()
|
||||
xml_parser = XMLParser.new()
|
||||
parse_result = ParseResult.new()
|
||||
|
||||
static func unescape_html_entities(text: String) -> String:
|
||||
return text.replace("<", "<").replace(">", ">").replace(""", "\"").replace("'", "'").replace("&", "&")
|
||||
|
||||
static func preprocess_html_entities(html: String) -> String:
|
||||
var result = ""
|
||||
var i = 0
|
||||
var in_tag = false
|
||||
|
||||
while i < html.length():
|
||||
var char = html[i]
|
||||
|
||||
if char == "<":
|
||||
# Check if this starts a valid HTML tag
|
||||
var tag_end = html.find(">", i)
|
||||
if tag_end != -1:
|
||||
var potential_tag = html.substr(i, tag_end - i + 1)
|
||||
# Simple check for valid tag pattern
|
||||
if is_valid_tag_pattern(potential_tag):
|
||||
result += potential_tag
|
||||
i = tag_end + 1
|
||||
continue
|
||||
# If not a valid tag, escape it
|
||||
result += "<"
|
||||
elif char == ">":
|
||||
# Escape standalone > that's not part of a tag
|
||||
result += ">"
|
||||
else:
|
||||
result += char
|
||||
|
||||
i += 1
|
||||
|
||||
return result
|
||||
|
||||
static func is_valid_tag_pattern(tag: String) -> bool:
|
||||
if tag.length() < 3: # Minimum: <x>
|
||||
return false
|
||||
|
||||
if not tag.begins_with("<") or not tag.ends_with(">"):
|
||||
return false
|
||||
|
||||
var inner = tag.substr(1, tag.length() - 2).strip_edges()
|
||||
|
||||
if inner.begins_with("/"):
|
||||
inner = inner.substr(1).strip_edges()
|
||||
|
||||
# Handle self-closing tags
|
||||
if inner.ends_with("/"):
|
||||
inner = inner.substr(0, inner.length() - 1).strip_edges()
|
||||
|
||||
# Extract tag name (first part before space or attributes)
|
||||
var tag_name = inner.split(" ")[0].split("\t")[0]
|
||||
|
||||
# Valid tag names contain only letters, numbers, and hyphens
|
||||
var regex = RegEx.new()
|
||||
regex.compile("^[a-zA-Z][a-zA-Z0-9-]*$")
|
||||
return regex.search(tag_name) != null
|
||||
|
||||
# Main parsing function
|
||||
func parse() -> ParseResult:
|
||||
xml_parser.open_buffer(bitcode)
|
||||
@@ -408,7 +467,7 @@ func apply_element_styles(node: Control, element: HTMLElement, parser: HTMLParse
|
||||
var styles = parser.get_element_styles_with_inheritance(element, "", [])
|
||||
if node.get("rich_text_label"):
|
||||
var label = node.rich_text_label
|
||||
var text = HTMLParser.get_bbcode_with_styles(element, styles, parser)
|
||||
var text = HTMLParser.get_bbcode_with_styles(element, styles, parser, [])
|
||||
label.text = text
|
||||
|
||||
static func apply_element_bbcode_formatting(element: HTMLElement, styles: Dictionary, content: String, parser: HTMLParser = null) -> String:
|
||||
@@ -478,7 +537,13 @@ static func apply_element_bbcode_formatting(element: HTMLElement, styles: Dictio
|
||||
|
||||
return formatted_content
|
||||
|
||||
static func get_bbcode_with_styles(element: HTMLElement, styles: Dictionary, parser: HTMLParser) -> String:
|
||||
static func get_bbcode_with_styles(element: HTMLElement, styles: Dictionary, parser: HTMLParser, visited_elements: Array = []) -> String:
|
||||
if element in visited_elements:
|
||||
return ""
|
||||
|
||||
var new_visited = visited_elements.duplicate()
|
||||
new_visited.append(element)
|
||||
|
||||
var text = ""
|
||||
if element.text_content.length() > 0:
|
||||
text += element.get_collapsed_text()
|
||||
@@ -486,8 +551,8 @@ static func get_bbcode_with_styles(element: HTMLElement, styles: Dictionary, par
|
||||
for child in element.children:
|
||||
var child_styles = styles
|
||||
if parser != null:
|
||||
child_styles = parser.get_element_styles_with_inheritance(child, "", [])
|
||||
var child_content = HTMLParser.get_bbcode_with_styles(child, child_styles, parser)
|
||||
child_styles = parser.get_element_styles_with_inheritance(child, "", new_visited)
|
||||
var child_content = HTMLParser.get_bbcode_with_styles(child, child_styles, parser, new_visited)
|
||||
child_content = apply_element_bbcode_formatting(child, child_styles, child_content, parser)
|
||||
text += child_content
|
||||
|
||||
|
||||
@@ -642,8 +642,6 @@ func execute_lua_script(code: String, vm: LuauVM):
|
||||
script_start_time = Time.get_ticks_msec() / 1000.0
|
||||
threaded_vm.execute_script_async(code)
|
||||
|
||||
|
||||
|
||||
func _on_threaded_script_completed(result: Dictionary):
|
||||
var execution_time = (Time.get_ticks_msec() / 1000.0) - script_start_time
|
||||
|
||||
@@ -689,6 +687,10 @@ func _handle_dom_operation(operation: Dictionary):
|
||||
LuaDOMUtils.handle_insert_after(operation, dom_parser, self)
|
||||
"replace_child":
|
||||
LuaDOMUtils.handle_replace_child(operation, dom_parser, self)
|
||||
"focus_element":
|
||||
_handle_element_focus(operation)
|
||||
"unfocus_element":
|
||||
_handle_element_unfocus(operation)
|
||||
_:
|
||||
pass # Unknown operation type, ignore
|
||||
|
||||
@@ -786,6 +788,53 @@ func _handle_text_getting(operation: Dictionary):
|
||||
return element.text_content
|
||||
return ""
|
||||
|
||||
func _handle_element_focus(operation: Dictionary):
|
||||
var element_id: String = operation.element_id
|
||||
|
||||
var dom_node = dom_parser.parse_result.dom_nodes.get(element_id, null)
|
||||
if not dom_node:
|
||||
return
|
||||
|
||||
var focusable_control = _find_focusable_control(dom_node)
|
||||
if focusable_control and focusable_control.has_method("grab_focus"):
|
||||
focusable_control.call_deferred("grab_focus")
|
||||
|
||||
func _handle_element_unfocus(operation: Dictionary):
|
||||
var element_id: String = operation.element_id
|
||||
|
||||
var dom_node = dom_parser.parse_result.dom_nodes.get(element_id, null)
|
||||
if not dom_node:
|
||||
return
|
||||
|
||||
var focusable_control = _find_focusable_control(dom_node)
|
||||
if focusable_control and focusable_control.has_method("release_focus"):
|
||||
focusable_control.call_deferred("release_focus")
|
||||
|
||||
func _find_focusable_control(node: Node) -> Control:
|
||||
if not node:
|
||||
return null
|
||||
|
||||
if node is Control and node.focus_mode != Control.FOCUS_NONE and node.has_method("grab_focus"):
|
||||
return node
|
||||
|
||||
if node.has_method("get_children"):
|
||||
for child in node.get_children():
|
||||
if child.visible and child is Control:
|
||||
if child is LineEdit or child is TextEdit or child is SpinBox or child is OptionButton:
|
||||
if child.focus_mode != Control.FOCUS_NONE:
|
||||
return child
|
||||
|
||||
if child is SpinBox:
|
||||
var line_edit = child.get_line_edit()
|
||||
if line_edit and line_edit.focus_mode != Control.FOCUS_NONE:
|
||||
return line_edit
|
||||
|
||||
var focusable_child = _find_focusable_control(child)
|
||||
if focusable_child:
|
||||
return focusable_child
|
||||
|
||||
return null
|
||||
|
||||
func _handle_body_event_registration(operation: Dictionary):
|
||||
var event_name: String = operation.event_name
|
||||
var callback_ref: int = operation.callback_ref
|
||||
|
||||
@@ -21,6 +21,7 @@ code { text-xl font-mono }
|
||||
a { text-[#1a0dab] }
|
||||
pre { text-xl font-mono }
|
||||
|
||||
img { object-fill }
|
||||
button { text-[16px] bg-[#1b1b1b] rounded-md text-white hover:bg-[#2a2a2a] active:bg-[#101010] px-3 py-1.5 }
|
||||
button[disabled] { bg-[#666666] text-[#999999] cursor-not-allowed }
|
||||
|
||||
@@ -29,7 +30,7 @@ select:active { text-[#000000] border-[3px] border-[#000000] }
|
||||
|
||||
input[type="color"] { w-32 }
|
||||
input[type="range"] { w-32 }
|
||||
input[type="text"] { w-64 }
|
||||
input[type="text"] { text-[16px] w-64 }
|
||||
input[type="number"] { w-32 text-[16px] bg-transparent border border-[#000000] rounded-[3px] text-[#000000] hover:border-[3px] hover:border-[#000000] px-3 py-1.5 }
|
||||
input[type="date"] { w-28 text-[16px] bg-[#1b1b1b] rounded-md text-white hover:bg-[#2a2a2a] active:bg-[#101010] px-3 py-1.5 }
|
||||
"""
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
extends RefCounted
|
||||
class_name GurtProtocol
|
||||
|
||||
const DNS_API_URL = "gurt://localhost:8877"
|
||||
|
||||
# DNS resolution cache: domain.tld -> IP address
|
||||
static var _dns_cache: Dictionary = {}
|
||||
const DNS_SERVER_IP: String = "135.125.163.131"
|
||||
const DNS_SERVER_PORT: int = 8877
|
||||
|
||||
static func is_gurt_domain(url: String) -> bool:
|
||||
if url.begins_with("gurt://"):
|
||||
@@ -16,65 +14,13 @@ static func is_gurt_domain(url: String) -> bool:
|
||||
|
||||
return false
|
||||
|
||||
static func parse_gurt_domain(url: String) -> Dictionary:
|
||||
var domain_part = url
|
||||
var path = "/"
|
||||
static func is_direct_address(domain: String) -> bool:
|
||||
# Check if it's already an IP address or localhost
|
||||
if domain.contains(":"):
|
||||
var parts = domain.split(":")
|
||||
domain = parts[0]
|
||||
|
||||
if url.begins_with("gurt://"):
|
||||
domain_part = url.substr(7)
|
||||
|
||||
# Extract path from domain_part (e.g., "test.dawg/script.lua" -> "test.dawg" + "/script.lua")
|
||||
var path_start = domain_part.find("/")
|
||||
if path_start != -1:
|
||||
path = domain_part.substr(path_start)
|
||||
domain_part = domain_part.substr(0, path_start)
|
||||
|
||||
# Check if domain is cached (resolved before)
|
||||
var domain_key = domain_part
|
||||
if _dns_cache.has(domain_key):
|
||||
return {
|
||||
"direct_address": _dns_cache[domain_key],
|
||||
"display_url": domain_part + path,
|
||||
"is_direct": true,
|
||||
"path": path,
|
||||
"full_domain": domain_part
|
||||
}
|
||||
|
||||
if domain_part.contains(":") or domain_part.begins_with("127.0.0.1") or domain_part.begins_with("localhost") or is_ip_address(domain_part):
|
||||
return {
|
||||
"direct_address": domain_part,
|
||||
"display_url": domain_part + path,
|
||||
"is_direct": true,
|
||||
"path": path,
|
||||
"full_domain": domain_part
|
||||
}
|
||||
|
||||
var parts = domain_part.split(".")
|
||||
if parts.size() < 2:
|
||||
return {}
|
||||
|
||||
# Support subdomains (e.g., api.blog.example.com)
|
||||
if parts.size() == 2:
|
||||
return {
|
||||
"name": parts[0],
|
||||
"tld": parts[1],
|
||||
"display_url": domain_part + path,
|
||||
"is_direct": false,
|
||||
"path": path,
|
||||
"full_domain": domain_part,
|
||||
"is_subdomain": false
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"name": parts[parts.size() - 2], # The domain name part
|
||||
"tld": parts[parts.size() - 1], # The TLD part
|
||||
"display_url": domain_part + path,
|
||||
"is_direct": false,
|
||||
"path": path,
|
||||
"full_domain": domain_part,
|
||||
"is_subdomain": true,
|
||||
"subdomain_parts": parts.slice(0, parts.size() - 2)
|
||||
}
|
||||
return domain == "localhost" or domain == "127.0.0.1" or is_ip_address(domain)
|
||||
|
||||
static func is_ip_address(address: String) -> bool:
|
||||
var parts = address.split(".")
|
||||
@@ -90,329 +36,25 @@ static func is_ip_address(address: String) -> bool:
|
||||
|
||||
return true
|
||||
|
||||
static func fetch_domain_info(name: String, tld: String) -> Dictionary:
|
||||
var request_data = JSON.stringify({"name": name, "tld": tld})
|
||||
var result = await fetch_dns_post_working("localhost:8877", "/resolve", request_data)
|
||||
static func resolve_gurt_domain(domain: String) -> String:
|
||||
if is_direct_address(domain):
|
||||
if domain == "localhost":
|
||||
return "127.0.0.1"
|
||||
return domain
|
||||
|
||||
if result.has("error"):
|
||||
return {"error": result.error}
|
||||
|
||||
if not result.has("content"):
|
||||
return {"error": "No content in DNS response"}
|
||||
|
||||
var content_str = result.content.get_string_from_utf8()
|
||||
var json = JSON.new()
|
||||
var parse_result = json.parse(content_str)
|
||||
|
||||
if parse_result != OK:
|
||||
return {"error": "Invalid JSON in DNS response"}
|
||||
|
||||
return json.data
|
||||
|
||||
static func fetch_full_domain_info(full_domain: String, record_type: String = "") -> Dictionary:
|
||||
var request_data = {"domain": full_domain}
|
||||
if not record_type.is_empty():
|
||||
request_data["record_type"] = record_type
|
||||
|
||||
var json_data = JSON.stringify(request_data)
|
||||
var result = await fetch_dns_post_working("localhost:8877", "/resolve-full", json_data)
|
||||
|
||||
if result.has("error"):
|
||||
return {"error": result.error}
|
||||
|
||||
if not result.has("content"):
|
||||
return {"error": "No content in DNS response"}
|
||||
|
||||
var content_str = result.content.get_string_from_utf8()
|
||||
var json = JSON.new()
|
||||
var parse_result = json.parse(content_str)
|
||||
|
||||
if parse_result != OK:
|
||||
return {"error": "Invalid JSON in DNS response"}
|
||||
|
||||
return json.data
|
||||
|
||||
static func fetch_dns_post_working(server: String, path: String, json_data: String) -> Dictionary:
|
||||
var shared_result = {"finished": false}
|
||||
var thread = Thread.new()
|
||||
var mutex = Mutex.new()
|
||||
|
||||
var thread_func = func():
|
||||
var local_result = {}
|
||||
var client = GurtProtocolClient.new()
|
||||
|
||||
for ca_cert in CertificateManager.trusted_ca_certificates:
|
||||
client.add_ca_certificate(ca_cert)
|
||||
|
||||
if not client.create_client(10):
|
||||
local_result = {"error": "Failed to create client"}
|
||||
else:
|
||||
var url = "gurt://" + server + path
|
||||
|
||||
# Prepare request options
|
||||
var options = {
|
||||
"method": "POST",
|
||||
"headers": {"Content-Type": "application/json"},
|
||||
"body": json_data
|
||||
}
|
||||
|
||||
var response = client.request(url, options)
|
||||
|
||||
client.disconnect()
|
||||
|
||||
if not response:
|
||||
local_result = {"error": "No response from server"}
|
||||
elif not response.is_success:
|
||||
local_result = {"error": "Server error: " + str(response.status_code) + " " + str(response.status_message)}
|
||||
else:
|
||||
local_result = {"content": response.body}
|
||||
|
||||
mutex.lock()
|
||||
shared_result.clear()
|
||||
for key in local_result:
|
||||
shared_result[key] = local_result[key]
|
||||
shared_result["finished"] = true
|
||||
mutex.unlock()
|
||||
|
||||
thread.start(thread_func)
|
||||
|
||||
# Non-blocking wait
|
||||
while not shared_result.get("finished", false):
|
||||
await Engine.get_main_loop().process_frame
|
||||
|
||||
thread.wait_to_finish()
|
||||
|
||||
mutex.lock()
|
||||
var final_result = {}
|
||||
for key in shared_result:
|
||||
if key != "finished":
|
||||
final_result[key] = shared_result[key]
|
||||
mutex.unlock()
|
||||
|
||||
return final_result
|
||||
|
||||
static func fetch_content_via_gurt(ip: String, path: String = "/") -> Dictionary:
|
||||
var client = GurtProtocolClient.new()
|
||||
|
||||
for ca_cert in CertificateManager.trusted_ca_certificates:
|
||||
client.add_ca_certificate(ca_cert)
|
||||
|
||||
if not client.create_client(30):
|
||||
return {"error": "Failed to create GURT client"}
|
||||
|
||||
var gurt_url = "gurt://" + ip + ":4878" + path
|
||||
|
||||
var response = client.request(gurt_url, {"method": "GET"})
|
||||
|
||||
client.disconnect()
|
||||
|
||||
if not response:
|
||||
return {"error": "No response from GURT server"}
|
||||
|
||||
if not response.is_success:
|
||||
var error_msg = "Server returned status " + str(response.status_code) + ": " + response.status_message
|
||||
return {"error": error_msg}
|
||||
|
||||
var content = response.body
|
||||
return {"content": content, "headers": response.headers}
|
||||
|
||||
static func fetch_content_via_gurt_direct(address: String, path: String = "/") -> Dictionary:
|
||||
var shared_result = {"finished": false}
|
||||
var thread = Thread.new()
|
||||
var mutex = Mutex.new()
|
||||
|
||||
var thread_func = func():
|
||||
var local_result = {}
|
||||
var client = GurtProtocolClient.new()
|
||||
|
||||
for ca_cert in CertificateManager.trusted_ca_certificates:
|
||||
client.add_ca_certificate(ca_cert)
|
||||
|
||||
if not client.create_client(10):
|
||||
local_result = {"error": "Failed to create GURT client"}
|
||||
else:
|
||||
var gurt_url: String
|
||||
if address.contains(":"):
|
||||
gurt_url = "gurt://" + address + path
|
||||
else:
|
||||
gurt_url = "gurt://" + address + ":4878" + path
|
||||
|
||||
var response = client.request(gurt_url, {"method": "GET"})
|
||||
|
||||
client.disconnect()
|
||||
|
||||
if not response:
|
||||
local_result = {"error": "No response from GURT server"}
|
||||
else:
|
||||
var content = response.body
|
||||
|
||||
if not response.is_success:
|
||||
var error_msg = "Server returned status " + str(response.status_code) + ": " + response.status_message
|
||||
local_result = {"error": error_msg, "content": content, "headers": response.headers}
|
||||
else:
|
||||
local_result = {"content": content, "headers": response.headers}
|
||||
|
||||
mutex.lock()
|
||||
shared_result.clear()
|
||||
for key in local_result:
|
||||
shared_result[key] = local_result[key]
|
||||
shared_result["finished"] = true
|
||||
mutex.unlock()
|
||||
|
||||
thread.start(thread_func)
|
||||
|
||||
# Non-blocking wait using signals instead of polling
|
||||
while not shared_result.get("finished", false):
|
||||
await Engine.get_main_loop().process_frame
|
||||
# Yield control back to the main thread without blocking delays
|
||||
|
||||
thread.wait_to_finish()
|
||||
|
||||
mutex.lock()
|
||||
var final_result = {}
|
||||
for key in shared_result:
|
||||
if key != "finished":
|
||||
final_result[key] = shared_result[key]
|
||||
mutex.unlock()
|
||||
|
||||
return final_result
|
||||
|
||||
static func handle_gurt_domain(url: String) -> Dictionary:
|
||||
var parsed = parse_gurt_domain(url)
|
||||
if parsed.is_empty():
|
||||
return {"error": "Invalid domain format. Use: domain.tld or IP:port", "html": create_error_page("Invalid domain format. Use: domain.tld or IP:port")}
|
||||
|
||||
var target_address: String
|
||||
var path = parsed.get("path", "/")
|
||||
|
||||
if parsed.get("is_direct", false):
|
||||
target_address = parsed.direct_address
|
||||
else:
|
||||
var domain_info: Dictionary
|
||||
|
||||
# Use the new full domain resolution for subdomains
|
||||
if parsed.get("is_subdomain", false):
|
||||
domain_info = await fetch_full_domain_info(parsed.full_domain)
|
||||
else:
|
||||
domain_info = await fetch_domain_info(parsed.name, parsed.tld)
|
||||
|
||||
if domain_info.has("error"):
|
||||
return {"error": domain_info.error, "html": create_error_page(domain_info.error)}
|
||||
|
||||
# Process DNS records to find target address
|
||||
var target_result = await resolve_target_address(domain_info, parsed.full_domain)
|
||||
if target_result.has("error"):
|
||||
return {"error": target_result.error, "html": create_error_page(target_result.error)}
|
||||
|
||||
target_address = target_result.address
|
||||
|
||||
# Cache the resolved address
|
||||
var domain_key = parsed.full_domain
|
||||
_dns_cache[domain_key] = target_address
|
||||
|
||||
var content_result = await fetch_content_via_gurt_direct(target_address, path)
|
||||
if content_result.has("error"):
|
||||
var error_msg = "Failed to fetch content from " + target_address + path + " via GURT protocol - " + content_result.error
|
||||
if content_result.has("content") and not content_result.content.is_empty():
|
||||
return {"html": content_result.content, "display_url": parsed.display_url}
|
||||
return {"error": error_msg, "html": create_error_page(error_msg)}
|
||||
|
||||
if not content_result.has("content"):
|
||||
var error_msg = "No content received from " + target_address + path
|
||||
return {"error": error_msg, "html": create_error_page(error_msg)}
|
||||
|
||||
var html_content = content_result.content
|
||||
if html_content.is_empty():
|
||||
var error_msg = "Empty content received from " + target_address + path
|
||||
return {"error": error_msg, "html": create_error_page(error_msg)}
|
||||
|
||||
return {"html": html_content, "display_url": parsed.display_url}
|
||||
|
||||
static func resolve_target_address(domain_info: Dictionary, original_domain: String) -> Dictionary:
|
||||
if not domain_info.has("records") or domain_info.records == null:
|
||||
return {"error": "No DNS records found for domain"}
|
||||
|
||||
var records = domain_info.records
|
||||
var max_cname_depth = 5 # Prevent infinite CNAME loops
|
||||
var cname_depth = 0
|
||||
|
||||
# First pass: Look for direct A/AAAA records
|
||||
var a_records = []
|
||||
var aaaa_records = []
|
||||
var cname_records = []
|
||||
var ns_records = []
|
||||
|
||||
for record in records:
|
||||
if not record.has("type") or not record.has("value"):
|
||||
continue
|
||||
|
||||
match record.type:
|
||||
"A":
|
||||
a_records.append(record.value)
|
||||
"AAAA":
|
||||
aaaa_records.append(record.value)
|
||||
"CNAME":
|
||||
cname_records.append(record.value)
|
||||
"NS":
|
||||
ns_records.append(record.value)
|
||||
|
||||
# If we have direct A records, use the first one
|
||||
if not a_records.is_empty():
|
||||
return {"address": a_records[0]}
|
||||
|
||||
# If we have IPv6 AAAA records and no A records, we need to handle this
|
||||
if not aaaa_records.is_empty() and a_records.is_empty():
|
||||
return {"error": "Only IPv6 (AAAA) records found, but IPv4 required for GURT protocol"}
|
||||
|
||||
# Follow CNAME chain
|
||||
if not cname_records.is_empty():
|
||||
var current_cname = cname_records[0]
|
||||
|
||||
while cname_depth < max_cname_depth:
|
||||
cname_depth += 1
|
||||
|
||||
# Try to resolve the CNAME target
|
||||
var cname_info = await fetch_full_domain_info(current_cname, "A")
|
||||
if cname_info.has("error"):
|
||||
return {"error": "Failed to resolve CNAME target: " + current_cname + " - " + cname_info.error}
|
||||
|
||||
if not cname_info.has("records") or cname_info.records == null:
|
||||
return {"error": "No records found for CNAME target: " + current_cname}
|
||||
|
||||
# Look for A records in the CNAME target
|
||||
var found_next_cname = false
|
||||
for record in cname_info.records:
|
||||
if record.has("type") and record.type == "A" and record.has("value"):
|
||||
return {"address": record.value}
|
||||
elif record.has("type") and record.type == "CNAME" and record.has("value"):
|
||||
# Another CNAME, continue the chain
|
||||
current_cname = record.value
|
||||
found_next_cname = true
|
||||
break
|
||||
|
||||
if not found_next_cname:
|
||||
# No more CNAMEs found, but also no A record
|
||||
return {"error": "CNAME chain ended without A record for: " + current_cname}
|
||||
|
||||
return {"error": "CNAME chain too deep (max " + str(max_cname_depth) + " levels)"}
|
||||
|
||||
# If we have NS records, this indicates delegation
|
||||
if not ns_records.is_empty():
|
||||
return {"error": "Domain is delegated to nameservers: " + str(ns_records) + ". Cannot resolve directly."}
|
||||
|
||||
return {"error": "No A record found for domain"}
|
||||
return domain
|
||||
|
||||
static func get_error_type(error_message: String) -> Dictionary:
|
||||
if "DNS server is not responding" in error_message or "Domain not found" in error_message:
|
||||
return {"code": "ERR_NAME_NOT_RESOLVED", "title": "This site can't be reached", "icon": "🌐"}
|
||||
return {"code": "ERR_NAME_NOT_RESOLVED", "title": "This site can't be reached", "icon": "? :("}
|
||||
elif "timeout" in error_message.to_lower() or "timed out" in error_message.to_lower():
|
||||
return {"code": "ERR_CONNECTION_TIMED_OUT", "title": "This site can't be reached", "icon": "⏰"}
|
||||
return {"code": "ERR_CONNECTION_TIMED_OUT", "title": "This site can't be reached", "icon": "...?"}
|
||||
elif "Failed to fetch" in error_message or "No response" in error_message:
|
||||
return {"code": "ERR_CONNECTION_REFUSED", "title": "This site can't be reached", "icon": "🚫"}
|
||||
return {"code": "ERR_CONNECTION_REFUSED", "title": "This site can't be reached", "icon": ">:("}
|
||||
elif "Invalid domain format" in error_message:
|
||||
return {"code": "ERR_INVALID_URL", "title": "This page isn't working", "icon": "⚠️"}
|
||||
return {"code": "ERR_INVALID_URL", "title": "This page isn't working", "icon": ":|"}
|
||||
else:
|
||||
return {"code": "ERR_UNKNOWN", "title": "Something went wrong", "icon": "❌"}
|
||||
return {"code": "ERR_UNKNOWN", "title": "Something went wrong", "icon": ">_<"}
|
||||
|
||||
static func create_error_page(error_message: String) -> PackedByteArray:
|
||||
var error_info = get_error_type(error_message)
|
||||
|
||||
@@ -112,13 +112,40 @@ func fetch_gurt_resource(url: String) -> String:
|
||||
if not GurtProtocol.is_gurt_domain(url):
|
||||
return ""
|
||||
|
||||
var result = await GurtProtocol.handle_gurt_domain(url)
|
||||
var gurt_url = url
|
||||
if not gurt_url.begins_with("gurt://"):
|
||||
gurt_url = "gurt://" + gurt_url
|
||||
|
||||
if result.has("error"):
|
||||
print("GURT resource error: ", result.error)
|
||||
if gurt_url.contains("localhost"):
|
||||
gurt_url = gurt_url.replace("localhost", "127.0.0.1")
|
||||
|
||||
var client = GurtProtocolClient.new()
|
||||
|
||||
for ca_cert in CertificateManager.trusted_ca_certificates:
|
||||
client.add_ca_certificate(ca_cert)
|
||||
|
||||
if not client.create_client(30):
|
||||
print("GURT resource error: Failed to create client")
|
||||
return ""
|
||||
|
||||
if result.has("html"):
|
||||
return result.html.get_string_from_utf8()
|
||||
var host_domain = gurt_url
|
||||
if host_domain.begins_with("gurt://"):
|
||||
host_domain = host_domain.substr(7)
|
||||
var slash_pos = host_domain.find("/")
|
||||
if slash_pos != -1:
|
||||
host_domain = host_domain.substr(0, slash_pos)
|
||||
|
||||
return ""
|
||||
var response = client.request(gurt_url, {
|
||||
"method": "GET",
|
||||
"headers": {"Host": host_domain}
|
||||
})
|
||||
client.disconnect()
|
||||
|
||||
if not response or not response.is_success:
|
||||
var error_msg = "Failed to load GURT resource"
|
||||
if response:
|
||||
error_msg += ": " + str(response.status_code) + " " + response.status_message
|
||||
print("GURT resource error: ", error_msg)
|
||||
return ""
|
||||
|
||||
return response.body.get_string_from_utf8()
|
||||
|
||||
@@ -67,6 +67,7 @@ static func apply_element_styles(node: Control, element: HTMLParser.HTMLElement,
|
||||
if width is String and width == "100%":
|
||||
node.size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
node.custom_minimum_size.x = 0
|
||||
node.set_meta("size_flags_set_by_style_manager", true)
|
||||
else:
|
||||
node.custom_minimum_size.x = width
|
||||
node.size_flags_horizontal = Control.SIZE_SHRINK_BEGIN
|
||||
@@ -75,6 +76,7 @@ static func apply_element_styles(node: Control, element: HTMLParser.HTMLElement,
|
||||
if height is String and height == "100%":
|
||||
node.size_flags_vertical = Control.SIZE_EXPAND_FILL
|
||||
node.custom_minimum_size.y = 0
|
||||
node.set_meta("size_flags_set_by_style_manager", true)
|
||||
else:
|
||||
node.custom_minimum_size.y = height
|
||||
node.size_flags_vertical = Control.SIZE_SHRINK_BEGIN
|
||||
|
||||
@@ -33,6 +33,3 @@ func load_image_async(src: String, element: HTMLParser.HTMLElement, parser: HTML
|
||||
size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
size_flags_vertical = Control.SIZE_EXPAND_FILL
|
||||
custom_minimum_size = Vector2.ZERO
|
||||
else:
|
||||
custom_minimum_size = Vector2(1, 1)
|
||||
size = Vector2(100, 100) # StyleManager will handle this
|
||||
|
||||
@@ -444,6 +444,16 @@ func apply_input_styles(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
var placeholder_color = Color(text_color.r, text_color.g, text_color.b, text_color.a * 0.6)
|
||||
line_edit.add_theme_color_override("font_placeholder_color", placeholder_color)
|
||||
|
||||
if styles.has("font-size"):
|
||||
var font_size = int(styles["font-size"])
|
||||
if active_child is LineEdit:
|
||||
active_child.add_theme_font_size_override("font_size", font_size)
|
||||
elif active_child is SpinBox:
|
||||
active_child.add_theme_font_size_override("font_size", font_size)
|
||||
var line_edit = active_child.get_line_edit()
|
||||
if line_edit:
|
||||
line_edit.add_theme_font_size_override("font_size", font_size)
|
||||
|
||||
# Apply stylebox for borders, background, padding, etc.
|
||||
if BackgroundUtils.needs_background_wrapper(styles) or active_child is SpinBox:
|
||||
apply_stylebox_to_input(active_child, styles)
|
||||
@@ -464,7 +474,7 @@ func apply_input_styles(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
var new_height = max(active_child.custom_minimum_size.y, active_child.size.y)
|
||||
|
||||
if width:
|
||||
if typeof(width) == TYPE_STRING and width == "100%":
|
||||
if typeof(width) == TYPE_STRING and (width == "100%" or width == "full"):
|
||||
active_child.size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
new_width = 0
|
||||
@@ -483,7 +493,7 @@ func apply_input_styles(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
|
||||
active_child.custom_minimum_size = new_child_size
|
||||
|
||||
if width and not (typeof(width) == TYPE_STRING and width == "100%"):
|
||||
if width and not (typeof(width) == TYPE_STRING and (width == "100%" or width == "full")):
|
||||
active_child.size_flags_horizontal = Control.SIZE_SHRINK_BEGIN
|
||||
if height:
|
||||
active_child.size_flags_vertical = Control.SIZE_SHRINK_BEGIN
|
||||
@@ -494,7 +504,7 @@ func apply_input_styles(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
custom_minimum_size = new_child_size
|
||||
|
||||
# Root Control adjusts size flags to match child
|
||||
if width and not (typeof(width) == TYPE_STRING and width == "100%"):
|
||||
if width and not (typeof(width) == TYPE_STRING and (width == "100%" or width == "full")):
|
||||
size_flags_horizontal = Control.SIZE_SHRINK_BEGIN
|
||||
else:
|
||||
size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
|
||||
@@ -4,18 +4,19 @@ extends HBoxContainer
|
||||
var _element: HTMLParser.HTMLElement
|
||||
var _parser: HTMLParser
|
||||
|
||||
func init(element: HTMLParser.HTMLElement, parser: HTMLParser) -> void:
|
||||
const BROWSER_THEME = preload("res://Scenes/Styles/BrowserText.tres")
|
||||
|
||||
func init(element, parser: HTMLParser) -> void:
|
||||
_element = element
|
||||
_parser = parser
|
||||
size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
size_flags_vertical = Control.SIZE_SHRINK_BEGIN
|
||||
|
||||
mouse_filter = Control.MOUSE_FILTER_PASS
|
||||
|
||||
if get_child_count() > 0:
|
||||
return
|
||||
|
||||
var content_parts = []
|
||||
var current_text = ""
|
||||
|
||||
var element_text = element.text_content
|
||||
var child_texts = []
|
||||
|
||||
@@ -27,7 +28,7 @@ func init(element: HTMLParser.HTMLElement, parser: HTMLParser) -> void:
|
||||
parent_only_text = parent_only_text.replace(child_text, "")
|
||||
|
||||
if not parent_only_text.strip_edges().is_empty():
|
||||
var parent_label = create_styled_label(parent_only_text.strip_edges(), element, parser)
|
||||
create_styled_label(parent_only_text.strip_edges(), element, parser)
|
||||
|
||||
for child in element.children:
|
||||
var child_label = create_styled_label(child.get_bbcode_formatted_text(parser), element, parser)
|
||||
@@ -35,13 +36,30 @@ func init(element: HTMLParser.HTMLElement, parser: HTMLParser) -> void:
|
||||
if contains_hyperlink(child):
|
||||
child_label.meta_clicked.connect(_on_meta_clicked)
|
||||
|
||||
func create_styled_label(text: String, element: HTMLParser.HTMLElement, parser: HTMLParser) -> RichTextLabel:
|
||||
func create_styled_label(text: String, element, parser: HTMLParser) -> RichTextLabel:
|
||||
var label = RichTextLabel.new()
|
||||
|
||||
label.theme = BROWSER_THEME
|
||||
label.focus_mode = Control.FOCUS_ALL
|
||||
label.add_theme_color_override("default_color", Color.BLACK)
|
||||
label.bbcode_enabled = true
|
||||
label.fit_content = true
|
||||
label.vertical_alignment = VERTICAL_ALIGNMENT_CENTER
|
||||
label.selection_enabled = true
|
||||
|
||||
var parent_cursor_shape = Control.CURSOR_IBEAM
|
||||
if element.parent:
|
||||
var parent_styles = parser.get_element_styles_with_inheritance(element.parent, "", [])
|
||||
if parent_styles.has("cursor"):
|
||||
parent_cursor_shape = StyleManager.get_cursor_shape_from_type(parent_styles["cursor"])
|
||||
|
||||
label.mouse_default_cursor_shape = parent_cursor_shape
|
||||
label.mouse_filter = Control.MOUSE_FILTER_PASS
|
||||
|
||||
label.autowrap_mode = TextServer.AUTOWRAP_WORD_SMART
|
||||
label.size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
label.size_flags_vertical = Control.SIZE_SHRINK_CENTER
|
||||
label.bbcode_enabled = true
|
||||
|
||||
add_child(label)
|
||||
|
||||
var styles = parser.get_element_styles_with_inheritance(element, "", [])
|
||||
@@ -51,12 +69,17 @@ func create_styled_label(text: String, element: HTMLParser.HTMLElement, parser:
|
||||
return label
|
||||
|
||||
func _apply_auto_resize_to_label(label: RichTextLabel):
|
||||
if not is_instance_valid(label) or not is_instance_valid(self):
|
||||
return
|
||||
|
||||
if not label.is_inside_tree():
|
||||
await label.tree_entered
|
||||
|
||||
if not is_instance_valid(label) or not is_instance_valid(self):
|
||||
return
|
||||
|
||||
var min_width = 20
|
||||
var max_width = 800
|
||||
var min_height = 30
|
||||
|
||||
label.fit_content = true
|
||||
|
||||
@@ -65,8 +88,11 @@ func _apply_auto_resize_to_label(label: RichTextLabel):
|
||||
|
||||
await get_tree().process_frame
|
||||
|
||||
if not is_instance_valid(label) or not is_instance_valid(self):
|
||||
return
|
||||
|
||||
var natural_width = label.size.x
|
||||
natural_width *= 1.0 # font weight multiplier simplified
|
||||
natural_width *= 1.0
|
||||
|
||||
var desired_width = clampf(natural_width, min_width, max_width)
|
||||
|
||||
@@ -74,6 +100,9 @@ func _apply_auto_resize_to_label(label: RichTextLabel):
|
||||
|
||||
await get_tree().process_frame
|
||||
|
||||
if not is_instance_valid(label) or not is_instance_valid(self):
|
||||
return
|
||||
|
||||
label.custom_minimum_size = Vector2(desired_width, 0)
|
||||
|
||||
label.queue_redraw()
|
||||
@@ -110,18 +139,35 @@ func set_text(new_text: String) -> void:
|
||||
child.queue_free()
|
||||
|
||||
if _element and _parser:
|
||||
var label = create_styled_label(new_text, _element, _parser)
|
||||
create_styled_label(new_text, _element, _parser)
|
||||
else:
|
||||
var label = create_label(new_text)
|
||||
create_label(new_text)
|
||||
|
||||
func create_label(text: String) -> RichTextLabel:
|
||||
var label = RichTextLabel.new()
|
||||
label.text = text
|
||||
|
||||
label.theme = BROWSER_THEME
|
||||
label.focus_mode = Control.FOCUS_ALL
|
||||
label.add_theme_color_override("default_color", Color.BLACK)
|
||||
label.bbcode_enabled = true
|
||||
label.fit_content = true
|
||||
label.vertical_alignment = VERTICAL_ALIGNMENT_CENTER
|
||||
label.selection_enabled = true
|
||||
|
||||
var parent_cursor_shape = Control.CURSOR_IBEAM
|
||||
if _element and _parser and _element.parent:
|
||||
var parent_styles = _parser.get_element_styles_with_inheritance(_element.parent, "", [])
|
||||
if parent_styles.has("cursor"):
|
||||
parent_cursor_shape = StyleManager.get_cursor_shape_from_type(parent_styles["cursor"])
|
||||
|
||||
label.mouse_default_cursor_shape = parent_cursor_shape
|
||||
label.mouse_filter = Control.MOUSE_FILTER_PASS
|
||||
|
||||
label.text = text
|
||||
label.autowrap_mode = TextServer.AUTOWRAP_WORD_SMART
|
||||
label.size_flags_horizontal = Control.SIZE_EXPAND_FILL
|
||||
label.size_flags_vertical = Control.SIZE_SHRINK_CENTER
|
||||
label.bbcode_enabled = true
|
||||
|
||||
add_child(label)
|
||||
call_deferred("_apply_auto_resize_to_label", label)
|
||||
return label
|
||||
|
||||
@@ -195,12 +195,42 @@ static func setup_panel_hover_support(panel: PanelContainer, normal_styles: Dict
|
||||
panel.set_meta("hover_stylebox", hover_stylebox)
|
||||
panel.set_meta("normal_styles", normal_styles.duplicate(true))
|
||||
panel.set_meta("hover_styles", merged_hover_styles.duplicate(true))
|
||||
panel.set_meta("is_hovering", false)
|
||||
|
||||
# Connect mouse events
|
||||
panel.mouse_entered.connect(_on_panel_mouse_entered.bind(panel))
|
||||
panel.mouse_exited.connect(_on_panel_mouse_exited.bind(panel))
|
||||
panel.mouse_exited.connect(_on_panel_mouse_exited_with_delay.bind(panel))
|
||||
|
||||
_setup_child_hover_listeners(panel)
|
||||
|
||||
static func _setup_child_hover_listeners(panel: PanelContainer):
|
||||
for child in panel.get_children():
|
||||
_connect_child_hover_events(child, panel)
|
||||
|
||||
panel.child_entered_tree.connect(_on_child_added.bind(panel))
|
||||
|
||||
static func _connect_child_hover_events(child: Node, panel: PanelContainer):
|
||||
if child is Control:
|
||||
# Only connect if not already connected
|
||||
if not child.mouse_entered.is_connected(_on_child_mouse_entered.bind(panel)):
|
||||
child.mouse_entered.connect(_on_child_mouse_entered.bind(panel))
|
||||
if not child.mouse_exited.is_connected(_on_child_mouse_exited.bind(panel)):
|
||||
child.mouse_exited.connect(_on_child_mouse_exited.bind(panel))
|
||||
|
||||
for grandchild in child.get_children():
|
||||
_connect_child_hover_events(grandchild, panel)
|
||||
|
||||
static func _on_child_added(child: Node, panel: PanelContainer):
|
||||
_connect_child_hover_events(child, panel)
|
||||
|
||||
static func _on_child_mouse_entered(panel: PanelContainer):
|
||||
_on_panel_mouse_entered(panel)
|
||||
|
||||
static func _on_child_mouse_exited(panel: PanelContainer):
|
||||
panel.get_tree().create_timer(0.01).timeout.connect(func(): _check_panel_hover(panel))
|
||||
|
||||
static func _on_panel_mouse_entered(panel: PanelContainer):
|
||||
panel.set_meta("is_hovering", true)
|
||||
if panel.has_meta("hover_stylebox"):
|
||||
var hover_stylebox = panel.get_meta("hover_stylebox")
|
||||
panel.add_theme_stylebox_override("panel", hover_stylebox)
|
||||
@@ -210,15 +240,27 @@ static func _on_panel_mouse_entered(panel: PanelContainer):
|
||||
var transform_target = find_transform_target_for_panel(panel)
|
||||
StyleManager.apply_transform_properties_direct(transform_target, hover_styles)
|
||||
|
||||
static func _on_panel_mouse_exited(panel: PanelContainer):
|
||||
if panel.has_meta("normal_stylebox"):
|
||||
var normal_stylebox = panel.get_meta("normal_stylebox")
|
||||
panel.add_theme_stylebox_override("panel", normal_stylebox)
|
||||
static func _on_panel_mouse_exited_with_delay(panel: PanelContainer):
|
||||
panel.get_tree().create_timer(0.01).timeout.connect(func(): _check_panel_hover(panel))
|
||||
|
||||
static func _check_panel_hover(panel: PanelContainer):
|
||||
if not panel or not is_instance_valid(panel):
|
||||
return
|
||||
|
||||
if panel.has_meta("normal_styles"):
|
||||
var normal_styles = panel.get_meta("normal_styles")
|
||||
var transform_target = find_transform_target_for_panel(panel)
|
||||
StyleManager.apply_transform_properties_direct(transform_target, normal_styles)
|
||||
var mouse_pos = panel.get_global_mouse_position()
|
||||
var panel_rect = panel.get_global_rect()
|
||||
var is_mouse_over = panel_rect.has_point(mouse_pos)
|
||||
|
||||
if not is_mouse_over and panel.get_meta("is_hovering", false):
|
||||
panel.set_meta("is_hovering", false)
|
||||
if panel.has_meta("normal_stylebox"):
|
||||
var normal_stylebox = panel.get_meta("normal_stylebox")
|
||||
panel.add_theme_stylebox_override("panel", normal_stylebox)
|
||||
|
||||
if panel.has_meta("normal_styles"):
|
||||
var normal_styles = panel.get_meta("normal_styles")
|
||||
var transform_target = find_transform_target_for_panel(panel)
|
||||
StyleManager.apply_transform_properties_direct(transform_target, normal_styles)
|
||||
|
||||
static func find_transform_target_for_panel(panel: PanelContainer) -> Control:
|
||||
var parent = panel.get_parent()
|
||||
|
||||
@@ -64,6 +64,7 @@ static func apply_flex_container_properties(node, styles: Dictionary) -> void:
|
||||
if width_val == "full" or width_val == "100%":
|
||||
# For flex containers, w-full should expand to fill parent
|
||||
node.set_meta("should_fill_horizontal", true)
|
||||
node.set_meta("size_flags_set_by_style_manager", true)
|
||||
elif typeof(width_val) == TYPE_STRING and width_val.ends_with("%"):
|
||||
node.set_meta("custom_css_width_percentage", width_val)
|
||||
else:
|
||||
@@ -73,6 +74,7 @@ static func apply_flex_container_properties(node, styles: Dictionary) -> void:
|
||||
if height_val == "full":
|
||||
# For flex containers, h-full should expand to fill parent
|
||||
node.set_meta("should_fill_vertical", true)
|
||||
node.set_meta("size_flags_set_by_style_manager", true)
|
||||
elif typeof(height_val) == TYPE_STRING and height_val.ends_with("%"):
|
||||
node.set_meta("custom_css_height_percentage", height_val)
|
||||
else:
|
||||
|
||||
@@ -278,7 +278,7 @@ static func update_div_hover_styles(dom_node: Control, element: HTMLParser.HTMLE
|
||||
|
||||
if dom_node.mouse_entered.is_connected(BackgroundUtils._on_panel_mouse_entered):
|
||||
dom_node.mouse_entered.disconnect(BackgroundUtils._on_panel_mouse_entered)
|
||||
if dom_node.mouse_exited.is_connected(BackgroundUtils._on_panel_mouse_exited):
|
||||
dom_node.mouse_exited.disconnect(BackgroundUtils._on_panel_mouse_exited)
|
||||
if dom_node.mouse_exited.is_connected(BackgroundUtils._on_panel_mouse_exited_with_delay):
|
||||
dom_node.mouse_exited.disconnect(BackgroundUtils._on_panel_mouse_exited_with_delay)
|
||||
|
||||
update_element_text_content(dom_node, element, dom_parser)
|
||||
|
||||
@@ -640,6 +640,12 @@ static func add_element_methods(vm: LuauVM, lua_api: LuaAPI) -> void:
|
||||
vm.lua_pushcallable(LuaDOMUtils._element_hide_wrapper, "element.hide")
|
||||
vm.lua_setfield(-2, "hide")
|
||||
|
||||
vm.lua_pushcallable(LuaDOMUtils._element_focus_wrapper, "element.focus")
|
||||
vm.lua_setfield(-2, "focus")
|
||||
|
||||
vm.lua_pushcallable(LuaDOMUtils._element_unfocus_wrapper, "element.unfocus")
|
||||
vm.lua_setfield(-2, "unfocus")
|
||||
|
||||
_add_classlist_support(vm, lua_api)
|
||||
|
||||
vm.lua_newtable()
|
||||
@@ -1420,6 +1426,48 @@ static func _element_hide_wrapper(vm: LuauVM) -> int:
|
||||
|
||||
return 0
|
||||
|
||||
static func _element_focus_wrapper(vm: LuauVM) -> int:
|
||||
var lua_api = vm.get_meta("lua_api") as LuaAPI
|
||||
if not lua_api:
|
||||
vm.lua_pushboolean(false)
|
||||
return 1
|
||||
|
||||
vm.luaL_checktype(1, vm.LUA_TTABLE)
|
||||
|
||||
vm.lua_getfield(1, "_element_id")
|
||||
var element_id: String = vm.lua_tostring(-1)
|
||||
vm.lua_pop(1)
|
||||
|
||||
var operation = {
|
||||
"type": "focus_element",
|
||||
"element_id": element_id
|
||||
}
|
||||
|
||||
emit_dom_operation(lua_api, operation)
|
||||
vm.lua_pushboolean(true)
|
||||
return 1
|
||||
|
||||
static func _element_unfocus_wrapper(vm: LuauVM) -> int:
|
||||
var lua_api = vm.get_meta("lua_api") as LuaAPI
|
||||
if not lua_api:
|
||||
vm.lua_pushboolean(false)
|
||||
return 1
|
||||
|
||||
vm.luaL_checktype(1, vm.LUA_TTABLE)
|
||||
|
||||
vm.lua_getfield(1, "_element_id")
|
||||
var element_id: String = vm.lua_tostring(-1)
|
||||
vm.lua_pop(1)
|
||||
|
||||
var operation = {
|
||||
"type": "unfocus_element",
|
||||
"element_id": element_id
|
||||
}
|
||||
|
||||
emit_dom_operation(lua_api, operation)
|
||||
vm.lua_pushboolean(true)
|
||||
return 1
|
||||
|
||||
static func _element_create_tween_wrapper(vm: LuauVM) -> int:
|
||||
var lua_api = vm.get_meta("lua_api") as LuaAPI
|
||||
if not lua_api:
|
||||
|
||||
@@ -118,6 +118,12 @@ static func string_replace_all_handler(vm: LuauVM) -> int:
|
||||
|
||||
return 1
|
||||
|
||||
static func string_trim_handler(vm: LuauVM) -> int:
|
||||
var subject: String = vm.luaL_checkstring(1)
|
||||
var trimmed = subject.strip_edges()
|
||||
vm.lua_pushstring(trimmed)
|
||||
return 1
|
||||
|
||||
static func setup_regex_api(vm: LuauVM) -> void:
|
||||
vm.lua_newtable()
|
||||
|
||||
@@ -139,4 +145,7 @@ static func setup_regex_api(vm: LuauVM) -> void:
|
||||
vm.lua_pushcallable(string_replace_all_handler, "string.replaceAll")
|
||||
vm.lua_setfield(-2, "replaceAll")
|
||||
|
||||
vm.lua_pushcallable(string_trim_handler, "string.trim")
|
||||
vm.lua_setfield(-2, "trim")
|
||||
|
||||
vm.lua_pop(1)
|
||||
|
||||
@@ -53,6 +53,7 @@ func stop_lua_thread():
|
||||
while lua_thread.is_alive() and (Time.get_ticks_msec() - timeout_start) < 500:
|
||||
OS.delay_msec(10)
|
||||
|
||||
lua_thread.wait_to_finish()
|
||||
lua_thread = null
|
||||
|
||||
func execute_script_async(script_code: String):
|
||||
@@ -356,6 +357,7 @@ func _setup_additional_lua_apis():
|
||||
LuaAudioUtils.setup_audio_api(lua_vm)
|
||||
LuaCrumbsUtils.setup_crumbs_api(lua_vm)
|
||||
LuaRegexUtils.setup_regex_api(lua_vm)
|
||||
LuaURLUtils.setup_url_api(lua_vm)
|
||||
|
||||
func _table_tostring_handler(vm: LuauVM) -> int:
|
||||
vm.luaL_checktype(1, vm.LUA_TTABLE)
|
||||
|
||||
21
flumi/Scripts/Utils/Lua/URL.gd
Normal file
21
flumi/Scripts/Utils/Lua/URL.gd
Normal file
@@ -0,0 +1,21 @@
|
||||
class_name LuaURLUtils
|
||||
extends RefCounted
|
||||
|
||||
static func url_encode_handler(vm: LuauVM) -> int:
|
||||
var input: String = vm.luaL_checkstring(1)
|
||||
var encoded = input.uri_encode()
|
||||
vm.lua_pushstring(encoded)
|
||||
return 1
|
||||
|
||||
static func url_decode_handler(vm: LuauVM) -> int:
|
||||
var input: String = vm.luaL_checkstring(1)
|
||||
var decoded = input.uri_decode()
|
||||
vm.lua_pushstring(decoded)
|
||||
return 1
|
||||
|
||||
static func setup_url_api(vm: LuauVM) -> void:
|
||||
vm.lua_pushcallable(url_encode_handler, "urlEncode")
|
||||
vm.lua_setglobal("urlEncode")
|
||||
|
||||
vm.lua_pushcallable(url_decode_handler, "urlDecode")
|
||||
vm.lua_setglobal("urlDecode")
|
||||
1
flumi/Scripts/Utils/Lua/URL.gd.uid
Normal file
1
flumi/Scripts/Utils/Lua/URL.gd.uid
Normal file
@@ -0,0 +1 @@
|
||||
uid://bjiiw0qfqg2he
|
||||
@@ -83,33 +83,78 @@ func _on_search_submitted(url: String) -> void:
|
||||
var tab = tab_container.tabs[tab_container.active_tab]
|
||||
tab.start_loading()
|
||||
|
||||
var result = await GurtProtocol.handle_gurt_domain(url)
|
||||
var gurt_url = url
|
||||
if not gurt_url.begins_with("gurt://"):
|
||||
gurt_url = "gurt://" + gurt_url
|
||||
|
||||
if result.has("error"):
|
||||
print("GURT domain error: ", result.error)
|
||||
const GLOBE_ICON = preload("res://Assets/Icons/globe.svg")
|
||||
tab.stop_loading()
|
||||
tab.set_icon(GLOBE_ICON)
|
||||
return
|
||||
|
||||
var html_bytes = result.html
|
||||
|
||||
if result.has("display_url"):
|
||||
current_domain = result.display_url
|
||||
if not current_domain.begins_with("gurt://"):
|
||||
current_domain = "gurt://" + current_domain
|
||||
if not search_bar.has_focus():
|
||||
search_bar.text = result.display_url # Show clean version in search bar
|
||||
else:
|
||||
current_domain = url
|
||||
|
||||
render_content(html_bytes)
|
||||
|
||||
# Stop loading spinner after successful render
|
||||
tab.stop_loading()
|
||||
await fetch_gurt_content_async(gurt_url, tab, url)
|
||||
else:
|
||||
print("Non-GURT URL entered: ", url)
|
||||
|
||||
func fetch_gurt_content_async(gurt_url: String, tab: Tab, original_url: String) -> void:
|
||||
var thread = Thread.new()
|
||||
var request_data = {"gurt_url": gurt_url}
|
||||
|
||||
thread.start(_perform_gurt_request_threaded.bind(request_data))
|
||||
|
||||
while thread.is_alive():
|
||||
await get_tree().process_frame
|
||||
|
||||
var result = thread.wait_to_finish()
|
||||
|
||||
_handle_gurt_result(result, tab, original_url, gurt_url)
|
||||
|
||||
func _perform_gurt_request_threaded(request_data: Dictionary) -> Dictionary:
|
||||
var gurt_url: String = request_data.gurt_url
|
||||
var client = GurtProtocolClient.new()
|
||||
|
||||
for ca_cert in CertificateManager.trusted_ca_certificates:
|
||||
client.add_ca_certificate(ca_cert)
|
||||
|
||||
if not client.create_client_with_dns(30, GurtProtocol.DNS_SERVER_IP, GurtProtocol.DNS_SERVER_PORT):
|
||||
client.disconnect()
|
||||
return {"success": false, "error": "Failed to connect to GURT DNS server"}
|
||||
|
||||
var response = client.request(gurt_url, {
|
||||
"method": "GET"
|
||||
})
|
||||
client.disconnect()
|
||||
|
||||
if not response or not response.is_success:
|
||||
var error_msg = "Connection failed"
|
||||
if response:
|
||||
error_msg = "GURT %d: %s" % [response.status_code, response.status_message]
|
||||
elif not response:
|
||||
error_msg = "Request timed out or connection failed"
|
||||
return {"success": false, "error": error_msg}
|
||||
|
||||
return {"success": true, "html_bytes": response.body}
|
||||
|
||||
func _handle_gurt_result(result: Dictionary, tab: Tab, original_url: String, gurt_url: String) -> void:
|
||||
if not result.success:
|
||||
print("GURT request failed: ", result.error)
|
||||
handle_gurt_error(result.error, tab)
|
||||
return
|
||||
|
||||
var html_bytes = result.html_bytes
|
||||
|
||||
current_domain = gurt_url
|
||||
if not search_bar.has_focus():
|
||||
search_bar.text = original_url # Show the original input in search bar
|
||||
|
||||
render_content(html_bytes)
|
||||
|
||||
tab.stop_loading()
|
||||
|
||||
func handle_gurt_error(error_message: String, tab: Tab) -> void:
|
||||
var error_html = GurtProtocol.create_error_page(error_message)
|
||||
|
||||
render_content(error_html)
|
||||
|
||||
const GLOBE_ICON = preload("res://Assets/Icons/globe.svg")
|
||||
tab.stop_loading()
|
||||
tab.set_icon(GLOBE_ICON)
|
||||
|
||||
func _on_search_focus_entered() -> void:
|
||||
if not current_domain.is_empty():
|
||||
search_bar.text = current_domain
|
||||
@@ -298,7 +343,7 @@ func create_element_node(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
|
||||
if is_grid_container:
|
||||
if element.tag_name == "div":
|
||||
if BackgroundUtils.needs_background_wrapper(styles) or hover_styles.size() > 0:
|
||||
if BackgroundUtils.needs_background_wrapper(styles) or BackgroundUtils.needs_background_wrapper(hover_styles):
|
||||
final_node = BackgroundUtils.create_panel_container_with_background(styles, hover_styles)
|
||||
var grid_container = GridContainer.new()
|
||||
grid_container.name = "Grid_" + element.tag_name
|
||||
@@ -316,21 +361,24 @@ func create_element_node(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
elif is_flex_container:
|
||||
# The element's primary identity IS a flex container.
|
||||
if element.tag_name == "div":
|
||||
if BackgroundUtils.needs_background_wrapper(styles) or hover_styles.size() > 0:
|
||||
if BackgroundUtils.needs_background_wrapper(styles) or BackgroundUtils.needs_background_wrapper(hover_styles):
|
||||
final_node = BackgroundUtils.create_panel_container_with_background(styles, hover_styles)
|
||||
var flex_container = AUTO_SIZING_FLEX_CONTAINER.new()
|
||||
flex_container.name = "Flex_" + element.tag_name
|
||||
var vbox = final_node.get_child(0) as VBoxContainer
|
||||
vbox.add_child(flex_container)
|
||||
container_for_children = flex_container
|
||||
FlexUtils.apply_flex_container_properties(flex_container, styles)
|
||||
else:
|
||||
final_node = AUTO_SIZING_FLEX_CONTAINER.new()
|
||||
final_node.name = "Flex_" + element.tag_name
|
||||
container_for_children = final_node
|
||||
FlexUtils.apply_flex_container_properties(final_node, styles)
|
||||
else:
|
||||
final_node = AUTO_SIZING_FLEX_CONTAINER.new()
|
||||
final_node.name = "Flex_" + element.tag_name
|
||||
container_for_children = final_node
|
||||
FlexUtils.apply_flex_container_properties(final_node, styles)
|
||||
|
||||
# For FLEX ul/ol elements, we need to create the li children directly in the flex container
|
||||
if element.tag_name == "ul" or element.tag_name == "ol":
|
||||
@@ -351,7 +399,8 @@ func create_element_node(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
# If the element itself has text (like <span style="flex">TEXT</span>)
|
||||
elif not element.text_content.is_empty():
|
||||
var new_node = await create_element_node_internal(element, parser)
|
||||
container_for_children.add_child(new_node)
|
||||
if new_node:
|
||||
container_for_children.add_child(new_node)
|
||||
# For flex divs, we're done - no additional node creation needed
|
||||
elif element.tag_name == "div":
|
||||
pass
|
||||
@@ -369,26 +418,6 @@ func create_element_node(element: HTMLParser.HTMLElement, parser: HTMLParser) ->
|
||||
# Applies background, size, etc. to the FlexContainer (top-level node)
|
||||
final_node = StyleManager.apply_element_styles(final_node, element, parser)
|
||||
|
||||
# Apply flex CONTAINER properties if it's a flex container
|
||||
if is_flex_container:
|
||||
var flex_container_node = final_node
|
||||
|
||||
if final_node is FlexContainer:
|
||||
# Direct FlexContainer
|
||||
flex_container_node = final_node
|
||||
elif final_node is MarginContainer and final_node.get_child_count() > 0:
|
||||
var first_child = final_node.get_child(0)
|
||||
if first_child is FlexContainer:
|
||||
flex_container_node = first_child
|
||||
elif final_node is PanelContainer and final_node.get_child_count() > 0:
|
||||
var vbox = final_node.get_child(0)
|
||||
if vbox is VBoxContainer and vbox.get_child_count() > 0:
|
||||
var potential_flex = vbox.get_child(0)
|
||||
if potential_flex is FlexContainer:
|
||||
flex_container_node = potential_flex
|
||||
|
||||
if flex_container_node is FlexContainer:
|
||||
FlexUtils.apply_flex_container_properties(flex_container_node, styles)
|
||||
|
||||
if is_grid_container:
|
||||
var grid_container_node = final_node
|
||||
@@ -528,7 +557,7 @@ func create_element_node_internal(element: HTMLParser.HTMLElement, parser: HTMLP
|
||||
return null
|
||||
|
||||
# Create div container
|
||||
if BackgroundUtils.needs_background_wrapper(styles) or hover_styles.size() > 0:
|
||||
if BackgroundUtils.needs_background_wrapper(styles) or BackgroundUtils.needs_background_wrapper(hover_styles):
|
||||
node = BackgroundUtils.create_panel_container_with_background(styles, hover_styles)
|
||||
else:
|
||||
node = DIV.instantiate()
|
||||
|
||||
Binary file not shown.
BIN
flumi/addons/gurt-protocol/bin/windows/~gurt_godot.dll
Normal file
BIN
flumi/addons/gurt-protocol/bin/windows/~gurt_godot.dll
Normal file
Binary file not shown.
@@ -320,7 +320,13 @@ impl RequestHandler {
|
||||
}
|
||||
|
||||
pub async fn handle_file_request(&self, request_path: &str) -> std::result::Result<GurtResponse, GurtError> {
|
||||
let mut relative_path = request_path.strip_prefix('/').unwrap_or(request_path).to_string();
|
||||
let path_without_query = if let Some(query_start) = request_path.find('?') {
|
||||
&request_path[..query_start]
|
||||
} else {
|
||||
request_path
|
||||
};
|
||||
|
||||
let mut relative_path = path_without_query.strip_prefix('/').unwrap_or(path_without_query).to_string();
|
||||
|
||||
while relative_path.starts_with('/') || relative_path.starts_with('\\') {
|
||||
relative_path = relative_path[1..].to_string();
|
||||
|
||||
@@ -131,6 +131,31 @@ impl GurtProtocolClient {
|
||||
true
|
||||
}
|
||||
|
||||
#[func]
|
||||
fn create_client_with_dns(&mut self, timeout_seconds: i32, dns_ip: GString, dns_port: i32) -> bool {
|
||||
let runtime = match Runtime::new() {
|
||||
Ok(rt) => rt,
|
||||
Err(e) => {
|
||||
godot_print!("Failed to create runtime: {}", e);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
let mut config = GurtClientConfig::default();
|
||||
config.request_timeout = tokio::time::Duration::from_secs(timeout_seconds as u64);
|
||||
config.dns_server_ip = dns_ip.to_string();
|
||||
config.dns_server_port = dns_port as u16;
|
||||
|
||||
config.custom_ca_certificates = self.ca_certificates.borrow().clone();
|
||||
|
||||
let client = GurtClient::with_config(config);
|
||||
|
||||
*self.runtime.borrow_mut() = Some(runtime);
|
||||
*self.client.borrow_mut() = Some(client);
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
#[func]
|
||||
fn request(&self, url: GString, options: Dictionary) -> Option<Gd<GurtGDResponse>> {
|
||||
let runtime_binding = self.runtime.borrow();
|
||||
@@ -162,7 +187,16 @@ impl GurtProtocolClient {
|
||||
};
|
||||
|
||||
let port = parsed_url.port().unwrap_or(4878);
|
||||
let path = if parsed_url.path().is_empty() { "/" } else { parsed_url.path() };
|
||||
let path_with_query = if parsed_url.path().is_empty() {
|
||||
"/"
|
||||
} else {
|
||||
parsed_url.path()
|
||||
};
|
||||
|
||||
let path = match parsed_url.query() {
|
||||
Some(query) => format!("{}?{}", path_with_query, query),
|
||||
None => path_with_query.to_string(),
|
||||
};
|
||||
|
||||
let method_str = options.get("method").unwrap_or("GET".to_variant()).to::<String>();
|
||||
let method = match method_str.to_uppercase().as_str() {
|
||||
@@ -192,7 +226,6 @@ impl GurtProtocolClient {
|
||||
let headers_dict = options.get("headers").unwrap_or(Dictionary::new().to_variant()).to::<Dictionary>();
|
||||
|
||||
let mut request = GurtRequest::new(method, path.to_string())
|
||||
.with_header("Host", host)
|
||||
.with_header("User-Agent", "GURT-Client/1.0.0");
|
||||
|
||||
for key_variant in headers_dict.keys_array().iter_shared() {
|
||||
|
||||
4407
search-engine/Cargo.lock
generated
Normal file
4407
search-engine/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
31
search-engine/Cargo.toml
Normal file
31
search-engine/Cargo.toml
Normal file
@@ -0,0 +1,31 @@
|
||||
[package]
|
||||
name = "gurted-search-engine"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
tokio = { version = "1.38.0", features = ["full"] }
|
||||
futures = "0.3.30"
|
||||
tantivy = "0.22"
|
||||
sha2 = "0.10"
|
||||
gurt = { path = "../protocol/library" }
|
||||
sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "chrono", "uuid"] }
|
||||
scraper = "0.20"
|
||||
lol_html = "1.2"
|
||||
url = "2.5"
|
||||
toml = "0.8.13"
|
||||
serde = { version = "1.0.203", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
anyhow = "1.0.86"
|
||||
thiserror = "1.0"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
uuid = { version = "1.0", features = ["v4"] }
|
||||
regex = "1.10.4"
|
||||
mime = "0.3"
|
||||
base64 = "0.22"
|
||||
glob = "0.3"
|
||||
clap = { version = "4.5.4", features = ["derive"] }
|
||||
urlencoding = "2.1"
|
||||
reqwest = "0.11"
|
||||
5
search-engine/README.md
Normal file
5
search-engine/README.md
Normal file
@@ -0,0 +1,5 @@
|
||||
The official Gurted search engine, Ringle.
|
||||
|
||||
Copy `config.template.toml` to `config.toml` and edit as needed.
|
||||
|
||||
Run with `cargo run`
|
||||
51
search-engine/config.template.toml
Normal file
51
search-engine/config.template.toml
Normal file
@@ -0,0 +1,51 @@
|
||||
[database]
|
||||
url = "postgres://..."
|
||||
max_connections = 5
|
||||
|
||||
[server]
|
||||
address = "127.0.0.1"
|
||||
port = 4879
|
||||
cert_path = "certs/t.crt"
|
||||
key_path = "certs/t.key"
|
||||
|
||||
[search]
|
||||
index_path = "./search_indexes"
|
||||
crawl_interval_hours = 2
|
||||
max_pages_per_domain = 1000
|
||||
crawler_timeout_seconds = 30
|
||||
crawler_user_agent = "GurtedSearchBot/1.0"
|
||||
max_concurrent_crawls = 5
|
||||
content_size_limit_mb = 10
|
||||
index_rebuild_interval_hours = 48
|
||||
search_results_per_page = 20
|
||||
max_search_results = 1000
|
||||
|
||||
allowed_extensions = [
|
||||
"html", "htm", "txt", "md", "json", "xml", "rss", "atom"
|
||||
]
|
||||
|
||||
blocked_extensions = [
|
||||
"exe", "zip", "rar", "tar", "gz", "7z", "iso", "dmg",
|
||||
"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
|
||||
"jpg", "jpeg", "png", "gif", "bmp", "svg", "webp",
|
||||
"mp3", "mp4", "avi", "mov", "wmv", "flv", "webm",
|
||||
"css", "js", "woff", "woff2", "ttf", "eot"
|
||||
]
|
||||
|
||||
[crawler]
|
||||
clanker_txt = true
|
||||
crawl_delay_ms = 1000
|
||||
max_redirects = 5
|
||||
follow_external_links = false
|
||||
max_depth = 10
|
||||
|
||||
request_headers = [
|
||||
["Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"],
|
||||
["Accept-Language", "en-US,en;q=0.5"],
|
||||
["Accept-Encoding", "gzip, deflate"],
|
||||
["DNT", "1"],
|
||||
]
|
||||
|
||||
[logging]
|
||||
level = "info"
|
||||
format = "compact"
|
||||
96
search-engine/frontend/search.html
Normal file
96
search-engine/frontend/search.html
Normal file
@@ -0,0 +1,96 @@
|
||||
<head>
|
||||
<title>Gurted Search Engine</title>
|
||||
<meta name="description" content="Search across all registered GURT domains">
|
||||
<meta name="theme-color" content="#000000">
|
||||
<icon src="https://cdn-icons-png.flaticon.com/512/295/295128.png">
|
||||
|
||||
<style>
|
||||
body {
|
||||
font-sans bg-[#000000] text-[#ffffff] min-h-screen flex flex-col items-center p-8
|
||||
}
|
||||
|
||||
.search-container {
|
||||
max-w-2xl w-full flex flex-col gap-8
|
||||
}
|
||||
|
||||
.logo {
|
||||
text-center text-5xl font-bold text-[#dc2626] mb-8
|
||||
}
|
||||
|
||||
.search-box {
|
||||
flex flex-col gap-4
|
||||
}
|
||||
|
||||
.search-input {
|
||||
w-[600px] p-3 text-base bg-[#1a1a1a] border-2 border-[#333333] rounded-lg text-[#ffffff] outline-none active:border-[#dc2626]
|
||||
}
|
||||
|
||||
.search-btn {
|
||||
bg-[#1a1a1a] text-[#ffffff] px-4 py-2 rounded text-sm border border-[#333333] cursor-pointer hover:border-[#dc2626] hover:shadow-sm
|
||||
}
|
||||
|
||||
.results {
|
||||
w-full flex flex-col gap-4 mt-8
|
||||
}
|
||||
|
||||
.result-item {
|
||||
p-4 cursor-pointer hover:bg-[#0a0a0a] rounded
|
||||
}
|
||||
|
||||
.result-header {
|
||||
inline-flex gap-2 mb-2
|
||||
}
|
||||
|
||||
.result-icon {
|
||||
w-6 h-6 min-w-6 min-h-6 rounded border border-[#333333] bg-[#1a1a1a] object-cover mt-1
|
||||
}
|
||||
|
||||
.result-content {
|
||||
flex-1 min-w-0
|
||||
}
|
||||
|
||||
.result-title {
|
||||
text-xl font-normal text-[#dc2626] mb-1 hover:underline
|
||||
}
|
||||
|
||||
.result-url {
|
||||
text-[#4ade80] text-sm mb-2
|
||||
}
|
||||
|
||||
.result-preview {
|
||||
text-[#cccccc] text-sm leading-relaxed
|
||||
}
|
||||
|
||||
.stats {
|
||||
text-left text-[#999999] text-sm mt-4 ml-1
|
||||
}
|
||||
|
||||
.loading {
|
||||
text-center text-[#999999] mt-8
|
||||
}
|
||||
</style>
|
||||
|
||||
<script src="search.lua" />
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div style="search-container">
|
||||
<h1 style="logo">Ringle</h1>
|
||||
|
||||
<div style="w-full flex flex-col gap-4 items-center justify-center content-center">
|
||||
<input type="text" id="searchQuery" style="search-input" placeholder="Search across the Gurted..." />
|
||||
<div style="inline-flex gap-2 items-center justify-center">
|
||||
<button id="searchButton" style="search-btn">Search</button>
|
||||
<button id="luckyButton" style="search-btn">I'm feeling lucky</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="loading" style="loading hidden">
|
||||
Searching...
|
||||
</div>
|
||||
|
||||
<div id="results" style="results"></div>
|
||||
|
||||
<div id="stats" style="stats"></div>
|
||||
</div>
|
||||
</body>
|
||||
150
search-engine/frontend/search.lua
Normal file
150
search-engine/frontend/search.lua
Normal file
@@ -0,0 +1,150 @@
|
||||
local searchBtn = gurt.select('#searchButton')
|
||||
local luckyBtn = gurt.select('#luckyButton')
|
||||
local searchQuery = gurt.select('#searchQuery')
|
||||
local loading = gurt.select('#loading')
|
||||
local results = gurt.select('#results')
|
||||
local stats = gurt.select('#stats')
|
||||
|
||||
local function showLoading()
|
||||
loading.classList:remove('hidden')
|
||||
results.text = ''
|
||||
stats.text = ''
|
||||
end
|
||||
|
||||
local function displayResults(data)
|
||||
loading.classList:add('hidden')
|
||||
results.text = ''
|
||||
|
||||
if not data.results or #data.results == 0 then
|
||||
local noResultsItem = gurt.create('div', {
|
||||
text = 'No results found for your query.',
|
||||
style = 'result-item'
|
||||
})
|
||||
results:append(noResultsItem)
|
||||
stats.text = 'No results found'
|
||||
return
|
||||
end
|
||||
|
||||
for i, result in ipairs(data.results) do
|
||||
local resultItem = gurt.create('div', { style = 'result-item' })
|
||||
|
||||
resultItem:on('click', function()
|
||||
gurt.location.goto(result.url)
|
||||
end)
|
||||
|
||||
local headerDiv = gurt.create('div', { style = 'result-header' })
|
||||
|
||||
if result.icon and result.icon ~= '' then
|
||||
local iconImg = gurt.create('img', {
|
||||
src = result.icon,
|
||||
style = 'result-icon',
|
||||
alt = 'Site icon'
|
||||
})
|
||||
headerDiv:append(iconImg)
|
||||
end
|
||||
|
||||
local titleDiv = gurt.create('p', {
|
||||
text = result.title or result.url,
|
||||
style = 'result-title'
|
||||
})
|
||||
|
||||
headerDiv:append(titleDiv)
|
||||
|
||||
local urlDiv = gurt.create('p', {
|
||||
text = result.url,
|
||||
style = 'result-url'
|
||||
})
|
||||
|
||||
local previewText = result.preview or result.description or ''
|
||||
if #previewText > 150 then
|
||||
previewText = previewText:sub(1, 147) .. '...'
|
||||
end
|
||||
|
||||
local previewDiv = gurt.create('p', {
|
||||
text = previewText,
|
||||
style = 'result-preview'
|
||||
})
|
||||
|
||||
resultItem:append(headerDiv)
|
||||
resultItem:append(urlDiv)
|
||||
resultItem:append(previewDiv)
|
||||
|
||||
results:append(resultItem)
|
||||
end
|
||||
|
||||
local resultCount = #data.results
|
||||
local totalResults = data.total_results or resultCount
|
||||
stats.text = 'Found ' .. totalResults .. ' result' .. (totalResults == 1 and '' or 's')
|
||||
end
|
||||
|
||||
local function performSearch(query)
|
||||
if not query or query == '' then
|
||||
return
|
||||
end
|
||||
|
||||
showLoading()
|
||||
|
||||
local url = '/api/search?q=' .. urlEncode(query) .. '&per_page=20'
|
||||
local response = fetch(url, {
|
||||
method = 'GET'
|
||||
})
|
||||
|
||||
if response:ok() then
|
||||
local data = response:json()
|
||||
displayResults(data)
|
||||
else
|
||||
loading.classList:add('hidden')
|
||||
results.text = ''
|
||||
stats.text = 'Search failed: ' .. response.status .. ' ' .. response.statusText
|
||||
end
|
||||
end
|
||||
|
||||
local function performLuckySearch()
|
||||
showLoading()
|
||||
|
||||
local luckyTerms = {'test', 'demo', 'api', 'web', 'site', 'page', 'home', 'index'}
|
||||
local randomTerm = luckyTerms[math.random(#luckyTerms)]
|
||||
|
||||
local url = '/api/search?q=' .. urlEncode(randomTerm) .. '&per_page=50'
|
||||
local response = fetch(url, {
|
||||
method = 'GET'
|
||||
})
|
||||
|
||||
if response:ok() then
|
||||
local data = response:json()
|
||||
if data.results and #data.results > 0 then
|
||||
local randomResult = data.results[math.random(#data.results)]
|
||||
gurt.location.goto(randomResult.url)
|
||||
else
|
||||
loading.classList:add('hidden')
|
||||
results.text = ''
|
||||
stats.text = 'No sites available for lucky search'
|
||||
end
|
||||
else
|
||||
loading.classList:add('hidden')
|
||||
results.text = ''
|
||||
stats.text = 'Lucky search failed'
|
||||
end
|
||||
end
|
||||
|
||||
searchBtn:on('click', function()
|
||||
local query = searchQuery.value
|
||||
if query and query ~= '' then
|
||||
performSearch(query:trim())
|
||||
end
|
||||
end)
|
||||
|
||||
luckyBtn:on('click', function()
|
||||
performLuckySearch()
|
||||
end)
|
||||
|
||||
searchQuery:on('keydown', function(e)
|
||||
if e.key == 'Enter' then
|
||||
local query = searchQuery.value
|
||||
if query and query ~= '' then
|
||||
performSearch(query:trim())
|
||||
end
|
||||
end
|
||||
end)
|
||||
|
||||
searchQuery:focus()
|
||||
121
search-engine/src/config.rs
Normal file
121
search-engine/src/config.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct Config {
|
||||
pub database: DatabaseConfig,
|
||||
pub server: ServerConfig,
|
||||
pub search: SearchConfig,
|
||||
pub crawler: CrawlerConfig,
|
||||
pub logging: LoggingConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct DatabaseConfig {
|
||||
pub url: String,
|
||||
pub max_connections: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct ServerConfig {
|
||||
pub address: String,
|
||||
pub port: u16,
|
||||
pub cert_path: PathBuf,
|
||||
pub key_path: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct SearchConfig {
|
||||
pub index_path: PathBuf,
|
||||
pub crawl_interval_hours: u64,
|
||||
pub max_pages_per_domain: usize,
|
||||
pub crawler_timeout_seconds: u64,
|
||||
pub crawler_user_agent: String,
|
||||
pub max_concurrent_crawls: usize,
|
||||
pub content_size_limit_mb: usize,
|
||||
pub index_rebuild_interval_hours: u64,
|
||||
pub search_results_per_page: usize,
|
||||
pub max_search_results: usize,
|
||||
pub allowed_extensions: Vec<String>,
|
||||
pub blocked_extensions: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct CrawlerConfig {
|
||||
pub clanker_txt: bool,
|
||||
pub crawl_delay_ms: u64,
|
||||
pub max_redirects: usize,
|
||||
pub follow_external_links: bool,
|
||||
pub max_depth: usize,
|
||||
pub request_headers: Vec<(String, String)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct LoggingConfig {
|
||||
pub level: String,
|
||||
pub format: String,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
pub fn load_from_file(path: &str) -> anyhow::Result<Self> {
|
||||
let content = std::fs::read_to_string(path)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to read config file {}: {}", path, e))?;
|
||||
|
||||
let config: Config = toml::from_str(&content)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to parse config file {}: {}", path, e))?;
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
pub fn database_url(&self) -> &str {
|
||||
&self.database.url
|
||||
}
|
||||
|
||||
pub fn server_bind_address(&self) -> String {
|
||||
format!("{}:{}", self.server.address, self.server.port)
|
||||
}
|
||||
|
||||
pub fn gurt_protocol_url(&self) -> String {
|
||||
format!("gurt://{}:{}", self.server.address, self.server.port)
|
||||
}
|
||||
|
||||
pub fn is_allowed_extension(&self, extension: &str) -> bool {
|
||||
if self.is_blocked_extension(extension) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if self.search.allowed_extensions.is_empty() {
|
||||
return true;
|
||||
}
|
||||
self.search.allowed_extensions.iter()
|
||||
.any(|ext| ext.eq_ignore_ascii_case(extension))
|
||||
}
|
||||
|
||||
pub fn is_blocked_extension(&self, extension: &str) -> bool {
|
||||
self.search.blocked_extensions.iter()
|
||||
.any(|ext| ext.eq_ignore_ascii_case(extension))
|
||||
}
|
||||
|
||||
pub fn content_size_limit_bytes(&self) -> usize {
|
||||
self.search
|
||||
.content_size_limit_mb
|
||||
.saturating_mul(1024)
|
||||
.saturating_mul(1024)
|
||||
}
|
||||
|
||||
pub fn crawler_timeout(&self) -> std::time::Duration {
|
||||
std::time::Duration::from_secs(self.search.crawler_timeout_seconds)
|
||||
}
|
||||
|
||||
pub fn crawl_delay(&self) -> std::time::Duration {
|
||||
std::time::Duration::from_millis(self.crawler.crawl_delay_ms)
|
||||
}
|
||||
|
||||
pub fn crawl_interval(&self) -> std::time::Duration {
|
||||
std::time::Duration::from_secs(self.search.crawl_interval_hours * 3600)
|
||||
}
|
||||
|
||||
pub fn index_rebuild_interval(&self) -> std::time::Duration {
|
||||
std::time::Duration::from_secs(self.search.index_rebuild_interval_hours * 3600)
|
||||
}
|
||||
}
|
||||
706
search-engine/src/crawler.rs
Normal file
706
search-engine/src/crawler.rs
Normal file
@@ -0,0 +1,706 @@
|
||||
use anyhow::{Result, Context};
|
||||
use chrono::Utc;
|
||||
use gurt::{GurtClient, GurtClientConfig};
|
||||
use scraper::{Html, Selector};
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::sync::Arc;
|
||||
use tracing::{info, debug, warn, error};
|
||||
use url::Url;
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::models::{Domain, DomainRepository, CrawledPage};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct CrawledPageWithHtml {
|
||||
crawled_page: CrawledPage,
|
||||
original_html: String,
|
||||
}
|
||||
use crate::indexer::SearchEngine;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DomainCrawler {
|
||||
config: Config,
|
||||
gurt_client: GurtClient,
|
||||
domain_repo: DomainRepository,
|
||||
search_engine: Arc<SearchEngine>,
|
||||
}
|
||||
|
||||
impl DomainCrawler {
|
||||
pub async fn new(config: Config, domain_repo: DomainRepository, search_engine: Arc<SearchEngine>) -> Result<Self> {
|
||||
// Fetch the Gurted CA certificate from the DNS server
|
||||
let ca_cert = Self::fetch_ca_certificate().await
|
||||
.context("Failed to fetch Gurted CA certificate")?;
|
||||
|
||||
let gurt_config = GurtClientConfig {
|
||||
request_timeout: config.crawler_timeout(),
|
||||
user_agent: config.search.crawler_user_agent.clone(),
|
||||
max_redirects: config.crawler.max_redirects,
|
||||
custom_ca_certificates: vec![ca_cert],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let gurt_client = GurtClient::with_config(gurt_config);
|
||||
|
||||
Ok(Self {
|
||||
config,
|
||||
gurt_client,
|
||||
domain_repo,
|
||||
search_engine,
|
||||
})
|
||||
}
|
||||
|
||||
async fn fetch_ca_certificate() -> Result<String> {
|
||||
// Use GurtClient's DNS server configuration to build the HTTP URL
|
||||
let dns_ip = GurtClientConfig::default().dns_server_ip;
|
||||
|
||||
// The HTTP bootstrap server runs on port 8876 (hardcoded in DNS server)
|
||||
let http_url = format!("http://{}:8876/ca/root", dns_ip);
|
||||
|
||||
let response = reqwest::get(&http_url).await
|
||||
.context("Failed to fetch CA certificate from HTTP bootstrap server")?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow::anyhow!("Failed to fetch CA certificate: HTTP {}", response.status()));
|
||||
}
|
||||
|
||||
let ca_cert = response.text().await
|
||||
.context("Failed to read CA certificate response")?;
|
||||
|
||||
if ca_cert.trim().is_empty() {
|
||||
return Err(anyhow::anyhow!("Received empty CA certificate"));
|
||||
}
|
||||
|
||||
Ok(ca_cert)
|
||||
}
|
||||
|
||||
pub async fn crawl_domain(&self, domain: &Domain) -> Result<CrawlStats> {
|
||||
info!("Starting crawl for domain: {}", domain.full_domain());
|
||||
|
||||
let start_time = std::time::Instant::now();
|
||||
let mut stats = CrawlStats::new();
|
||||
|
||||
self.domain_repo
|
||||
.update_crawl_status(domain.id, "crawling", None, None, None)
|
||||
.await
|
||||
.context("Failed to update crawl status to crawling")?;
|
||||
|
||||
let result = self.crawl_domain_internal(domain, &mut stats).await;
|
||||
|
||||
let duration = start_time.elapsed();
|
||||
stats.duration_seconds = duration.as_secs();
|
||||
|
||||
match result {
|
||||
Ok(()) => {
|
||||
info!(
|
||||
"Successfully crawled domain {} - {} pages found, {} indexed in {:.2}s",
|
||||
domain.full_domain(),
|
||||
stats.pages_found,
|
||||
stats.pages_indexed,
|
||||
duration.as_secs_f64()
|
||||
);
|
||||
|
||||
self.domain_repo
|
||||
.update_crawl_status(
|
||||
domain.id,
|
||||
"completed",
|
||||
None,
|
||||
Some(stats.pages_found as i32),
|
||||
Some(self.config.search.crawl_interval_hours as i64),
|
||||
)
|
||||
.await
|
||||
.context("Failed to update crawl status to completed")?;
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to crawl domain {}: {}",
|
||||
domain.full_domain(),
|
||||
e
|
||||
);
|
||||
|
||||
self.domain_repo
|
||||
.update_crawl_status(
|
||||
domain.id,
|
||||
"failed",
|
||||
Some(&e.to_string()),
|
||||
Some(stats.pages_found as i32),
|
||||
Some(24),
|
||||
)
|
||||
.await
|
||||
.context("Failed to update crawl status to failed")?;
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(stats)
|
||||
}
|
||||
|
||||
async fn check_clanker_txt(&self, base_url: &str) -> Result<Vec<String>> {
|
||||
let clanker_url = format!("{}/clanker.txt", base_url);
|
||||
debug!("Checking clanker.txt at: {}", clanker_url);
|
||||
|
||||
match self.gurt_client.get(&clanker_url).await {
|
||||
Ok(response) => {
|
||||
if response.status_code == 200 {
|
||||
let content = String::from_utf8_lossy(&response.body);
|
||||
let urls = self.parse_clanker_txt(&content, base_url)?;
|
||||
debug!("Found {} allowed URLs in clanker.txt", urls.len());
|
||||
return Ok(urls);
|
||||
}
|
||||
// If clanker.txt doesn't exist (404), crawling is allowed
|
||||
Ok(vec![])
|
||||
}
|
||||
Err(_) => {
|
||||
// If we can't fetch clanker.txt, assume crawling is allowed
|
||||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_clanker_txt(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
|
||||
let user_agent = &self.config.search.crawler_user_agent;
|
||||
let mut disallow_all = false;
|
||||
let mut user_agent_matches = false;
|
||||
let mut allowed_urls = Vec::new();
|
||||
|
||||
for line in content.lines() {
|
||||
let line = line.trim();
|
||||
if line.is_empty() || line.starts_with('#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(user_agent_value) = line.to_lowercase().strip_prefix("user-agent:") {
|
||||
let current_user_agent = user_agent_value.trim().to_string();
|
||||
user_agent_matches = current_user_agent == "*" || current_user_agent.eq_ignore_ascii_case(user_agent);
|
||||
continue;
|
||||
}
|
||||
|
||||
if user_agent_matches {
|
||||
if let Some(path_value) = line.to_lowercase().strip_prefix("disallow:") {
|
||||
let path = path_value.trim();
|
||||
if path == "/" {
|
||||
disallow_all = true;
|
||||
break;
|
||||
}
|
||||
} else if let Some(path_value) = line.to_lowercase().strip_prefix("allow:") {
|
||||
let path = path_value.trim();
|
||||
if !path.is_empty() {
|
||||
let full_url = format!("{}{}", base_url, path);
|
||||
debug!("Added allowed URL from clanker.txt: {}", full_url);
|
||||
allowed_urls.push(full_url);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if disallow_all {
|
||||
Err(anyhow::anyhow!("Crawling disallowed by clanker.txt"))
|
||||
} else {
|
||||
Ok(allowed_urls)
|
||||
}
|
||||
}
|
||||
|
||||
async fn crawl_domain_internal(&self, domain: &Domain, stats: &mut CrawlStats) -> Result<()> {
|
||||
let base_url = domain.gurt_url();
|
||||
let mut visited_urls = HashSet::new();
|
||||
let mut queue = VecDeque::new();
|
||||
let mut pages_to_index = Vec::new();
|
||||
|
||||
// Check clanker.txt if enabled and get allowed URLs
|
||||
let mut clanker_urls = Vec::new();
|
||||
if self.config.crawler.clanker_txt {
|
||||
match self.check_clanker_txt(&base_url).await {
|
||||
Ok(urls) => {
|
||||
clanker_urls = urls;
|
||||
info!("Found {} URLs in clanker.txt for {}", clanker_urls.len(), domain.full_domain());
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("Clanker.txt check failed for {}: {}", domain.full_domain(), e);
|
||||
return Err(anyhow::anyhow!("Crawling disabled by clanker.txt: {}", e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start with the root URL
|
||||
queue.push_back(CrawlItem {
|
||||
url: base_url.clone(),
|
||||
depth: 0,
|
||||
});
|
||||
|
||||
// Add all URLs from clanker.txt to the queue
|
||||
for url in clanker_urls {
|
||||
if !visited_urls.contains(&url) {
|
||||
queue.push_back(CrawlItem {
|
||||
url: url.clone(),
|
||||
depth: 0, // Treat clanker.txt URLs as root level
|
||||
});
|
||||
debug!("Added clanker.txt URL to queue: {}", url);
|
||||
}
|
||||
}
|
||||
|
||||
while let Some(item) = queue.pop_front() {
|
||||
if visited_urls.contains(&item.url) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if item.depth > self.config.crawler.max_depth {
|
||||
debug!("Skipping URL due to depth limit: {}", item.url);
|
||||
continue;
|
||||
}
|
||||
|
||||
if stats.pages_found >= self.config.search.max_pages_per_domain {
|
||||
info!("Reached page limit for domain: {}", domain.full_domain());
|
||||
break;
|
||||
}
|
||||
|
||||
visited_urls.insert(item.url.clone());
|
||||
stats.pages_found += 1;
|
||||
|
||||
// Add crawl delay between requests
|
||||
if stats.pages_found > 1 {
|
||||
tokio::time::sleep(self.config.crawl_delay()).await;
|
||||
}
|
||||
|
||||
match self.crawl_page(&item.url, domain).await {
|
||||
Ok(Some(page_with_html)) => {
|
||||
// Extract links if not at max depth
|
||||
if item.depth < self.config.crawler.max_depth {
|
||||
if let Ok(links) = self.extract_links(&page_with_html.original_html, &base_url).await {
|
||||
debug!("Found {} links on {}", links.len(), item.url);
|
||||
for link in links {
|
||||
if self.should_crawl_url(&link, domain) {
|
||||
debug!("Adding link to crawl queue: {}", link);
|
||||
queue.push_back(CrawlItem {
|
||||
url: link,
|
||||
depth: item.depth + 1,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pages_to_index.push(page_with_html.crawled_page);
|
||||
stats.pages_indexed += 1;
|
||||
|
||||
// Index in batches
|
||||
if pages_to_index.len() >= 50 {
|
||||
let batch = std::mem::take(&mut pages_to_index);
|
||||
self.search_engine.index_pages(batch).await?;
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
debug!("Skipped page: {}", item.url);
|
||||
stats.pages_skipped += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to crawl page {}: {}", item.url, e);
|
||||
stats.errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Index remaining pages
|
||||
if !pages_to_index.is_empty() {
|
||||
self.search_engine.index_pages(pages_to_index).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn crawl_page(&self, url: &str, domain: &Domain) -> Result<Option<CrawledPageWithHtml>> {
|
||||
debug!("Crawling page: {}", url);
|
||||
|
||||
let response = match self.gurt_client.get(url).await {
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
return Err(anyhow::anyhow!("Failed to fetch URL: {} - {}", url, e));
|
||||
}
|
||||
};
|
||||
|
||||
let status_code = response.status_code;
|
||||
let content_type = response
|
||||
.headers
|
||||
.get("content-type")
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if we should process this content type
|
||||
if let Some(ref ct) = content_type {
|
||||
if !self.is_allowed_content_type(ct) {
|
||||
debug!("Skipping URL with unsupported content type: {} ({})", url, ct);
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
|
||||
if status_code != 200 {
|
||||
return Err(anyhow::anyhow!(
|
||||
"HTTP error {}: {}",
|
||||
status_code,
|
||||
response.status_message
|
||||
));
|
||||
}
|
||||
|
||||
let content_bytes = response.body;
|
||||
|
||||
// Check content size limit
|
||||
if content_bytes.len() > self.config.content_size_limit_bytes() {
|
||||
warn!("Skipping URL due to size limit: {} ({} bytes)", url, content_bytes.len());
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Convert bytes to string
|
||||
let content = String::from_utf8_lossy(&content_bytes);
|
||||
|
||||
// Extract metadata from HTML
|
||||
let title = self.extract_title(&content);
|
||||
let icon = self.extract_icon(&content, url);
|
||||
let description = self.extract_meta_description(&content);
|
||||
let cleaned_content = self.clean_content(&content);
|
||||
|
||||
let page = CrawledPageWithHtml {
|
||||
crawled_page: CrawledPage {
|
||||
url: url.to_string(),
|
||||
domain: domain.full_domain(),
|
||||
title,
|
||||
content: cleaned_content.clone(),
|
||||
content_hash: Self::calculate_content_hash(&cleaned_content),
|
||||
indexed_at: Utc::now(),
|
||||
icon,
|
||||
description,
|
||||
},
|
||||
original_html: content.to_string(),
|
||||
};
|
||||
|
||||
Ok(Some(page))
|
||||
}
|
||||
|
||||
async fn extract_links(&self, content: &str, base_url: &str) -> Result<Vec<String>> {
|
||||
debug!("Extracting links from content length: {} chars", content.len());
|
||||
let document = Html::parse_document(content);
|
||||
let link_selector = Selector::parse("a[href]").unwrap();
|
||||
let base = Url::parse(base_url)?;
|
||||
let mut links = Vec::new();
|
||||
|
||||
let all_links = document.select(&link_selector).collect::<Vec<_>>();
|
||||
debug!("Found {} anchor tags in HTML", all_links.len());
|
||||
|
||||
for element in all_links {
|
||||
if let Some(href) = element.value().attr("href") {
|
||||
// Skip empty links and fragments
|
||||
if href.is_empty() || href.starts_with('#') {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip mailto, tel, javascript links
|
||||
if href.starts_with("mailto:") || href.starts_with("tel:") || href.starts_with("javascript:") {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Resolve relative URLs
|
||||
match base.join(href) {
|
||||
Ok(absolute_url) => {
|
||||
let url_str = absolute_url.to_string();
|
||||
|
||||
// Only include GURT protocol URLs for the same domain
|
||||
if url_str.starts_with("gurt://") {
|
||||
if let Ok(parsed) = Url::parse(&url_str) {
|
||||
if let Some(host) = parsed.host_str() {
|
||||
if let Ok(base_parsed) = Url::parse(base_url) {
|
||||
if let Some(base_host) = base_parsed.host_str() {
|
||||
if host == base_host {
|
||||
links.push(url_str);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to resolve URL {}: {}", href, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove duplicates
|
||||
links.sort();
|
||||
links.dedup();
|
||||
|
||||
Ok(links)
|
||||
}
|
||||
|
||||
fn extract_title(&self, content: &str) -> Option<String> {
|
||||
let document = Html::parse_document(content);
|
||||
|
||||
// Try <title> tag first
|
||||
if let Ok(title_selector) = Selector::parse("title") {
|
||||
if let Some(title_element) = document.select(&title_selector).next() {
|
||||
let title_text = title_element.text().collect::<Vec<_>>().join(" ").trim().to_string();
|
||||
if !title_text.is_empty() {
|
||||
return Some(title_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to first <h1>
|
||||
if let Ok(h1_selector) = Selector::parse("h1") {
|
||||
if let Some(h1_element) = document.select(&h1_selector).next() {
|
||||
let h1_text = h1_element.text().collect::<Vec<_>>().join(" ").trim().to_string();
|
||||
if !h1_text.is_empty() {
|
||||
return Some(h1_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn extract_icon(&self, content: &str, base_url: &str) -> Option<String> {
|
||||
let document = Html::parse_document(content);
|
||||
|
||||
// Try to find icon tag first (custom GURT standard)
|
||||
if let Ok(icon_selector) = Selector::parse("icon") {
|
||||
if let Some(icon_element) = document.select(&icon_selector).next() {
|
||||
if let Some(src) = icon_element.value().attr("src") {
|
||||
return Some(src.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to standard link rel="icon" or link rel="shortcut icon"
|
||||
if let Ok(link_selector) = Selector::parse("link[rel~=\"icon\"], link[rel=\"shortcut icon\"]") {
|
||||
if let Some(link_element) = document.select(&link_selector).next() {
|
||||
if let Some(href) = link_element.value().attr("href") {
|
||||
// Convert relative URLs to absolute
|
||||
if href.starts_with("http") || href.starts_with("gurt") {
|
||||
return Some(href.to_string());
|
||||
} else if let Ok(base) = Url::parse(base_url) {
|
||||
if let Ok(absolute_url) = base.join(href) {
|
||||
return Some(absolute_url.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn extract_meta_description(&self, content: &str) -> Option<String> {
|
||||
let document = Html::parse_document(content);
|
||||
|
||||
// Look for meta name="description"
|
||||
if let Ok(meta_selector) = Selector::parse("meta[name=\"description\"]") {
|
||||
if let Some(meta_element) = document.select(&meta_selector).next() {
|
||||
if let Some(content_attr) = meta_element.value().attr("content") {
|
||||
let description = content_attr.trim();
|
||||
if !description.is_empty() {
|
||||
return Some(description.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn clean_content(&self, content: &str) -> String {
|
||||
use lol_html::{element, rewrite_str, RewriteStrSettings};
|
||||
|
||||
// First pass: remove script, style, noscript elements
|
||||
let settings = RewriteStrSettings {
|
||||
element_content_handlers: vec![
|
||||
element!("script", |el| {
|
||||
el.remove();
|
||||
Ok(())
|
||||
}),
|
||||
element!("style", |el| {
|
||||
el.remove();
|
||||
Ok(())
|
||||
}),
|
||||
element!("noscript", |el| {
|
||||
el.remove();
|
||||
Ok(())
|
||||
}),
|
||||
],
|
||||
..RewriteStrSettings::default()
|
||||
};
|
||||
|
||||
let cleaned_html = match rewrite_str(content, settings) {
|
||||
Ok(html) => html,
|
||||
Err(_) => content.to_string(),
|
||||
};
|
||||
|
||||
// Second pass: extract text using scraper (already imported)
|
||||
let document = Html::parse_document(&cleaned_html);
|
||||
let text_content = document.root_element()
|
||||
.text()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
|
||||
// Clean up whitespace
|
||||
text_content
|
||||
.split_whitespace()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
}
|
||||
|
||||
/// Decide whether a discovered URL belongs in this domain's crawl queue.
///
/// Accepts only parseable `gurt://` URLs whose host is exactly `domain`'s
/// full name, then applies the configured extension block/allow lists to
/// the final path segment. Extension filtering only applies when that
/// segment actually contains a dot (i.e. looks like "file.ext").
fn should_crawl_url(&self, url: &str, domain: &Domain) -> bool {
    let parsed_url = match Url::parse(url) {
        Ok(u) => u,
        Err(_) => return false,
    };

    // Only the GURT protocol is crawlable.
    if parsed_url.scheme() != "gurt" {
        return false;
    }

    // Stay on the domain being crawled; reject host-less URLs outright.
    match parsed_url.host_str() {
        Some(host) if host == domain.full_domain() => {}
        _ => return false,
    }

    // Extension filtering on the last path segment, e.g. "page.png".
    if let Some(segment) = parsed_url.path().split('/').last() {
        if let Some(extension) = segment.split('.').last() {
            if segment.contains('.') && extension != segment {
                if self.config.is_blocked_extension(extension) {
                    return false;
                }
                // An empty allow-list means "allow everything not blocked".
                let allowed = &self.config.search.allowed_extensions;
                if !allowed.is_empty() && !self.config.is_allowed_extension(extension) {
                    return false;
                }
            }
        }
    }

    true
}
|
||||
|
||||
fn is_allowed_content_type(&self, content_type: &str) -> bool {
|
||||
let ct_lower = content_type.to_lowercase();
|
||||
|
||||
if ct_lower.contains("text/html") || ct_lower.contains("application/xhtml") {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ct_lower.contains("text/plain") {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ct_lower.contains("text/markdown") || ct_lower.contains("application/json") {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn calculate_content_hash(content: &str) -> String {
|
||||
use sha2::{Sha256, Digest};
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(content.as_bytes());
|
||||
format!("{:x}", hasher.finalize())
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn run_crawl_all(config: Config) -> Result<()> {
|
||||
info!("Starting crawl of all registered domains");
|
||||
|
||||
let pool = sqlx::PgPool::connect(&config.database_url()).await
|
||||
.context("Failed to connect to database")?;
|
||||
|
||||
let domain_repo = DomainRepository::new(pool);
|
||||
let search_engine = Arc::new(SearchEngine::new(config.clone())?);
|
||||
let crawler = DomainCrawler::new(config.clone(), domain_repo.clone(), search_engine).await?;
|
||||
|
||||
let domains = domain_repo.get_domains_for_crawling(None).await
|
||||
.context("Failed to fetch domains for crawling")?;
|
||||
|
||||
if domains.is_empty() {
|
||||
info!("No domains found that need crawling");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!("Found {} domains to crawl", domains.len());
|
||||
|
||||
let mut total_stats = CrawlStats::new();
|
||||
let max_concurrent = config.search.max_concurrent_crawls;
|
||||
|
||||
let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(max_concurrent));
|
||||
let mut tasks = Vec::new();
|
||||
|
||||
for domain in domains {
|
||||
let crawler = Arc::new(crawler.clone());
|
||||
let permit = semaphore.clone().acquire_owned().await
|
||||
.context("Failed to acquire semaphore permit")?;
|
||||
|
||||
let task = tokio::spawn(async move {
|
||||
let _permit = permit; // Keep permit alive
|
||||
crawler.crawl_domain(&domain).await
|
||||
});
|
||||
|
||||
tasks.push(task);
|
||||
}
|
||||
|
||||
for task in tasks {
|
||||
match task.await {
|
||||
Ok(Ok(stats)) => {
|
||||
total_stats.pages_found += stats.pages_found;
|
||||
total_stats.pages_indexed += stats.pages_indexed;
|
||||
total_stats.pages_skipped += stats.pages_skipped;
|
||||
total_stats.errors += stats.errors;
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
error!("Crawl task failed: {}", e);
|
||||
total_stats.errors += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Task join error: {}", e);
|
||||
total_stats.errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Crawl completed - {} pages found, {} indexed, {} skipped, {} errors",
|
||||
total_stats.pages_found,
|
||||
total_stats.pages_indexed,
|
||||
total_stats.pages_skipped,
|
||||
total_stats.errors
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A single unit of work on the crawl frontier.
#[derive(Debug, Clone)]
struct CrawlItem {
    /// Absolute URL to fetch.
    url: String,
    /// Link-hop distance from the crawl seed — presumably used to enforce
    /// a maximum crawl depth; confirm against the crawl loop.
    depth: usize,
}
|
||||
|
||||
/// Aggregated counters for a crawl run (per-domain or whole-run totals).
#[derive(Debug, Clone)]
pub struct CrawlStats {
    /// Pages discovered during the crawl.
    pub pages_found: usize,
    /// Pages successfully submitted to the search index.
    pub pages_indexed: usize,
    /// Pages skipped (e.g. duplicates or filtered content).
    pub pages_skipped: usize,
    /// Fetch/index/task failures encountered.
    pub errors: usize,
    /// Wall-clock duration of the run, in seconds.
    /// NOTE(review): verify every producer actually populates this field.
    pub duration_seconds: u64,
}
|
||||
|
||||
impl CrawlStats {
    /// All counters start at zero; callers accumulate into the fields.
    fn new() -> Self {
        Self {
            pages_found: 0,
            pages_indexed: 0,
            pages_skipped: 0,
            errors: 0,
            duration_seconds: 0,
        }
    }
}
|
||||
405
search-engine/src/indexer.rs
Normal file
405
search-engine/src/indexer.rs
Normal file
@@ -0,0 +1,405 @@
|
||||
use anyhow::{Result, Context};
|
||||
use chrono::{DateTime, Utc};
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING, TEXT};
|
||||
use tantivy::{doc, Index, IndexReader, IndexWriter, ReloadPolicy, Term, TantivyDocument};
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::tokenizer::*;
|
||||
use tantivy::schema::Value;
|
||||
use tracing::{info, debug};
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::models::{SearchResult, SearchResponse, IndexStats, CrawledPage};
|
||||
|
||||
/// Tantivy-backed full-text index over crawled GURT pages.
///
/// Owns the on-disk index, a long-lived reader for queries, and the schema
/// used to address fields. Writers are created per indexing batch.
pub struct SearchEngine {
    /// Application configuration (index path, search limits, ...).
    config: Config,
    /// The tantivy index rooted at `config.search.index_path`.
    index: Index,
    /// Shared reader; reloads shortly after each commit (OnCommitWithDelay).
    reader: IndexReader,
    /// Field schema used to look up field handles by name.
    schema: Schema,
}
|
||||
|
||||
impl SearchEngine {
|
||||
/// Open (or create) the on-disk tantivy index and build a `SearchEngine`.
///
/// The index directory is created if missing; an existing index is
/// detected by the presence of `meta.json`. Registers the custom
/// "gurted_text" tokenizer (simple tokens, max 40 chars, lowercased,
/// English stop words removed) and sets up a reader that reloads shortly
/// after each commit.
pub fn new(config: Config) -> Result<Self> {
    let index_path = &config.search.index_path;

    // Ensure the directory exists before tantivy touches it.
    std::fs::create_dir_all(index_path)
        .with_context(|| format!("Failed to create index directory: {:?}", index_path))?;

    let schema = build_schema();

    // meta.json marks an already-initialized tantivy index directory.
    let index = if index_path.join("meta.json").exists() {
        info!("Loading existing search index from {:?}", index_path);
        Index::open_in_dir(index_path)
            .with_context(|| format!("Failed to open existing index at {:?}", index_path))?
    } else {
        info!("Creating new search index at {:?}", index_path);
        Index::create_in_dir(index_path, schema.clone())
            .with_context(|| format!("Failed to create new index at {:?}", index_path))?
    };

    // Configure tokenizers.
    // NOTE(review): "gurted_text" is registered here but build_schema never
    // assigns it to a field, so TEXT fields still use tantivy's default
    // tokenizer — confirm whether this is intentional future-proofing.
    let tokenizer_manager = index.tokenizers();
    tokenizer_manager.register(
        "gurted_text",
        TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .filter(StopWordFilter::new(Language::English).unwrap())
            .build(),
    );

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommitWithDelay)
        .try_into()
        .context("Failed to create index reader")?;

    Ok(Self {
        config,
        index,
        reader,
        schema,
    })
}
|
||||
|
||||
/// Add (or refresh) a batch of crawled pages in the index.
///
/// For each page: skips it when an indexed copy with the same content hash
/// already exists, otherwise deletes any previous document for that URL
/// and writes a new one. Commits every 100 documents and once at the end.
/// Returns the number of documents actually (re)indexed.
pub async fn index_pages(&self, pages: Vec<CrawledPage>) -> Result<usize> {
    if pages.is_empty() {
        return Ok(0);
    }

    let start_time = Instant::now();
    let mut writer = self.get_writer()?;
    let mut indexed_count = 0;
    let mut duplicate_count = 0;

    // Resolve field handles once, outside the loop.
    let url_field = self.schema.get_field("url").unwrap();
    let title_field = self.schema.get_field("title").unwrap();
    let content_field = self.schema.get_field("content").unwrap();
    let preview_field = self.schema.get_field("preview").unwrap();
    let domain_field = self.schema.get_field("domain").unwrap();
    let indexed_at_field = self.schema.get_field("indexed_at").unwrap();
    let content_hash_field = self.schema.get_field("content_hash").unwrap();
    let icon_field = self.schema.get_field("icon").unwrap();
    let description_field = self.schema.get_field("description").unwrap();

    info!("Indexing {} pages...", pages.len());

    for page in pages {
        // Check for duplicates (always enabled): same URL with the same
        // content hash means nothing changed since the last crawl.
        if let Ok(existing_hash) = self.get_document_hash(&page.url).await {
            if existing_hash == page.content_hash {
                duplicate_count += 1;
                continue;
            }
        }

        // Remove existing document for this URL so the index never holds
        // two documents for the same page.
        let url_term = Term::from_field_text(url_field, &page.url);
        writer.delete_term(url_term);

        let preview = page.generate_preview(500);
        let title = page.title.unwrap_or_else(|| extract_title_from_content(&page.content));

        // Add new document
        writer.add_document(doc!(
            url_field => page.url.clone(),
            title_field => title,
            content_field => page.content.clone(),
            preview_field => preview,
            domain_field => page.domain.clone(),
            indexed_at_field => page.indexed_at.timestamp(),
            content_hash_field => page.content_hash.clone(),
            icon_field => page.icon.unwrap_or_default(),
            description_field => page.description.unwrap_or_default()
        ))?;

        indexed_count += 1;

        // Commit in batches to bound writer memory and make progress
        // visible; a committed writer must be replaced with a fresh one.
        if indexed_count % 100 == 0 {
            writer.commit()
                .context("Failed to commit batch of documents")?;
            writer = self.get_writer()?; // Get new writer after commit

            let elapsed = start_time.elapsed().as_secs_f64();
            let rate = indexed_count as f64 / elapsed;
            info!("Indexed {} pages ({:.1} pages/sec)", indexed_count, rate);
        }
    }

    // Final commit
    writer.commit().context("Failed to commit final batch")?;

    let total_time = start_time.elapsed();
    info!(
        "Indexing completed: {} pages indexed, {} duplicates skipped in {:.2}s",
        indexed_count,
        duplicate_count,
        total_time.as_secs_f64()
    );

    Ok(indexed_count)
}
|
||||
|
||||
/// Execute a full-text query and return up to `limit` scored hits.
///
/// The query is parsed against the `title` and `content` fields; each
/// hit's display values (preview, domain, icon, ...) are read back from
/// the stored fields. Empty stored `icon`/`description` values are
/// normalized to `None`.
pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
    let start_time = Instant::now();
    let searcher = self.reader.searcher();

    // Field handles for reading stored values off each hit.
    let url_field = self.schema.get_field("url").unwrap();
    let title_field = self.schema.get_field("title").unwrap();
    let content_field = self.schema.get_field("content").unwrap();
    let preview_field = self.schema.get_field("preview").unwrap();
    let domain_field = self.schema.get_field("domain").unwrap();
    let indexed_at_field = self.schema.get_field("indexed_at").unwrap();
    let icon_field = self.schema.get_field("icon").unwrap();
    let description_field = self.schema.get_field("description").unwrap();

    // Create query parser for title and content fields
    let query_parser = QueryParser::for_index(
        &self.index,
        vec![title_field, content_field]
    );

    let parsed_query = query_parser
        .parse_query(query)
        .with_context(|| format!("Failed to parse query: {}", query))?;

    let top_docs = searcher
        .search(&parsed_query, &TopDocs::with_limit(limit))
        .context("Search query execution failed")?;

    let mut results = Vec::new();

    for (score, doc_address) in top_docs {
        let doc: TantivyDocument = searcher.doc(doc_address)?;

        let url = doc.get_first(url_field)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();

        let title = doc.get_first(title_field)
            .and_then(|v| v.as_str())
            .unwrap_or("Untitled")
            .to_string();

        let preview = doc.get_first(preview_field)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();

        let domain = doc.get_first(domain_field)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();

        // Stored as a Unix timestamp; invalid values fall back to "now".
        let indexed_at_timestamp = doc.get_first(indexed_at_field)
            .and_then(|v| v.as_i64())
            .unwrap_or(0);

        let indexed_at = DateTime::from_timestamp(indexed_at_timestamp, 0)
            .unwrap_or_else(|| Utc::now());

        // Optional fields are indexed as "" when absent; map back to None.
        let icon = doc.get_first(icon_field)
            .and_then(|v| v.as_str())
            .filter(|s| !s.is_empty())
            .map(|s| s.to_string());

        let description = doc.get_first(description_field)
            .and_then(|v| v.as_str())
            .filter(|s| !s.is_empty())
            .map(|s| s.to_string());

        results.push(SearchResult {
            url,
            title,
            preview,
            domain,
            score,
            indexed_at,
            icon,
            description,
        });
    }

    let search_time = start_time.elapsed();
    debug!(
        "Search completed: {} results for '{}' in {:.2}ms",
        results.len(),
        query,
        search_time.as_millis()
    );

    Ok(results)
}
|
||||
|
||||
/// Paginated wrapper around [`search`].
///
/// Fetches the first `offset + limit` hits and slices out the requested
/// 1-based page (`page` 0 behaves like page 1). `per_page` is capped by
/// `max_search_results` for the fetch size.
///
/// NOTE(review): `total_results` is filled with the document count of the
/// whole index, not the number of hits for `query` — confirm API
/// consumers expect that before using it for pagination math.
pub async fn search_with_response(&self, query: &str, page: usize, per_page: usize) -> Result<SearchResponse> {
    let offset = page.saturating_sub(1) * per_page;
    let limit = std::cmp::min(per_page, self.config.search.max_search_results);

    let all_results = self.search(query, offset + limit).await?;
    let results = all_results.into_iter().skip(offset).take(per_page).collect();
    let total_results = self.get_total_document_count().await?;

    Ok(SearchResponse {
        query: query.to_string(),
        results,
        total_results,
        page,
        per_page,
    })
}
|
||||
|
||||
/// Snapshot of index-wide statistics.
///
/// NOTE(review): `total_domains` is always 0 at present — the domain set
/// below is never populated (see the TODO).
pub async fn get_stats(&self) -> Result<IndexStats> {
    let searcher = self.reader.searcher();
    let total_documents = searcher.num_docs() as usize;

    // Count unique domains (simplified approach)
    let domains: HashSet<String> = HashSet::new();
    // TODO: Implement domain counting when needed

    let total_domains = domains.len();

    // Calculate index size (on-disk, in MB)
    let index_size_mb = calculate_directory_size(&self.config.search.index_path)?;

    // Get last update time (approximate — taken from meta.json's mtime)
    let last_updated = get_index_last_modified(&self.config.search.index_path)?;

    Ok(IndexStats {
        total_documents,
        total_domains,
        index_size_mb,
        last_updated,
    })
}
|
||||
|
||||
/// Number of documents currently visible to the index reader.
pub async fn get_total_document_count(&self) -> Result<usize> {
    let count = self.reader.searcher().num_docs();
    Ok(count as usize)
}
|
||||
|
||||
async fn get_document_hash(&self, url: &str) -> Result<String> {
|
||||
let searcher = self.reader.searcher();
|
||||
let url_field = self.schema.get_field("url").unwrap();
|
||||
let content_hash_field = self.schema.get_field("content_hash").unwrap();
|
||||
|
||||
let query_parser = QueryParser::for_index(&self.index, vec![url_field]);
|
||||
let query = query_parser.parse_query(&format!("\"{}\"", url))?;
|
||||
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(1))?;
|
||||
|
||||
if let Some((_, doc_address)) = top_docs.first() {
|
||||
let doc: TantivyDocument = searcher.doc(*doc_address)?;
|
||||
if let Some(hash_value) = doc.get_first(content_hash_field) {
|
||||
if let Some(hash_str) = hash_value.as_str() {
|
||||
return Ok(hash_str.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!("Document not found: {}", url))
|
||||
}
|
||||
|
||||
/// Create a fresh writer for this index (4 threads, 256 MiB heap).
///
/// Callers create one writer per indexing batch and drop it after commit.
fn get_writer(&self) -> Result<IndexWriter> {
    let writer = self.index.writer_with_num_threads(4, 256 * 1024 * 1024); // 256MB buffer
    writer.context("Failed to create index writer")
}
|
||||
}
|
||||
|
||||
pub async fn rebuild_index(config: Config) -> Result<()> {
|
||||
info!("Starting index rebuild...");
|
||||
|
||||
// Remove existing index
|
||||
if config.search.index_path.exists() {
|
||||
std::fs::remove_dir_all(&config.search.index_path)
|
||||
.context("Failed to remove existing index")?;
|
||||
}
|
||||
|
||||
// Create new search engine (which will create a new index)
|
||||
let _search_engine = SearchEngine::new(config)?;
|
||||
|
||||
info!("Index rebuild completed - new empty index created");
|
||||
info!("Run a crawl to populate the index with content");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Define the tantivy schema for crawled pages.
///
/// STRING fields are indexed as single raw tokens (exact match only);
/// TEXT fields are tokenized for full-text search; STORED fields can be
/// read back from hits; FAST fields get columnar storage.
fn build_schema() -> Schema {
    let mut schema_builder = Schema::builder();

    // Exact-match document key; also used to delete/replace on re-index.
    schema_builder.add_text_field("url", STRING | STORED | FAST);
    // Full-text searchable fields (title is also returned in results;
    // content is searchable but not stored).
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("content", TEXT);
    // Stored verbatim for display; not tokenized.
    schema_builder.add_text_field("preview", STRING | STORED);
    schema_builder.add_text_field("domain", STRING | STORED | FAST);
    // Unix timestamp of indexing time.
    schema_builder.add_i64_field("indexed_at", INDEXED | STORED | FAST);
    // SHA-256 of page content, used for duplicate detection.
    schema_builder.add_text_field("content_hash", STRING | STORED);
    schema_builder.add_text_field("icon", STRING | STORED);
    schema_builder.add_text_field("description", STRING | STORED);

    schema_builder.build()
}
|
||||
|
||||
/// Best-effort title for a page: `<title>`, else the first `<h1>`, else
/// the first non-blank line of the raw content, else "Untitled".
fn extract_title_from_content(content: &str) -> String {
    let document = scraper::Html::parse_document(content);

    // Try <title> then <h1>, taking the first with visible text.
    for tag in ["title", "h1"] {
        let selector = scraper::Selector::parse(tag).unwrap();
        if let Some(element) = document.select(&selector).next() {
            let text = element.text().collect::<Vec<_>>().join(" ");
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                return trimmed.to_string();
            }
        }
    }

    // Last resort: first non-empty line of (possibly plain-text) content.
    content
        .lines()
        .find(|line| !line.trim().is_empty())
        .unwrap_or("Untitled")
        .trim()
        .to_string()
}
|
||||
|
||||
/// Total size of all regular files under `path` (recursive), in megabytes.
///
/// Returns 0.0 for a missing or non-directory path; I/O errors while
/// listing entries propagate to the caller.
fn calculate_directory_size(path: &Path) -> Result<f64> {
    // Count raw bytes recursively and convert to MB exactly once at the
    // end. (The previous version converted bytes -> MB -> bytes at every
    // recursion level, which was convoluted and lossy in principle.)
    fn directory_size_bytes(path: &Path) -> Result<u64> {
        let mut total = 0u64;
        if path.is_dir() {
            for entry in std::fs::read_dir(path)? {
                let entry = entry?;
                let metadata = entry.metadata()?;
                if metadata.is_file() {
                    total += metadata.len();
                } else if metadata.is_dir() {
                    total += directory_size_bytes(&entry.path())?;
                }
            }
        }
        Ok(total)
    }

    Ok(directory_size_bytes(path)? as f64 / 1024.0 / 1024.0) // Convert to MB
}
|
||||
|
||||
fn get_index_last_modified(path: &Path) -> Result<DateTime<Utc>> {
|
||||
let meta_path = path.join("meta.json");
|
||||
|
||||
if meta_path.exists() {
|
||||
let metadata = std::fs::metadata(meta_path)?;
|
||||
let modified = metadata.modified()?;
|
||||
let datetime = DateTime::<Utc>::from(modified);
|
||||
Ok(datetime)
|
||||
} else {
|
||||
Ok(Utc::now())
|
||||
}
|
||||
}
|
||||
124
search-engine/src/main.rs
Normal file
124
search-engine/src/main.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
mod config;
|
||||
mod indexer;
|
||||
mod crawler;
|
||||
mod scheduler;
|
||||
mod server;
|
||||
mod models;
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::{Parser, Subcommand};
|
||||
use config::Config;
|
||||
use tracing::info;
|
||||
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
|
||||
|
||||
// Command-line interface definition, parsed by clap's derive macro.
// NOTE: plain `//` comments are used deliberately — `///` doc comments
// on clap-derived items become --help text and would change CLI output.
#[derive(Parser)]
#[command(name = "gurted-search-engine")]
#[command(about = "Crawl and index registered GURT domains")]
#[command(version = "0.1.0")]
struct Cli {
    // Which subcommand to run (see `Commands`).
    #[command(subcommand)]
    command: Commands,

    // Path to the TOML configuration file.
    #[arg(long, default_value = "config.toml")]
    config: String,

    // Repeatable verbosity flag: -v = debug, -vv (or more) = trace.
    #[arg(short, long, action = clap::ArgAction::Count)]
    verbose: u8,
}
|
||||
|
||||
// Available subcommands. (Plain `//` comments on purpose: `///` would be
// picked up by clap as help text and alter CLI output.)
#[derive(Subcommand)]
enum Commands {
    // Run the long-lived search-engine server.
    Server,
    // Perform a one-shot crawl of all registered domains.
    Crawl,
    // Delete and recreate the search index (left empty).
    RebuildIndex,
    // Run a query against the local index and print the hits.
    Search {
        // Full-text query string.
        query: String,
        // Maximum number of hits to print.
        #[arg(short, long, default_value = "10")]
        limit: usize,
    },
    // Print index statistics.
    Stats,
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
init_logging(&cli)?;
|
||||
|
||||
let config = Config::load_from_file(&cli.config).unwrap();
|
||||
|
||||
info!("Starting Gurted Search Engine v{}", env!("CARGO_PKG_VERSION"));
|
||||
info!("Configuration loaded from: {}", cli.config);
|
||||
|
||||
match cli.command {
|
||||
Commands::Server => {
|
||||
info!("Starting search engine server on {}", config.server_bind_address());
|
||||
server::run_server(config).await?;
|
||||
}
|
||||
Commands::Crawl => {
|
||||
info!("Starting one-time crawl of all registered domains");
|
||||
crawler::run_crawl_all(config).await?;
|
||||
}
|
||||
Commands::RebuildIndex => {
|
||||
info!("Rebuilding search index from scratch");
|
||||
indexer::rebuild_index(config).await?;
|
||||
}
|
||||
Commands::Search { query, limit } => {
|
||||
info!("Testing search with query: '{}'", query);
|
||||
test_search(config, query, limit).await?;
|
||||
}
|
||||
Commands::Stats => {
|
||||
info!("Displaying search index statistics");
|
||||
show_stats(config).await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(cli: &Cli) -> Result<()> {
|
||||
let log_level = match cli.verbose {
|
||||
0 => "info",
|
||||
1 => "debug",
|
||||
_ => "trace",
|
||||
};
|
||||
|
||||
tracing_subscriber::registry()
|
||||
.with(
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| format!("gurted_search_engine={}", log_level).into()),
|
||||
)
|
||||
.with(tracing_subscriber::fmt::layer())
|
||||
.init();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn test_search(config: Config, query: String, limit: usize) -> Result<()> {
|
||||
let search_engine = indexer::SearchEngine::new(config)?;
|
||||
let results = search_engine.search(&query, limit).await?;
|
||||
|
||||
println!("Search results for '{}' (showing {} results):",
|
||||
query, results.len());
|
||||
|
||||
for (i, result) in results.iter().enumerate() {
|
||||
println!("{}. {} - {}", i + 1, result.title, result.url);
|
||||
println!(" {}", result.preview);
|
||||
println!();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Print index statistics (document/domain counts, size, last update).
async fn show_stats(config: Config) -> Result<()> {
    let search_engine = indexer::SearchEngine::new(config)?;
    let stats = search_engine.get_stats().await?;

    println!("Search Index Statistics:");
    println!("  Total documents: {}", stats.total_documents);
    println!("  Total domains: {}", stats.total_domains);
    println!("  Index size: {} MB", stats.index_size_mb);
    println!("  Last updated: {}", stats.last_updated);

    Ok(())
}
|
||||
185
search-engine/src/models.rs
Normal file
185
search-engine/src/models.rs
Normal file
@@ -0,0 +1,185 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sqlx::{FromRow, types::chrono::{DateTime, Utc}};
|
||||
|
||||
/// A registered domain row as stored in the `domains` table.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct Domain {
    pub id: i32,
    /// Name label, e.g. the "example" in "example.web".
    pub name: String,
    /// Top-level domain label, e.g. "web".
    pub tld: String,
    /// Owning user, when known.
    pub user_id: Option<i32>,
    /// Registration status; crawl queries in this crate only select rows
    /// where status = 'approved'.
    pub status: Option<String>,
    pub created_at: Option<DateTime<Utc>>,
}
|
||||
|
||||
impl Domain {
|
||||
pub fn full_domain(&self) -> String {
|
||||
format!("{}.{}", self.name, self.tld)
|
||||
}
|
||||
|
||||
pub fn gurt_url(&self) -> String {
|
||||
format!("gurt://{}.{}", self.name, self.tld)
|
||||
}
|
||||
}
|
||||
|
||||
/// One DNS record attached to a domain.
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct DnsRecord {
    pub id: i32,
    /// Owning row in the domains table.
    pub domain_id: i32,
    /// Record kind as text (e.g. "A", "CNAME" — exact set not visible here).
    pub record_type: String,
    pub name: String,
    pub value: String,
    /// Time-to-live; presumably seconds — confirm against the DNS server.
    pub ttl: Option<i32>,
    /// Record priority (e.g. MX-style) — confirm usage with consumers.
    pub priority: Option<i32>,
    pub created_at: Option<DateTime<Utc>>,
}
|
||||
|
||||
/// One search hit, serialized into API responses.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResult {
    pub url: String,
    pub title: String,
    /// Short text excerpt of the page content.
    pub preview: String,
    pub domain: String,
    /// Relevance score from the search backend (higher = better match).
    pub score: f32,
    /// When the page was last indexed.
    pub indexed_at: DateTime<Utc>,
    /// Icon URL, when the page declared one.
    pub icon: Option<String>,
    /// Meta description, when the page declared one.
    pub description: Option<String>,
}
|
||||
|
||||
/// A page of search results plus pagination metadata.
#[derive(Clone, Debug, Serialize)]
pub struct SearchResponse {
    /// The query string as submitted.
    pub query: String,
    pub results: Vec<SearchResult>,
    /// NOTE(review): producers may fill this with the index-wide document
    /// count rather than the per-query hit count — verify before relying
    /// on it for pagination.
    pub total_results: usize,
    /// 1-based page number requested.
    pub page: usize,
    pub per_page: usize,
}
|
||||
|
||||
/// Aggregate statistics about the search index.
#[derive(Clone, Debug, Serialize)]
pub struct IndexStats {
    pub total_documents: usize,
    /// Distinct domains represented in the index.
    pub total_domains: usize,
    /// On-disk size of the index directory, in megabytes.
    pub index_size_mb: f64,
    /// Approximate time of the last index modification.
    pub last_updated: DateTime<Utc>,
}
|
||||
|
||||
/// A fetched page, cleaned and ready to be written to the search index.
#[derive(Clone, Debug)]
pub struct CrawledPage {
    pub url: String,
    /// Host the page belongs to, e.g. "example.web".
    pub domain: String,
    /// Extracted page title, when one was found.
    pub title: Option<String>,
    /// Cleaned text content (markup stripped).
    pub content: String,
    /// Hash of the content, used to skip re-indexing unchanged pages.
    pub content_hash: String,
    /// When the page was crawled/indexed.
    pub indexed_at: DateTime<Utc>,
    /// Icon URL, when the page declared one.
    pub icon: Option<String>,
    /// Meta description, when the page declared one.
    pub description: Option<String>,
}
|
||||
|
||||
impl CrawledPage {
|
||||
pub fn generate_preview(&self, max_len: usize) -> String {
|
||||
let text = self.content.trim();
|
||||
if text.len() <= max_len {
|
||||
text.to_string()
|
||||
} else {
|
||||
let mut preview = text.chars().take(max_len).collect::<String>();
|
||||
if let Some(last_space) = preview.rfind(' ') {
|
||||
preview.truncate(last_space);
|
||||
}
|
||||
preview.push_str("...");
|
||||
preview
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Data-access layer for domains and their crawl bookkeeping, backed by a
/// PostgreSQL connection pool. Cheap to clone (the pool is shared).
#[derive(Clone)]
pub struct DomainRepository {
    pool: sqlx::PgPool,
}
|
||||
|
||||
impl DomainRepository {
|
||||
/// Wrap an existing PostgreSQL connection pool.
pub fn new(pool: sqlx::PgPool) -> Self {
    Self { pool }
}
|
||||
|
||||
/// Fetch approved domains that are due for a crawl, oldest-crawled first.
///
/// A domain is due when it has never been crawled (no status row) or when
/// its `next_crawl_at` has passed while in the completed/failed/pending
/// states. `limit` optionally caps the number of rows returned.
pub async fn get_domains_for_crawling(&self, limit: Option<i32>) -> Result<Vec<Domain>, sqlx::Error> {
    // Single source of truth for the selection; previously the entire
    // statement was duplicated for the LIMIT / no-LIMIT cases.
    const BASE_QUERY: &str =
        "SELECT d.id, d.name, d.tld, d.user_id, d.status, d.created_at
         FROM domains d
         LEFT JOIN domain_crawl_status dcs ON d.id = dcs.domain_id
         WHERE d.status = 'approved'
         AND (dcs.crawl_status IS NULL
              OR (dcs.crawl_status = 'completed' AND dcs.next_crawl_at <= NOW())
              OR (dcs.crawl_status = 'failed' AND dcs.next_crawl_at <= NOW())
              OR (dcs.crawl_status = 'pending' AND dcs.next_crawl_at <= NOW()))
         ORDER BY COALESCE(dcs.last_crawled_at, '1970-01-01'::timestamptz) ASC";

    if let Some(limit) = limit {
        let sql = format!("{} LIMIT $1", BASE_QUERY);
        sqlx::query_as::<_, Domain>(&sql)
            .bind(limit)
            .fetch_all(&self.pool)
            .await
    } else {
        sqlx::query_as::<_, Domain>(BASE_QUERY)
            .fetch_all(&self.pool)
            .await
    }
}
|
||||
|
||||
/// Upsert the crawl bookkeeping row for a domain.
///
/// Semantics encoded in the SQL below:
/// - `last_crawled_at` is only bumped for terminal states
///   ('completed'/'failed');
/// - `pages_found` is only overwritten on 'completed' (and kept when the
///   new value is NULL);
/// - `next_crawl_at` is now + `next_crawl_hours`, and left unchanged when
///   `next_crawl_hours` is `None`.
pub async fn update_crawl_status(
    &self,
    domain_id: i32,
    status: &str,
    error_message: Option<&str>,
    pages_found: Option<i32>,
    next_crawl_hours: Option<i64>
) -> Result<(), sqlx::Error> {
    // None -> NULL bind, which COALESCEs to the existing value in SQL.
    let next_crawl_at = next_crawl_hours
        .map(|hours| chrono::Utc::now() + chrono::Duration::hours(hours));

    sqlx::query(
        "INSERT INTO domain_crawl_status (domain_id, crawl_status, error_message, pages_found, last_crawled_at, next_crawl_at, updated_at)
         VALUES ($1, $2, $3, $4,
         CASE WHEN $2 IN ('completed','failed') THEN NOW() ELSE NULL END,
         $5, NOW())
         ON CONFLICT (domain_id)
         DO UPDATE SET
         crawl_status = $2,
         error_message = $3,
         pages_found = CASE WHEN $2 = 'completed'
         THEN COALESCE($4, domain_crawl_status.pages_found)
         ELSE domain_crawl_status.pages_found END,
         last_crawled_at = CASE WHEN $2 IN ('completed','failed')
         THEN NOW()
         ELSE domain_crawl_status.last_crawled_at END,
         next_crawl_at = COALESCE($5, domain_crawl_status.next_crawl_at),
         updated_at = NOW()"
    )
    .bind(domain_id)
    .bind(status)
    .bind(error_message)
    .bind(pages_found)
    .bind(next_crawl_at)
    .execute(&self.pool)
    .await?;

    Ok(())
}
|
||||
}
|
||||
|
||||
/// Row of the `domain_crawl_status` bookkeeping table (one per domain).
#[derive(Clone, Debug, Deserialize, Serialize, FromRow)]
pub struct CrawlStatus {
    pub domain_id: i32,
    /// Last time a crawl reached a terminal state (completed/failed).
    pub last_crawled_at: Option<DateTime<Utc>>,
    /// When the domain next becomes due for crawling.
    pub next_crawl_at: Option<DateTime<Utc>>,
    /// Current state: NULL, 'pending', 'completed' or 'failed', per the
    /// queries in this file.
    pub crawl_status: Option<String>,
    /// Failure detail from the last crawl attempt, if any.
    pub error_message: Option<String>,
    /// Page count recorded by the last successful crawl.
    pub pages_found: Option<i32>,
    pub updated_at: Option<DateTime<Utc>>,
}
|
||||
152
search-engine/src/scheduler.rs
Normal file
152
search-engine/src/scheduler.rs
Normal file
@@ -0,0 +1,152 @@
|
||||
use anyhow::{Result, Context};
|
||||
use tokio::time::{interval, Instant};
|
||||
use tracing::{info, error};
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::crawler::run_crawl_all;
|
||||
use crate::indexer::SearchEngine;
|
||||
|
||||
/// Long-running driver that periodically triggers crawls and index rebuilds.
pub struct CrawlScheduler {
    /// Application configuration (intervals come from `config.search`).
    config: Config,
}
|
||||
|
||||
impl CrawlScheduler {
|
||||
/// Build a scheduler around the given configuration.
pub fn new(config: Config) -> Self {
    Self { config }
}
|
||||
|
||||
pub async fn start(&self) -> Result<()> {
|
||||
info!("Starting crawl scheduler");
|
||||
info!(
|
||||
"Crawl interval: {} hours ({} seconds)",
|
||||
self.config.search.crawl_interval_hours,
|
||||
self.config.crawl_interval().as_secs()
|
||||
);
|
||||
info!(
|
||||
"Index rebuild interval: {} hours ({} seconds)",
|
||||
self.config.search.index_rebuild_interval_hours,
|
||||
self.config.index_rebuild_interval().as_secs()
|
||||
);
|
||||
|
||||
let mut crawl_interval = interval(self.config.crawl_interval());
|
||||
let mut index_rebuild_interval = interval(self.config.index_rebuild_interval());
|
||||
|
||||
crawl_interval.tick().await;
|
||||
index_rebuild_interval.tick().await;
|
||||
|
||||
info!("Running initial crawl...");
|
||||
if let Err(e) = self.run_scheduled_crawl().await {
|
||||
error!("Initial crawl failed: {}", e);
|
||||
error!("Error details: {:?}", e);
|
||||
|
||||
// Log the error chain
|
||||
let mut source = e.source();
|
||||
let mut depth = 1;
|
||||
while let Some(err) = source {
|
||||
error!(" Caused by ({}): {}", depth, err);
|
||||
source = err.source();
|
||||
depth += 1;
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = crawl_interval.tick() => {
|
||||
info!("Running scheduled crawl");
|
||||
if let Err(e) = self.run_scheduled_crawl().await {
|
||||
error!("Scheduled crawl failed: {}", e);
|
||||
error!("Error details: {:?}", e);
|
||||
|
||||
// Log the error chain
|
||||
let mut source = e.source();
|
||||
let mut depth = 1;
|
||||
while let Some(err) = source {
|
||||
error!(" Caused by ({}): {}", depth, err);
|
||||
source = err.source();
|
||||
depth += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ = index_rebuild_interval.tick() => {
|
||||
info!("Running scheduled index rebuild");
|
||||
if let Err(e) = self.run_scheduled_index_rebuild().await {
|
||||
error!("Scheduled index rebuild failed: {}", e);
|
||||
}
|
||||
}
|
||||
_ = tokio::signal::ctrl_c() => {
|
||||
info!("Received shutdown signal, stopping scheduler");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Crawl scheduler stopped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_scheduled_crawl(&self) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
run_crawl_all(self.config.clone()).await
|
||||
.context("Scheduled crawl failed")?;
|
||||
|
||||
let duration = start_time.elapsed();
|
||||
info!("Scheduled crawl completed in {:.2} seconds", duration.as_secs_f64());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_scheduled_index_rebuild(&self) -> Result<()> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let search_engine = SearchEngine::new(self.config.clone())?;
|
||||
let stats_before = search_engine.get_stats().await?;
|
||||
|
||||
info!(
|
||||
"Starting index rebuild - current index has {} documents from {} domains ({:.2} MB)",
|
||||
stats_before.total_documents,
|
||||
stats_before.total_domains,
|
||||
stats_before.index_size_mb
|
||||
);
|
||||
|
||||
crate::indexer::rebuild_index(self.config.clone()).await
|
||||
.context("Index rebuild failed")?;
|
||||
|
||||
info!("Repopulating rebuilt index with fresh crawl");
|
||||
run_crawl_all(self.config.clone()).await
|
||||
.context("Post-rebuild crawl failed")?;
|
||||
|
||||
let duration = start_time.elapsed();
|
||||
|
||||
let new_search_engine = SearchEngine::new(self.config.clone())?;
|
||||
let stats_after = new_search_engine.get_stats().await?;
|
||||
|
||||
info!(
|
||||
"Index rebuild completed in {:.2} seconds - new index has {} documents from {} domains ({:.2} MB)",
|
||||
duration.as_secs_f64(),
|
||||
stats_after.total_documents,
|
||||
stats_after.total_domains,
|
||||
stats_after.index_size_mb
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Thin wrapper that runs a `CrawlScheduler` on a spawned tokio task.
pub struct BackgroundScheduler {
    // Moved into the spawned task when `start` is called.
    scheduler: CrawlScheduler,
}
|
||||
|
||||
impl BackgroundScheduler {
|
||||
pub fn new(config: Config) -> Self {
|
||||
Self {
|
||||
scheduler: CrawlScheduler::new(config),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(self) -> tokio::task::JoinHandle<Result<()>> {
|
||||
tokio::spawn(async move {
|
||||
self.scheduler.start().await
|
||||
})
|
||||
}
|
||||
}
|
||||
195
search-engine/src/server.rs
Normal file
195
search-engine/src/server.rs
Normal file
@@ -0,0 +1,195 @@
|
||||
use anyhow::{Result, Context};
|
||||
use gurt::prelude::*;
|
||||
use gurt::GurtError;
|
||||
use serde_json::json;
|
||||
use std::sync::Arc;
|
||||
use tracing::{info, error};
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::indexer::SearchEngine;
|
||||
use crate::scheduler::BackgroundScheduler;
|
||||
|
||||
/// GURT-protocol server exposing the search UI pages and the JSON search API.
pub struct SearchServer {
    config: Config,
    // Shared with every route-handler closure, hence the Arc.
    search_engine: Arc<SearchEngine>,
}
|
||||
|
||||
impl SearchServer {
    /// Construct the server: verifies the database is reachable and opens
    /// the search index.
    pub async fn new(config: Config) -> Result<Self> {
        // Connect to database
        // NOTE(review): the pool is dropped immediately, so this only acts as
        // a connectivity check — confirm no handler was meant to keep it.
        sqlx::PgPool::connect(&config.database_url()).await
            .context("Failed to connect to database")?;

        let search_engine = Arc::new(SearchEngine::new(config.clone())?);

        Ok(Self {
            config,
            search_engine,
        })
    }

    /// Start the background crawl scheduler, register all routes, and serve
    /// GURT requests until the listener returns (normally only on error).
    pub async fn run(self) -> Result<()> {
        info!("Starting GURT search server on {}", self.config.server_bind_address());

        // Scheduler runs on its own task; the handle is intentionally dropped
        // (the task keeps running detached).
        let scheduler_config = self.config.clone();
        let _scheduler_handle = BackgroundScheduler::new(scheduler_config).start();
        info!("Background crawler scheduler started");

        let server = GurtServer::with_tls_certificates(
            &self.config.server.cert_path.to_string_lossy(),
            &self.config.server.key_path.to_string_lossy()
        )?;

        let search_engine = self.search_engine.clone();
        let config = self.config.clone();

        let server = server
            // HTML-facing search endpoint; each request clones the shared
            // engine/config so the async block can own them.
            .get("/search", {
                let search_engine = search_engine.clone();
                let config = config.clone();
                move |ctx| {
                    let search_engine = search_engine.clone();
                    let config = config.clone();
                    let path = ctx.path().to_string();
                    async move {
                        handle_search(path, search_engine, config).await
                    }
                }
            })
            // JSON API endpoint with pagination; wildcard captures the query string.
            .get("/api/search*", {
                let search_engine = search_engine.clone();
                let config = config.clone();
                move |ctx| {
                    let search_engine = search_engine.clone();
                    let config = config.clone();

                    let path = ctx.path().to_string();
                    async move {
                        handle_api_search(path, search_engine, config).await
                    }
                }
            })
            // Frontend assets are baked into the binary at compile time.
            .get("/", {
                move |_ctx| async {
                    Ok(GurtResponse::ok().with_string_body(include_str!("../frontend/search.html")))
                }
            })
            .get("/search.lua", {
                move |_ctx| async {
                    Ok(GurtResponse::ok().with_string_body(include_str!("../frontend/search.lua")))
                }
            })
            .get("/health", |_ctx| async {
                Ok(GurtResponse::ok().with_json_body(&json!({"status": "healthy"}))?)
            })
            // Debug route: echoes the raw request path back to the caller.
            .get("/test*", |ctx| {
                let path = ctx.path().to_string();
                async move {
                    println!("Test request path: '{}'", path);
                    Ok(GurtResponse::ok().with_string_body(format!("Path received: {}", path)))
                }
            });

        info!("GURT search server listening on {}", self.config.gurt_protocol_url());
        server.listen(&self.config.server_bind_address()).await.map_err(|e| anyhow::anyhow!("GURT server error: {}", e))
    }
}
|
||||
|
||||
pub async fn run_server(config: Config) -> Result<()> {
|
||||
let server = SearchServer::new(config).await?;
|
||||
server.run().await
|
||||
}
|
||||
|
||||
fn parse_query_param(path: &str, param: &str) -> String {
|
||||
let param_with_eq = format!("{}=", param);
|
||||
if let Some(start) = path.find(&format!("?{}", param_with_eq)) {
|
||||
let start_pos = start + 1 + param_with_eq.len(); // Skip the '?' and 'param='
|
||||
let query_part = &path[start_pos..];
|
||||
let end_pos = query_part.find('&').unwrap_or(query_part.len());
|
||||
urlencoding::decode(&query_part[..end_pos]).unwrap_or_default().to_string()
|
||||
} else if let Some(start) = path.find(&format!("&{}", param_with_eq)) {
|
||||
let start_pos = start + 1 + param_with_eq.len(); // Skip the '&' and 'param='
|
||||
let query_part = &path[start_pos..];
|
||||
let end_pos = query_part.find('&').unwrap_or(query_part.len());
|
||||
urlencoding::decode(&query_part[..end_pos]).unwrap_or_default().to_string()
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_query_param_usize(path: &str, param: &str) -> Option<usize> {
|
||||
let value = parse_query_param(path, param);
|
||||
if value.is_empty() { None } else { value.parse().ok() }
|
||||
}
|
||||
|
||||
async fn handle_search(
|
||||
path: String,
|
||||
search_engine: Arc<SearchEngine>,
|
||||
config: Config
|
||||
) -> Result<GurtResponse, GurtError> {
|
||||
let query = parse_query_param(&path, "q");
|
||||
|
||||
if query.is_empty() {
|
||||
return Ok(GurtResponse::bad_request()
|
||||
.with_json_body(&json!({"error": "Query parameter 'q' is required"}))?);
|
||||
}
|
||||
|
||||
println!("Search query: '{}'", query);
|
||||
|
||||
let limit = parse_query_param_usize(&path, "limit")
|
||||
.unwrap_or(config.search.search_results_per_page)
|
||||
.min(config.search.max_search_results);
|
||||
|
||||
match search_engine.search(&query, limit).await {
|
||||
Ok(results) => {
|
||||
let response = json!({
|
||||
"query": query,
|
||||
"results": results,
|
||||
"count": results.len()
|
||||
});
|
||||
|
||||
Ok(GurtResponse::ok()
|
||||
.with_header("content-type", "application/json")
|
||||
.with_json_body(&response)?)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Search failed: {}", e);
|
||||
Ok(GurtResponse::internal_server_error()
|
||||
.with_json_body(&json!({"error": "Search failed", "details": e.to_string()}))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_api_search(
|
||||
path: String,
|
||||
search_engine: Arc<SearchEngine>,
|
||||
config: Config
|
||||
) -> Result<GurtResponse, GurtError> {
|
||||
let query = parse_query_param(&path, "q");
|
||||
|
||||
if query.is_empty() {
|
||||
return Ok(GurtResponse::bad_request()
|
||||
.with_json_body(&json!({"error": "Query parameter 'q' is required"}))?);
|
||||
}
|
||||
|
||||
let page = parse_query_param_usize(&path, "page")
|
||||
.unwrap_or(1)
|
||||
.max(1);
|
||||
|
||||
let per_page = parse_query_param_usize(&path, "per_page")
|
||||
.unwrap_or(config.search.search_results_per_page)
|
||||
.min(config.search.max_search_results);
|
||||
|
||||
match search_engine.search_with_response(&query, page, per_page).await {
|
||||
Ok(response) => {
|
||||
Ok(GurtResponse::ok()
|
||||
.with_header("content-type", "application/json")
|
||||
.with_json_body(&response)?)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("API search failed: {}", e);
|
||||
Ok(GurtResponse::internal_server_error()
|
||||
.with_json_body(&json!({"error": "Search failed", "details": e.to_string()}))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
20
tests/clanker.txt
Normal file
20
tests/clanker.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
User-agent: *
|
||||
Allow: /add-remove-child.html
|
||||
Allow: /attribute.html
|
||||
Allow: /audio.html
|
||||
Allow: /center-and-button.html
|
||||
Allow: /clipboard.html
|
||||
Allow: /crumbs.html
|
||||
Allow: /css-selector.html
|
||||
Allow: /dashboard.html
|
||||
Allow: /dom-utils.html
|
||||
Allow: /index.html
|
||||
Allow: /input-events.html
|
||||
Allow: /interval-and-network.html
|
||||
Allow: /lua-api.html
|
||||
Allow: /network-and-json.html
|
||||
Allow: /object-fit.html
|
||||
Allow: /signal.html
|
||||
Allow: /transform.html
|
||||
Allow: /tween.html
|
||||
Allow: /websocket.html
|
||||
Reference in New Issue
Block a user