General documentation improvements
This commit is contained in:
@@ -1,5 +1,75 @@
|
||||
The official Gurted search engine, Ringle.
|
||||
# Ringle
|
||||
|
||||
Copy `config.template.toml` to `config.toml` and edit as needed.
|
||||
The official Gurted search engine.
|
||||
|
||||
Run with `cargo run`
|
||||
## Configuration
|
||||
```sh
|
||||
cp config.template.toml config.toml
|
||||
```
|
||||
### Values
|
||||
```toml
|
||||
[database]
|
||||
url = "postgres://..." # A valid postgres database url
|
||||
max_connections = 5 # The maximum number of simultaneous connections to the database
|
||||
```
|
||||
```toml
|
||||
[server]
|
||||
address = "127.0.0.1" # The binding address the server will listen on
|
||||
port = 4879 # The port the server will listen on
|
||||
cert_path = "certs/t.crt" # A path to the certificate
|
||||
key_path = "certs/t.key" # A path to the key for the certificate
|
||||
```
|
||||
|
||||
```toml
|
||||
[search]
|
||||
index_path = "./search_indexes" # The path where the indexed pages will be saved to
|
||||
crawl_interval_hours = 2 # How frequently should the search engine crawl (in hours)
|
||||
max_pages_per_domain = 1000 # Maximum amount of pages indexed per domain
|
||||
crawler_timeout_seconds = 30 # The maximum amount of seconds before a page times out and is skipped
|
||||
crawler_user_agent = "RingleBot/1.0" # The user agent the crawler should use
|
||||
max_concurrent_crawls = 5 # How many pages should the bot crawl concurrently
|
||||
content_size_limit_mb = 10 # The maximum size (in MB) of a page's content; larger pages are skipped
|
||||
index_rebuild_interval_hours = 48 # How often (in hours) should the index be rebuilt
|
||||
search_results_per_page = 20 # How many search results should be displayed per page
|
||||
max_search_results = 1000 # The maximum amount of results displayed
|
||||
|
||||
allowed_extensions = [ # Extensions allowed to be indexed
|
||||
"html", "htm", "txt", "md", "json", "xml", "rss", "atom"
|
||||
]
|
||||
|
||||
blocked_extensions = [ # Extensions that should not be indexed
|
||||
"exe", "zip", "rar", "tar", "gz", "7z", "iso", "dmg",
|
||||
"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
|
||||
"jpg", "jpeg", "png", "gif", "bmp", "svg", "webp",
|
||||
"mp3", "mp4", "avi", "mov", "wmv", "flv", "webm",
|
||||
"css", "js", "woff", "woff2", "ttf", "eot"
|
||||
]
|
||||
```
|
||||
|
||||
```toml
|
||||
[crawler]
|
||||
clanker_txt = true # Whether the crawler should respect clanker.txt
|
||||
crawl_delay_ms = 1000 # The delay (in milliseconds) between each page crawl
|
||||
max_redirects = 5 # The maximum number of redirects the crawler should follow
|
||||
follow_external_links = false # Crawl external links found in the page?
|
||||
max_depth = 10 # The maximum amount of nested pages
|
||||
|
||||
request_headers = [ # The headers the crawler will include in the request while crawling
|
||||
["Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"],
|
||||
["Accept-Language", "en-US,en;q=0.5"],
|
||||
["Accept-Encoding", "gzip, deflate"],
|
||||
["DNT", "1"],
|
||||
]
|
||||
```
|
||||
|
||||
```toml
|
||||
[logging]
|
||||
level = "info" # How much should the search engine log, can be info, debug or trace
|
||||
format = "compact" # The format for the logs
|
||||
```
|
||||
|
||||
## Running
|
||||
Run with:
|
||||
```sh
|
||||
cargo run
|
||||
```
|
||||
Reference in New Issue
Block a user