From 6b024d00bd18bb15eff95cd44ce7ba75bbe5ff57 Mon Sep 17 00:00:00 2001
From: Mariano Riefolo
Date: Tue, 24 Sep 2024 10:09:05 +0200
Subject: [PATCH] Wrote program for Day 61

---
 README.md             |  2 +-
 .../day61/Cargo.toml  |  9 ++++
 .../day61/src/lib.rs  | 43 +++++++++++++++++++
 .../day61/src/main.rs | 20 +++++++++
 4 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 Week-09/Day-61_Write-A-Web-Crawler/day61/Cargo.toml
 create mode 100644 Week-09/Day-61_Write-A-Web-Crawler/day61/src/lib.rs
 create mode 100644 Week-09/Day-61_Write-A-Web-Crawler/day61/src/main.rs

diff --git a/README.md b/README.md
index 6da335f..857c5a3 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ We encourage you to share your progress and ask questions in the Discussions sec
 | Day #58 | [Create A Dice Roller](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-09/Day-58_Create-A-Dice-Roller) | :white_check_mark: |
 | Day #59 | [Perfectly Balanced](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-09/Day-59_Perfectly-Balanced) | :white_check_mark: |
 | Day #60 | [A Game Of Threes](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-09/Day-60_A-Game-Of-Thrones) | :white_check_mark: |
-| Day #61 | [Write A Web Crawler](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-09/Day-61_Write-A-Web-Crawler) | :white_large_square: |
+| Day #61 | [Write A Web Crawler](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-09/Day-61_Write-A-Web-Crawler) | :white_check_mark: |
 | Day #62 | [Funny Plant](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-09/Day-62_Funny-Plant) | :white_large_square: |
 | Day #63 | [The Rabbit Problem](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-09/Day-63_The-Rabbit-Problem) | :white_large_square: |
 | Day #64 | [First Recurring Character](https://github.com/LiveGray/100-Days-Of-Rust/tree/main/Week-10/Day-64_First-Recurring-Character) | :white_large_square: |
diff --git a/Week-09/Day-61_Write-A-Web-Crawler/day61/Cargo.toml b/Week-09/Day-61_Write-A-Web-Crawler/day61/Cargo.toml
new file mode 100644
index 0000000..c324ebd
--- /dev/null
+++ b/Week-09/Day-61_Write-A-Web-Crawler/day61/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "day61"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] }
+reqwest = "0.12.7"
+scraper = "0.20.0"
diff --git a/Week-09/Day-61_Write-A-Web-Crawler/day61/src/lib.rs b/Week-09/Day-61_Write-A-Web-Crawler/day61/src/lib.rs
new file mode 100644
index 0000000..477226c
--- /dev/null
+++ b/Week-09/Day-61_Write-A-Web-Crawler/day61/src/lib.rs
@@ -0,0 +1,43 @@
+pub async fn crawl(url: &str, max_depth: u8) -> Vec<String> {
+    let client = reqwest::Client::new();
+
+    let mut discovered_urls = Vec::new();
+    let mut current_urls = vec![url.to_string()];
+    let mut next_urls = Vec::new();
+
+    for _ in 0..=max_depth {
+        for url in current_urls.iter() {
+            let response = match client.get(url).send().await {
+                Ok(response) => response,
+                Err(_) => {
+                    continue;
+                }
+            }
+            .text()
+            .await
+            .unwrap_or_default();
+
+            let document = scraper::Html::parse_document(&response);
+            for element in document.select(&scraper::Selector::parse("[href],[src]").unwrap()) {
+                let url = if element.value().attr("src").is_some() {
+                    element.value().attr("src").unwrap()
+                } else {
+                    element.value().attr("href").unwrap()
+                };
+                if url.starts_with("https")
+                    && !discovered_urls.contains(&url.to_string())
+                    && !current_urls.contains(&url.to_string())
+                    && !next_urls.contains(&url.to_string())
+                {
+                    next_urls.push(url.to_string());
+                }
+            }
+        }
+
+        discovered_urls.append(&mut current_urls);
+        current_urls = next_urls;
+        next_urls = Vec::new();
+    }
+
+    discovered_urls
+}
diff --git a/Week-09/Day-61_Write-A-Web-Crawler/day61/src/main.rs b/Week-09/Day-61_Write-A-Web-Crawler/day61/src/main.rs
new file mode 100644
index 0000000..53a4a14
--- /dev/null
+++ b/Week-09/Day-61_Write-A-Web-Crawler/day61/src/main.rs
@@ -0,0 +1,20 @@
+use std::io::{self, Write};
+
+#[tokio::main]
+async fn main() {
+    let mut buffer = String::new();
+
+    print!("Enter the URL to crawl: ");
+    io::stdout().flush().unwrap();
+    io::stdin().read_line(&mut buffer).unwrap();
+    let url = buffer.trim().to_string();
+
+    buffer.clear();
+    print!("Enter the max depth: ");
+    io::stdout().flush().unwrap();
+    io::stdin().read_line(&mut buffer).unwrap();
+    let max_depth = buffer.trim().parse().unwrap();
+
+    let discovered_urls = day61::crawl(&url, max_depth).await;
+    println!("{:?}", discovered_urls);
+}
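
Reviewer note, not part of the patch: crawl walks the link graph breadth-first, one depth level per loop iteration, but only queues absolute links that start with "https", so relative href/src values (and protocol-relative ones) are silently skipped. If relative links were wanted as well, one option is to resolve each candidate against the page it was found on using the url crate. A minimal sketch, assuming a url = "2" entry were added under [dependencies]; the helper name absolutize is illustrative, not from the patch:

    use url::Url;

    /// Resolve a possibly-relative link against the page it was found on.
    /// Returns None when the base URL or the joined result fails to parse.
    fn absolutize(base: &str, link: &str) -> Option<String> {
        let base = Url::parse(base).ok()?; // e.g. "https://example.com/docs/"
        // join() handles "img.png", "../up", "/about" and full URLs alike
        Some(base.join(link).ok()?.to_string())
    }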
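
A second note on main.rs: buffer.trim().parse().unwrap() panics when the depth input is not a whole number that fits the u8 parameter of crawl. A re-prompting variant, sketched under the same caveat that it is not part of this patch:

    // Keep asking until the input parses as a u8 (inferred from crawl's signature).
    let max_depth: u8 = loop {
        buffer.clear();
        print!("Enter the max depth: ");
        io::stdout().flush().unwrap();
        io::stdin().read_line(&mut buffer).unwrap();
        match buffer.trim().parse() {
            Ok(n) => break n,
            Err(_) => println!("Please enter a whole number between 0 and 255."),
        }
    };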