diff --git a/.gitignore b/.gitignore
index ea8c4bf..763bcb2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 /target
+pdf
+csv
diff --git a/Cargo.toml b/Cargo.toml
index 9a63dbd..86badb2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,10 @@ edition = "2021"
 
 [dependencies]
 axum = "0.7.5"
-reqwest = "0.12.7"
+failure = "0.1.8"
+hex = "0.4.3"
+reqwest = { version = "0.12.7", features = ["stream"] }
 scraper = "0.20.0"
 serde_json = "1.0.127"
+sha1 = "0.10.6"
 tokio = { version = "1.40.0", features = ["full"] }
diff --git a/pdf2csv.py b/pdf2csv.py
new file mode 100644
index 0000000..f93bf02
--- /dev/null
+++ b/pdf2csv.py
@@ -0,0 +1,30 @@
+"""Extract every table from a PDF file and write the rows to a CSV file.
+
+Usage: pdf2csv.py <input.pdf> <output.csv>
+"""
+import csv
+import sys
+
+import pdfplumber
+
+
+def convert(input_pdf, output_csv):
+    """Write the rows of every table found in ``input_pdf`` to ``output_csv``.
+
+    Tables are written back to back in page order, with no separator rows.
+    """
+    # The context manager closes the PDF even if table extraction raises.
+    with pdfplumber.open(input_pdf) as pdf:
+        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.writer(csvfile)
+            for page in pdf.pages:
+                for table in page.extract_tables():
+                    writer.writerows(table)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print("Usage: pdf2csv.py <input.pdf> <output.csv>", file=sys.stderr)
+        sys.exit(1)
+
+    convert(sys.argv[1], sys.argv[2])
diff --git a/src/api/mod.rs b/src/api/mod.rs
index 96f9d77..c51eee2 100644
--- a/src/api/mod.rs
+++ b/src/api/mod.rs
@@ -4,8 +4,11 @@ use axum::{
     routing::get,
     Router,
 };
+use failure::Error;
 use scraper::{Html, Selector};
 use serde_json::json;
+use sha1::{Digest, Sha1};
+use tokio::{fs, process::Command};
 
 pub fn get_routes() -> Router {
     Router::new().nest("/api", Router::new().route("/pdf", get(pdf)))
@@ -34,3 +37,60 @@ pub async fn pdf() -> impl IntoResponse {
         Err(e) => (StatusCode::OK, Json(json!({ "error": e.to_string()}))),
     }
 }
+
+/// Downloads the PDF at position `id` in the list returned by
+/// `get_pdf_links` and converts it to CSV with `pdf2csv.py`.
+///
+/// Both files are named after the SHA-1 hex digest of the PDF URL
+/// (`pdf/<digest>.pdf` and `csv/<digest>.csv`) and are reused on later
+/// calls, so each step only runs when its output is missing.
+///
+/// Returns the path of the CSV file.
+///
+/// # Errors
+///
+/// Fails when the links cannot be fetched, `id` is out of range, the
+/// download does not succeed, a file operation fails, or the converter
+/// cannot be started or exits with a non-zero status.
+async fn download_pdf(id: u8) -> Result<String, Error> {
+    let links = get_pdf_links()
+        .await
+        .map_err(|e| failure::format_err!("failed to fetch PDF links: {}", e))?;
+    let url = links
+        .get(usize::from(id))
+        .ok_or_else(|| failure::format_err!("no PDF link at index {}", id))?;
+
+    let filename = hex::encode(Sha1::digest(url.as_bytes()));
+
+    fs::create_dir_all("pdf").await?;
+    fs::create_dir_all("csv").await?;
+
+    let pdf_path = format!("pdf/{}.pdf", filename);
+    let csv_path = format!("csv/{}.csv", filename);
+
+    if fs::metadata(&pdf_path).await.is_err() {
+        // Reject HTTP error responses so an error page is never cached as a PDF.
+        let response = reqwest::get(url).await?.error_for_status()?;
+        let body = response.bytes().await?;
+        fs::write(&pdf_path, body).await?;
+    }
+
+    // Checked separately from the PDF so a conversion that failed earlier is
+    // retried instead of leaving the cache without a CSV.
+    if fs::metadata(&csv_path).await.is_err() {
+        // Wait for the converter: the CSV must exist before its path is returned.
+        let status = Command::new("python")
+            .arg("pdf2csv.py")
+            .arg(&pdf_path)
+            .arg(&csv_path)
+            .status()
+            .await?;
+        if !status.success() {
+            // Best effort: a partial CSV must not be mistaken for a cached result.
+            let _ = fs::remove_file(&csv_path).await;
+            return Err(failure::format_err!("pdf2csv.py failed ({})", status));
+        }
+    }
+
+    Ok(csv_path)
+}