feat(api): add PDF download and CSV conversion functionality

This commit is contained in:
Mariano Riefolo 2024-08-31 16:16:04 +02:00
parent cc92e3360e
commit 5b95bae6e3
4 changed files with 64 additions and 1 deletions

2
.gitignore vendored
View File

@ -1 +1,3 @@
/target /target
pdf
csv

View File

@ -5,7 +5,10 @@ edition = "2021"
[dependencies] [dependencies]
axum = "0.7.5" axum = "0.7.5"
reqwest = "0.12.7" failure = "0.1.8"
hex = "0.4.3"
reqwest = { version = "0.12.7", features = ["stream"] }
scraper = "0.20.0" scraper = "0.20.0"
serde_json = "1.0.127" serde_json = "1.0.127"
sha1 = "0.10.6"
tokio = { version = "1.40.0", features = ["full"] } tokio = { version = "1.40.0", features = ["full"] }

24
pdf2csv.py Normal file
View File

@ -0,0 +1,24 @@
import csv
import sys

import pdfplumber


def convert(input_pdf, output_csv):
    """Extract every table from *input_pdf* and write all rows to *output_csv*.

    Tables are written in page order, concatenated into a single CSV with no
    separator between tables (same layout the original script produced).
    """
    # pdfplumber.open() accepts a path directly; using it as a context manager
    # guarantees the PDF handle is closed even if extraction raises (the old
    # code only reached pdf.close() on the success path).
    with pdfplumber.open(input_pdf) as pdf:
        with open(output_csv, "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            for page in pdf.pages:
                for table in page.extract_tables():
                    # extract_tables() already yields a list of rows; write it
                    # directly instead of making an identity copy first.
                    writer.writerows(table)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        # Usage errors belong on stderr so stdout stays clean for pipelines.
        print("Usage: pdf2csv.py <input.pdf> <output.csv>", file=sys.stderr)
        sys.exit(1)
    convert(sys.argv[1], sys.argv[2])

View File

@ -4,8 +4,11 @@ use axum::{
routing::get, routing::get,
Router, Router,
}; };
use failure::Error;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use serde_json::json; use serde_json::json;
use sha1::{Digest, Sha1};
use std::{fs, process::Command};
pub fn get_routes() -> Router { pub fn get_routes() -> Router {
Router::new().nest("/api", Router::new().route("/pdf", get(pdf))) Router::new().nest("/api", Router::new().route("/pdf", get(pdf)))
@ -34,3 +37,34 @@ pub async fn pdf() -> impl IntoResponse {
Err(e) => (StatusCode::OK, Json(json!({ "error": e.to_string()}))), Err(e) => (StatusCode::OK, Json(json!({ "error": e.to_string()}))),
} }
} }
async fn download_pdf(id: u8) -> Result<String, Error> {
let links = get_pdf_links().await.unwrap();
let url = &links[id as usize];
let mut hasher = Sha1::new();
hasher.update(url.as_bytes());
let result = hasher.finalize();
let filename = hex::encode(result);
fs::create_dir_all("pdf")?;
fs::create_dir_all("csv")?;
let pdf_path = format!("pdf/{}.pdf", filename);
let csv_path = format!("csv/{}.csv", filename);
if fs::metadata(&pdf_path).is_err() {
let response = reqwest::get(url).await?;
let body = response.bytes().await?;
std::fs::write(&pdf_path, body)?;
Command::new("python")
.arg("pdf2csv.py")
.arg(&pdf_path)
.arg(&csv_path)
.spawn()
.expect("Failed to generate csv file");
}
Ok(csv_path)
}