feat(api): add PDF download and CSV conversion functionality
This commit is contained in:
parent
cc92e3360e
commit
5b95bae6e3
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1,3 @@
|
||||
/target
|
||||
pdf
|
||||
csv
|
||||
|
@ -5,7 +5,10 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
axum = "0.7.5"
|
||||
reqwest = "0.12.7"
|
||||
failure = "0.1.8"
|
||||
hex = "0.4.3"
|
||||
reqwest = { version = "0.12.7", features = ["stream"] }
|
||||
scraper = "0.20.0"
|
||||
serde_json = "1.0.127"
|
||||
sha1 = "0.10.6"
|
||||
tokio = { version = "1.40.0", features = ["full"] }
|
||||
|
24
pdf2csv.py
Normal file
24
pdf2csv.py
Normal file
@ -0,0 +1,24 @@
|
||||
import pdfplumber
|
||||
import csv
|
||||
import sys
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: extract every table from a PDF and append
    # all of their rows, in page order, to a single CSV file.
    #
    # Usage: pdf2csv.py <input.pdf> <output.csv>
    # Exits with status 1 when the argument count is wrong; any I/O or
    # parsing error propagates as an uncaught exception (non-zero exit).
    if len(sys.argv) != 3:
        # Usage errors belong on stderr so they never mix with real output.
        print("Usage: pdf2csv.py <input.pdf> <output.csv>", file=sys.stderr)
        sys.exit(1)

    input_pdf, output_csv = sys.argv[1], sys.argv[2]

    # Both resources are managed by the same `with` statement so the PDF
    # handle is released even if extraction or writing raises midway.
    # The input is opened first, so a missing PDF fails before the output
    # file is created. The encoding is pinned to UTF-8 so the result does
    # not depend on the platform's locale default.
    with pdfplumber.open(input_pdf) as pdf, \
            open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        for page in pdf.pages:
            for table in page.extract_tables():
                # Each table is already a list of rows; write it as-is.
                writer.writerows(table)
|
@ -4,8 +4,11 @@ use axum::{
|
||||
routing::get,
|
||||
Router,
|
||||
};
|
||||
use failure::Error;
|
||||
use scraper::{Html, Selector};
|
||||
use serde_json::json;
|
||||
use sha1::{Digest, Sha1};
|
||||
use std::{fs, process::Command};
|
||||
|
||||
pub fn get_routes() -> Router {
|
||||
Router::new().nest("/api", Router::new().route("/pdf", get(pdf)))
|
||||
@ -34,3 +37,34 @@ pub async fn pdf() -> impl IntoResponse {
|
||||
Err(e) => (StatusCode::OK, Json(json!({ "error": e.to_string()}))),
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_pdf(id: u8) -> Result<String, Error> {
|
||||
let links = get_pdf_links().await.unwrap();
|
||||
let url = &links[id as usize];
|
||||
|
||||
let mut hasher = Sha1::new();
|
||||
hasher.update(url.as_bytes());
|
||||
let result = hasher.finalize();
|
||||
let filename = hex::encode(result);
|
||||
|
||||
fs::create_dir_all("pdf")?;
|
||||
fs::create_dir_all("csv")?;
|
||||
|
||||
let pdf_path = format!("pdf/{}.pdf", filename);
|
||||
let csv_path = format!("csv/{}.csv", filename);
|
||||
|
||||
if fs::metadata(&pdf_path).is_err() {
|
||||
let response = reqwest::get(url).await?;
|
||||
let body = response.bytes().await?;
|
||||
std::fs::write(&pdf_path, body)?;
|
||||
|
||||
Command::new("python")
|
||||
.arg("pdf2csv.py")
|
||||
.arg(&pdf_path)
|
||||
.arg(&csv_path)
|
||||
.spawn()
|
||||
.expect("Failed to generate csv file");
|
||||
}
|
||||
|
||||
Ok(csv_path)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user