feat(api): add PDF download and CSV conversion functionality
This commit is contained in:
parent
cc92e3360e
commit
5b95bae6e3
2
.gitignore
vendored
2
.gitignore
vendored
@ -1 +1,3 @@
|
|||||||
/target
|
/target
|
||||||
|
pdf
|
||||||
|
csv
|
||||||
|
@ -5,7 +5,10 @@ edition = "2021"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
axum = "0.7.5"
|
axum = "0.7.5"
|
||||||
reqwest = "0.12.7"
|
failure = "0.1.8"
|
||||||
|
hex = "0.4.3"
|
||||||
|
reqwest = { version = "0.12.7", features = ["stream"] }
|
||||||
scraper = "0.20.0"
|
scraper = "0.20.0"
|
||||||
serde_json = "1.0.127"
|
serde_json = "1.0.127"
|
||||||
|
sha1 = "0.10.6"
|
||||||
tokio = { version = "1.40.0", features = ["full"] }
|
tokio = { version = "1.40.0", features = ["full"] }
|
||||||
|
24
pdf2csv.py
Normal file
24
pdf2csv.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import pdfplumber
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Command-line entry point: extract every table found in a PDF and write all
# of their rows, in page order, to a single CSV file.
#
# Usage: pdf2csv.py <input.pdf> <output.csv>
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: pdf2csv.py <input.pdf> <output.csv>")
        sys.exit(1)

    input_pdf = sys.argv[1]
    output_csv = sys.argv[2]

    # Both resources are managed by one `with` statement so the PDF handle and
    # the CSV file are closed even if table extraction raises part-way through.
    # The PDF is opened first: a missing/unreadable input fails before the
    # output file is created or truncated.
    with pdfplumber.open(input_pdf) as pdf, \
            open(output_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for page in pdf.pages:
            # extract_tables() yields one list of rows per table on the page;
            # rows of all tables are appended to the same CSV back to back.
            for table in page.extract_tables():
                writer.writerows(table)
|
@ -4,8 +4,11 @@ use axum::{
|
|||||||
routing::get,
|
routing::get,
|
||||||
Router,
|
Router,
|
||||||
};
|
};
|
||||||
|
use failure::Error;
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
|
use sha1::{Digest, Sha1};
|
||||||
|
use std::{fs, process::Command};
|
||||||
|
|
||||||
pub fn get_routes() -> Router {
|
pub fn get_routes() -> Router {
|
||||||
Router::new().nest("/api", Router::new().route("/pdf", get(pdf)))
|
Router::new().nest("/api", Router::new().route("/pdf", get(pdf)))
|
||||||
@ -34,3 +37,34 @@ pub async fn pdf() -> impl IntoResponse {
|
|||||||
Err(e) => (StatusCode::OK, Json(json!({ "error": e.to_string()}))),
|
Err(e) => (StatusCode::OK, Json(json!({ "error": e.to_string()}))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn download_pdf(id: u8) -> Result<String, Error> {
|
||||||
|
let links = get_pdf_links().await.unwrap();
|
||||||
|
let url = &links[id as usize];
|
||||||
|
|
||||||
|
let mut hasher = Sha1::new();
|
||||||
|
hasher.update(url.as_bytes());
|
||||||
|
let result = hasher.finalize();
|
||||||
|
let filename = hex::encode(result);
|
||||||
|
|
||||||
|
fs::create_dir_all("pdf")?;
|
||||||
|
fs::create_dir_all("csv")?;
|
||||||
|
|
||||||
|
let pdf_path = format!("pdf/{}.pdf", filename);
|
||||||
|
let csv_path = format!("csv/{}.csv", filename);
|
||||||
|
|
||||||
|
if fs::metadata(&pdf_path).is_err() {
|
||||||
|
let response = reqwest::get(url).await?;
|
||||||
|
let body = response.bytes().await?;
|
||||||
|
std::fs::write(&pdf_path, body)?;
|
||||||
|
|
||||||
|
Command::new("python")
|
||||||
|
.arg("pdf2csv.py")
|
||||||
|
.arg(&pdf_path)
|
||||||
|
.arg(&csv_path)
|
||||||
|
.spawn()
|
||||||
|
.expect("Failed to generate csv file");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(csv_path)
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user