From 711952dcca6621458553feea9ce8aad5997fb32c Mon Sep 17 00:00:00 2001 From: Matthias Queitsch Date: Thu, 21 Mar 2024 20:09:49 +0100 Subject: [PATCH] add: automatically detect filetype without relying on ending --- Cargo.lock | 7 +++++++ Cargo.toml | 3 ++- src/utils.rs | 32 ++++++++++++++++---------------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2b1d719..dd08bed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -418,6 +418,7 @@ version = "0.18.0" dependencies = [ "comfy-table 6.2.0", "datafusion", + "file-format", "structopt", "thiserror", "tokio", @@ -1018,6 +1019,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +[[package]] +name = "file-format" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ba1b81b3c213cf1c071f8bf3b83531f310df99642e58c48247272eef006cae5" + [[package]] name = "fixedbitset" version = "0.4.2" diff --git a/Cargo.toml b/Cargo.toml index 7ab6e1b..ec1ad22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,4 +22,5 @@ comfy-table = "6.1.2" datafusion = { version = "35.0", features = ["avro"] } structopt = "0.3" tokio = { version = "1.36", features = ["rt-multi-thread"] } -thiserror = "1" \ No newline at end of file +thiserror = "1" +file-format = { version = "0.24.0", features = ["reader-txt"] } diff --git a/src/utils.rs b/src/utils.rs index b7afb12..4948a09 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -7,29 +7,29 @@ use datafusion::prelude::{ AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, SessionContext, }; +use file_format::FileFormat as DetectFileFormat; use std::path::Path; pub fn file_format(filename: &str) -> Result<FileFormat, Error> { - match file_ending(filename)?.as_str() { - "avro" => Ok(FileFormat::Avro), - "csv" => Ok(FileFormat::Csv), - "json" => Ok(FileFormat::Json), - "parquet" | "parq" =>
Ok(FileFormat::Parquet), - other => Err(Error::General(format!( - "unsupported file extension '{}'", - other - ))), + match DetectFileFormat::from_file(filename)? { + DetectFileFormat::ApacheAvro => Ok(FileFormat::Avro), + DetectFileFormat::ApacheParquet => Ok(FileFormat::Parquet), + DetectFileFormat::PlainText => match file_ending(filename)?.as_str() { + "json" => Ok(FileFormat::Json), + "csv" => Ok(FileFormat::Csv), + other => Err(Error::General(format!( + "unsupported file extension '{}'", + other + ))), + }, + other => Err(Error::General(format!("unsupported file type '{}'", other))), } } pub fn file_ending(filename: &str) -> Result<String, Error> { - if let Some(ending) = std::path::Path::new(filename).extension() { - Ok(ending.to_string_lossy().to_string()) - } else { - Err(Error::General( - "Could not determine file extension".to_string(), - )) - } + Ok(std::path::Path::new(filename) + .extension() + .map_or_else(|| "".to_owned(), |e| e.to_string_lossy().to_string())) } pub fn parse_filename(filename: &Path) -> Result<&str, Error> {