diff --git a/README.md b/README.md index 818146a..015bcaf 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,18 @@ Document File Text Extractor ============================= Simple Rust library to extract readable text from specific document format like Word Document (docx). -Currently only support for docx and xlsx, other format coming soon. +Currently only support several format, other format coming soon. + +Supported Document +------------------------- + + +- [x] Microsoft Word (docx) +- [x] Microsoft Excel (xlsx) +- [x] Microsoft Power Point (pptx) +- [ ] OpenOffice Writer (odt) + + Usage ------ diff --git a/data/sample.pptx b/data/sample.pptx new file mode 100644 index 0000000..ea72794 Binary files /dev/null and b/data/sample.pptx differ diff --git a/src/lib.rs b/src/lib.rs index 139f9a4..d6eefd7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,8 +29,10 @@ extern crate quick_xml as xml; pub mod msdoc; pub mod docx; pub mod xlsx; +pub mod pptx; pub use msdoc::MsDoc; pub use docx::Docx; -pub use xlsx::Xlsx; \ No newline at end of file +pub use xlsx::Xlsx; +pub use pptx::Pptx; diff --git a/src/pptx.rs b/src/pptx.rs new file mode 100644 index 0000000..2304c2f --- /dev/null +++ b/src/pptx.rs @@ -0,0 +1,128 @@ + +use zip::ZipArchive; + +use xml::reader::Reader; +use xml::events::Event; + +use std::path::{Path, PathBuf}; +use std::fs::File; +use std::io::prelude::*; +use std::io; +use std::clone::Clone; +use zip::read::ZipFile; + +use msdoc::MsDoc; + +pub struct Pptx { + path: PathBuf, + data: String, + offset: usize +} + +impl MsDoc for Pptx { + fn open>(path: P) -> io::Result { + let file = File::open(path.as_ref())?; + let mut archive = ZipArchive::new(file)?; + + let mut xml_data = String::new(); +// let xml_data_list = Vec::new(); + + for i in 0..archive.len(){ + let mut c_file = archive.by_index(i).unwrap(); + if c_file.name().starts_with("ppt/slides") { + let mut _buff = String::new(); + c_file.read_to_string(&mut _buff); + xml_data += _buff.as_str(); +// break + } + } + + + let mut buf = Vec::new(); + let mut txt = Vec::new(); + + if xml_data.len() > 0 { + let mut to_read = false; + let mut xml_reader = Reader::from_str(xml_data.as_ref()); + loop { + match xml_reader.read_event(&mut buf){ + Ok(Event::Start(ref e)) => { + match e.name() { + b"a:p" => { + to_read = true; + txt.push("\n".to_string()); + }, + b"a:t" => { + to_read = true; +// txt.push("\n".to_string()); + }, + _ => (), + } + }, + Ok(Event::Text(e)) => { + if to_read { + let text = e.unescape_and_decode(&xml_reader).unwrap(); +// println!("# {} #", text); + txt.push(text); + to_read = false; + } + }, + Ok(Event::Eof) => break, // exits the loop when reaching end of file + Err(e) => panic!("Error at position {}: {:?}", xml_reader.buffer_position(), e), + _ => (), + } + } + } + + Ok( + Pptx { + path: path.as_ref().to_path_buf(), + data: txt.join(""), + offset: 0 + } + ) + } + +} + +impl Read for Pptx { + fn read(&mut self, mut buf: &mut [u8]) -> io::Result { + let bytes = self.data.as_bytes(); + let limit = if bytes.len() < self.offset + 10 { + bytes.len() + }else{ + self.offset + 10 + }; + + if self.offset > limit { + Ok(0) + }else{ + + let rv = buf.write(&bytes[self.offset..limit])?; +// println!("offset: {}, limit: {}, rv: {}", self.offset, limit, rv); + self.offset = self.offset + rv; + Ok(rv) + } + } +} + + +#[cfg(test)] +mod tests { + use std::path::{Path, PathBuf}; + use super::*; + + #[test] + fn instantiate(){ + let _ = Pptx::open(Path::new("data/sample.pptx")); + } + + #[test] + fn read(){ + let mut f = Pptx::open(Path::new("data/sample.pptx")).unwrap(); + + let mut data = String::new(); + let len = f.read_to_string(&mut data).unwrap(); + println!("len: {}, data: {}", len, data); + } +}