Skip to content

Commit

Permalink
[Added] support for pptx
Browse files Browse the repository at this point in the history
  • Loading branch information
robin committed Nov 25, 2017
1 parent fe3ba21 commit cc09294
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 2 deletions.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,18 @@ Document File Text Extractor
=============================

A simple Rust library for extracting readable text from specific document formats, such as Word documents (docx).
Currently only support for docx and xlsx, other format coming soon.
Currently only a few formats are supported; other formats are coming soon.

Supported Document
-------------------------


- [x] Microsoft Word (docx)
- [x] Microsoft Excel (xlsx)
- [x] Microsoft PowerPoint (pptx)
- [ ] OpenOffice Writer (odt)



Usage
------
Expand Down
Binary file added data/sample.pptx
Binary file not shown.
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@ extern crate quick_xml as xml;
pub mod msdoc;
pub mod docx;
pub mod xlsx;
pub mod pptx;


pub use msdoc::MsDoc;
pub use docx::Docx;
pub use xlsx::Xlsx;
pub use xlsx::Xlsx;
pub use pptx::Pptx;
128 changes: 128 additions & 0 deletions src/pptx.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@

use zip::ZipArchive;

use xml::reader::Reader;
use xml::events::Event;

use std::path::{Path, PathBuf};
use std::fs::File;
use std::io::prelude::*;
use std::io;
use std::clone::Clone;
use zip::read::ZipFile;

use msdoc::MsDoc;

/// Text extracted from a PowerPoint (`.pptx`) file, exposed through `Read`.
///
/// The text is collected once, when the value is built by `open`; reading
/// afterwards only walks over the already-extracted string.
pub struct Pptx {
/// Path the document was opened from.
path: PathBuf,
/// Text collected from the slide XML while opening the document.
data: String,
/// Byte position in `data` of the next unread byte; starts at 0 and is
/// advanced by `read`.
offset: usize
}

impl MsDoc<Pptx> for Pptx {
    /// Opens the `.pptx` archive at `path` and extracts the text of its slides.
    ///
    /// Every archive entry whose name starts with `ppt/slides` is read and the
    /// XML is scanned in archive order: each `a:p` start tag contributes a
    /// newline, and the text found inside each `a:t` element is appended.
    /// The result is served through the `Read` implementation of `Pptx`.
    ///
    /// # Errors
    ///
    /// Returns an `io::Error` when the file cannot be opened, when the archive
    /// or one of its slide entries cannot be read (including entries that are
    /// not valid UTF-8), or, with kind `InvalidData`, when the slide XML
    /// cannot be parsed or unescaped. None of these conditions panic.
    fn open<P: AsRef<Path>>(path: P) -> io::Result<Pptx> {
        let file = File::open(path.as_ref())?;
        let mut archive = ZipArchive::new(file)?;

        // Gather the XML of all slide entries into one string.
        let mut xml_data = String::new();
        for i in 0..archive.len() {
            let mut entry = archive.by_index(i)?;
            if entry.name().starts_with("ppt/slides") {
                // `read_to_string` appends, so no intermediate buffer is needed.
                entry.read_to_string(&mut xml_data)?;
            }
        }

        let mut txt = String::new();

        if !xml_data.is_empty() {
            let mut buf = Vec::new();
            // True only while the reader is between `<a:t>` and `</a:t>`, so
            // text nodes of unrelated elements never end up in the output.
            let mut in_text = false;
            let mut xml_reader = Reader::from_str(xml_data.as_ref());
            loop {
                match xml_reader.read_event(&mut buf) {
                    Ok(Event::Start(ref e)) => {
                        match e.name() {
                            // A new paragraph starts on a new line.
                            b"a:p" => txt.push('\n'),
                            b"a:t" => in_text = true,
                            _ => (),
                        }
                    },
                    Ok(Event::End(ref e)) => {
                        match e.name() {
                            b"a:t" => in_text = false,
                            _ => (),
                        }
                    },
                    Ok(Event::Text(e)) => {
                        if in_text {
                            let text = e.unescape_and_decode(&xml_reader).map_err(|err| {
                                io::Error::new(
                                    io::ErrorKind::InvalidData,
                                    format!(
                                        "Error at position {}: {:?}",
                                        xml_reader.buffer_position(),
                                        err
                                    ),
                                )
                            })?;
                            txt.push_str(&text);
                        }
                    },
                    Ok(Event::Eof) => break, // exits the loop when reaching end of file
                    Err(e) => {
                        return Err(io::Error::new(
                            io::ErrorKind::InvalidData,
                            format!(
                                "Error at position {}: {:?}",
                                xml_reader.buffer_position(),
                                e
                            ),
                        ));
                    },
                    _ => (),
                }
                // The event buffer is only needed for the current event;
                // clearing it keeps memory use independent of document size.
                buf.clear();
            }
        }

        Ok(
            Pptx {
                path: path.as_ref().to_path_buf(),
                data: txt,
                offset: 0
            }
        )
    }

}

impl Read for Pptx {
    /// Copies the next unread bytes of the extracted text into `buf`.
    ///
    /// As many bytes as fit in `buf` are copied in a single call, and the
    /// internal offset is advanced by that amount. Returns the number of
    /// bytes copied; `Ok(0)` means the text is exhausted (or `buf` is empty).
    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
        // `get` yields `None` instead of panicking should the offset ever lie
        // past the end of the data; that case is treated as end of stream.
        let remaining = self.data.as_bytes().get(self.offset..).unwrap_or(&[]);

        // `Write for &mut [u8]` copies min(buf.len(), remaining.len()) bytes
        // and reports how many were written.
        let written = buf.write(remaining)?;
        self.offset += written;
        Ok(written)
    }
}


#[cfg(test)]
mod tests {
    use std::path::Path;
    use super::*;

    /// Opening the bundled sample presentation must succeed.
    #[test]
    fn instantiate() {
        // Assert on the result; discarding it would let a failure go unnoticed.
        assert!(Pptx::open(Path::new("data/sample.pptx")).is_ok());
    }

    /// Reading the sample to the end reports exactly the bytes it produced.
    #[test]
    fn read() {
        let mut f = Pptx::open(Path::new("data/sample.pptx")).unwrap();

        let mut data = String::new();
        let len = f.read_to_string(&mut data).unwrap();
        println!("len: {}, data: {}", len, data);
        assert_eq!(len, data.len());
    }
}

0 comments on commit cc09294

Please sign in to comment.