Skip to content

Commit

Permalink
Bug fixes
Browse files Browse the repository at this point in the history
- Prevent downloading images with base64 strings as the source
- Add escaping of quotation characters in the serializer
- Disable the redirect middleware when downloading images, because it causes downloads to fail on multiple sites
- Remove invalid characters when building the epub export file name
- Fix version number in release
  • Loading branch information
hipstermojo committed Dec 24, 2020
1 parent 3bfa82b commit 8407c61
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ description = "A web article downloader"
homepage = "https://github.com/hipstermojo/paperoni"
repository = "https://github.com/hipstermojo/paperoni"
name = "paperoni"
version = "0.2.0-alpha1"
version = "0.2.1-alpha1"
authors = ["Kenneth Gitere <[email protected]>"]
edition = "2018"
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub fn cli_init() -> App<'static, 'static> {
AppSettings::ArgRequiredElseHelp,
AppSettings::UnifiedHelpMessage,
])
.version("0.1.0-alpha1")
.version("0.2.1-alpha1")
.about(
"
Paperoni is an article downloader.
Expand Down
11 changes: 8 additions & 3 deletions src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::moz_readability::{MetaData, Readability};
pub type ResourceInfo = (String, Option<String>);

lazy_static! {
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r"(&|<|>)").unwrap();
static ref ESC_SEQ_REGEX: regex::Regex = regex::Regex::new(r#"(&|<|>|'|")"#).unwrap();
}

pub struct Extractor {
Expand Down Expand Up @@ -56,7 +56,7 @@ impl Extractor {
for img_ref in content_ref.select("img").unwrap() {
img_ref.as_node().as_element().map(|img_elem| {
img_elem.attributes.borrow().get("src").map(|img_url| {
if !img_url.is_empty() {
if !(img_url.is_empty() || img_url.starts_with("data:image")) {
self.img_urls.push((img_url.to_string(), None))
}
})
Expand All @@ -75,7 +75,9 @@ impl Extractor {

async_download_tasks.push(task::spawn(async move {
let mut img_response = surf::Client::new()
.with(surf::middleware::Redirect::default())
// The redirect middleware is temporarily disabled because it interferes
// with image downloads even when no redirect takes place
// .with(surf::middleware::Redirect::default())
.get(&abs_url)
.await
.expect("Unable to retrieve file");
Expand Down Expand Up @@ -185,6 +187,8 @@ pub fn serialize_to_xhtml<W: std::io::Write>(
escape_map.insert("<", "&lt;");
escape_map.insert(">", "&gt;");
escape_map.insert("&", "&amp;");
escape_map.insert("\"", "&quot;");
escape_map.insert("'", "&apos;");
for edge in node_ref.traverse_inclusive() {
match edge {
kuchiki::iter::NodeEdge::Start(n) => match n.data() {
Expand Down Expand Up @@ -248,6 +252,7 @@ mod test {
<p>Some Lorem Ipsum text here</p>
<p>Observe this picture</p>
<img src="./img.jpg" alt="Random image">
<img src="">
</article>
<footer>
<p>Made in HTML</p>
Expand Down
11 changes: 9 additions & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,14 @@ fn download(urls: Vec<String>) {
.download_images(&Url::parse(&url).unwrap())
.await
.expect("Unable to download images");
let file_name = format!("{}.epub", extractor.metadata().title());
let file_name = format!(
"{}.epub",
extractor
.metadata()
.title()
.replace("/", " ")
.replace("\\", " ")
);
let mut out_file = File::create(&file_name).unwrap();
let mut html_buf = Vec::new();
extractor::serialize_to_xhtml(extractor.article().unwrap(), &mut html_buf)
Expand All @@ -68,7 +75,7 @@ fn download(urls: Vec<String>) {
}
epub.metadata("title", extractor.metadata().title().replace("&", "&amp;"))
.unwrap();
epub.add_content(EpubContent::new("code.xhtml", html_buf.as_bytes()))
epub.add_content(EpubContent::new("index.xhtml", html_buf.as_bytes()))
.unwrap();
for img in extractor.img_urls {
let mut file_path = std::env::temp_dir();
Expand Down

0 comments on commit 8407c61

Please sign in to comment.