Skip to content

Commit

Permalink
Small HTML parsing fix
Browse files Browse the repository at this point in the history
  • Loading branch information
lukehasawii committed Apr 19, 2023
1 parent dfb9e65 commit 3585107
Showing 1 changed file with 16 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -614,33 +614,46 @@ static String sanitizeHtml(String input){
// Build the jsoup safelist that decides which tags, attributes and link
// protocols survive cleaning.
Safelist list = new Safelist();
// Restrict <a href> to http/https; the "#" entry is presumably there to
// permit in-page anchors (see jsoup Safelist.addProtocols) -- confirm.
list.addProtocols("a", "href", "http", "https", "#");
list.addTags(VALID_TAGS);
// NOTE(review): this per-tag loop and the commented-out copy right below it
// look like the removed/added sides of one change shown together; together
// with the ":all" call further down, the attributes would be registered twice.
// Confirm against the actual file which variant is live.
for(String tag : VALID_TAGS){
list.addAttributes(tag, VALID_ATTRIBUTES);
}
//for(String tag : VALID_TAGS){
// list.addAttributes(tag, VALID_ATTRIBUTES);
//}
// Registers VALID_ATTRIBUTES under the ":all" key instead of per tag; jsoup
// documents ":all" as the pseudo-tag covering every tag -- confirm for the
// jsoup version in use.
list.addAttributes(":all", VALID_ATTRIBUTES);

Cleaner c = new Cleaner(list);

// childList is handed to the visitor and not read again in this method;
// what MyNodeVisitor does with it is defined elsewhere.
List<String> childList = new ArrayList<>();
NodeVisitor myNodeVisitor = new MyNodeVisitor(childList);

//Log.i("HTML", "Input HTML: " + input);

// Parse the raw input, set the document charset to UTF-8, then replace the
// parsed document with the cleaner's output.
// NOTE(review): the charset is set on the pre-clean document; whether it
// carries over to the cleaned document depends on Cleaner.clean -- confirm.
Document base = Jsoup.parse(input);
base.charset(StandardCharsets.UTF_8);
base = c.clean(base);
//Log.i("HTML", "After clean HTML: " + base.outerHtml());

// Unwrap every <div> in the cleaned document (jsoup's unwrap() removes the
// element itself while keeping its children in place).
// The element list is fetched once up front, before any unwrapping happens.
Elements tags = base.getAllElements();
for (Element e : tags) {
if (e.tagName().equals("div")) {
e.unwrap();
}
}
//Log.i("HTML", "After unwrap: " + base.outerHtml());

// Run the project-specific visitor over the whole document tree.
base.traverse(myNodeVisitor);

//Log.i("HTML", "After visiting: " + base.outerHtml());


// Serialize the document and pass it through HtmlCompressor. Flags as named
// by the setters: comment removal on, quote removal off, style-attribute
// removal on, inter-tag space removal on, compressor enabled.
String outerHTML = base.outerHtml();
HtmlCompressor comp = new HtmlCompressor();
comp.setRemoveComments(true);
comp.setRemoveQuotes(false);
comp.setRemoveStyleAttributes(true);
comp.setRemoveIntertagSpaces(true);
comp.setEnabled(true);

//Log.i("HTML", "After compress: " + comp.compress(outerHTML));

// The compressed markup is the method's result.
return comp.compress(outerHTML);

}
Expand Down

0 comments on commit 3585107

Please sign in to comment.