Skip to content

Commit ecddb40

Browse files
author
Tim Ng
committed
book parser example
0 parents  commit ecddb40

9 files changed

+8673
-0
lines changed

.classpath

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<classpath>
3+
<classpathentry including="**/*.java" kind="src" output="target/test-classes" path="src/test/java"/>
4+
<classpathentry including="**/*.java" kind="src" path="src/main/java"/>
5+
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
6+
<classpathentry kind="var" path="M2_REPO/org/jsoup/jsoup/1.7.2/jsoup-1.7.2.jar"/>
7+
<classpathentry kind="var" path="M2_REPO/commons-lang/commons-lang/2.4/commons-lang-2.4.jar"/>
8+
<classpathentry kind="lib" path="/Users/ting/dev/Selenium_workspace/BookParser/commons-lang3-3.1.jar"/>
9+
<classpathentry kind="lib" path="/Users/ting/dev/Selenium_workspace/BookParser/data-exporter-1.0.2.jar"/>
10+
<classpathentry kind="output" path="target/classes"/>
11+
</classpath>

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
target/classes/
2+
target/test-classes/
3+
bin/

.project

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<projectDescription>
3+
<name>BookParser</name>
4+
<comment>NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse.</comment>
5+
<projects/>
6+
<buildSpec>
7+
<buildCommand>
8+
<name>org.eclipse.jdt.core.javabuilder</name>
9+
</buildCommand>
10+
</buildSpec>
11+
<natures>
12+
<nature>org.eclipse.jdt.core.javanature</nature>
13+
</natures>
14+
</projectDescription>

.settings/org.eclipse.jdt.core.prefs

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#Thu Feb 21 16:56:50 PST 2013
2+
eclipse.preferences.version=1
3+
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
4+
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
5+
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6+
org.eclipse.jdt.core.compiler.compliance=1.6
7+
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8+
org.eclipse.jdt.core.compiler.debug.localVariable=generate
9+
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10+
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11+
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12+
org.eclipse.jdt.core.compiler.source=1.6

commons-lang3-3.1.jar

308 KB
Binary file not shown.

data-exporter-1.0.2.jar

67.7 KB
Binary file not shown.

pom.xml

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
<?xml version="1.0" encoding="UTF-8"?><project>
2+
<modelVersion>4.0.0</modelVersion>
3+
<groupId>BookParser</groupId>
4+
<artifactId>BookParser</artifactId>
5+
<version>1.0-SNAPSHOT</version>
6+
<dependencies>
7+
<dependency>
8+
<groupId>org.jsoup</groupId>
9+
<artifactId>jsoup</artifactId>
10+
<version>1.7.2</version>
11+
</dependency>
12+
13+
<dependency>
14+
<groupId>commons-lang</groupId>
15+
<artifactId>commons-lang</artifactId>
16+
<version>2.4</version>
17+
</dependency>
18+
19+
<dependency>
20+
<groupId>com.brsanthu</groupId>
21+
<artifactId>data-exporter</artifactId>
22+
<version>1.0.0</version>
23+
</dependency>
24+
25+
<dependency>
26+
<groupId>org.apache.commons</groupId>
27+
<artifactId>commons-lang3</artifactId>
28+
<version>3.1</version>
29+
</dependency>
30+
</dependencies>
31+
</project>

src/test/java/BookParser.java

+255
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
2+
import java.io.BufferedWriter;
3+
import java.io.FileWriter;
4+
import java.io.IOException;
5+
import java.io.PrintWriter;
6+
import java.io.StringWriter;
7+
import java.util.Collections;
8+
import java.util.Comparator;
9+
import java.util.HashMap;
10+
import java.util.LinkedList;
11+
import java.util.List;
12+
import java.util.Map;
13+
import java.util.Map.Entry;
14+
import java.util.regex.Matcher;
15+
import java.util.regex.Pattern;
16+
17+
import org.apache.commons.lang3.StringUtils;
18+
import org.jsoup.Jsoup;
19+
import org.jsoup.nodes.Document;
20+
21+
import com.brsanthu.dataexporter.DataExporter;
22+
import com.brsanthu.dataexporter.output.texttable.TextTableExporter;
23+
24+
25+
/**
26+
*
27+
* @author tim ng
28+
*
29+
*/
30+
31+
public class BookParser {
32+
final static String BOOK_ONE_URL = "http://www.gutenberg.org/files/17192/17192-h/17192-h.htm";
33+
final static String BOOK_TWO_URL = "http://www.gutenberg.org/cache/epub/3748/pg3748.html";
34+
final static boolean DESC = false;
35+
36+
37+
38+
public static void main(String[] args) {
39+
Map<String, Integer> book1Map = new HashMap<String,Integer>();
40+
Map<String, Integer> book2Map = new HashMap<String,Integer>();
41+
42+
43+
String book1 = getBookContent(BOOK_ONE_URL);
44+
String book2 = getBookContent(BOOK_TWO_URL);
45+
countWordOccurrences(book1Map, book1);
46+
countWordOccurrences(book2Map,book2);
47+
48+
List<Entry<String, Integer>> sortedBook1ListDesc = sortWordOccurrences(book1Map,DESC);
49+
List<Entry<String, Integer>> sortedBook2ListDesc = sortWordOccurrences(book2Map,DESC);
50+
51+
//output results to log file
52+
outputToLog(sortedBook1ListDesc,sortedBook2ListDesc);
53+
}
54+
55+
56+
/**
57+
* Split book content into a string array, then put each word occurrences into a map
58+
* @param bookMap
59+
* @param s
60+
*/
61+
private static void countWordOccurrences(Map<String, Integer> bookMap,
62+
String s) {
63+
64+
String regex = "^\\W*([^\\d]*)"; //matching any punctuation from beginning for each word
65+
Pattern pattern = Pattern.compile(regex);
66+
67+
String[] words = s.split("[\\s+,;]+"); //split out words within comma, semicolon and space.
68+
69+
for(String st:words){
70+
71+
//ignore any word contains http or @
72+
if(st.contains(".") && (st.contains("http") || st.contains("www")) || st.contains("@")){
73+
continue;
74+
}
75+
76+
//setup match pattern
77+
Matcher m = pattern.matcher(StringUtils.reverse(st)); //reverse string to strip out punctuation,space,symbols and tabs at the end of each word
78+
m.find();
79+
String word = m.group(1);
80+
m = pattern.matcher(StringUtils.reverse(word)); // reverse string back to normal then strip out punctuation,space,symbols and tabs at the beginning of each word
81+
m.find();
82+
word = m.group(1);
83+
84+
//ignore any word that is Uppercase or blank space
85+
if( StringUtils.isAllUpperCase(word) || StringUtils.isBlank(word)){
86+
continue;
87+
}
88+
if(word.contains(".")){
89+
checkForSpecialCases(bookMap, word); //last checking to cover input errors in the book
90+
continue;
91+
}
92+
word = word.toLowerCase(); //force all the word to lower case
93+
if (bookMap.containsKey(word)){
94+
bookMap.put(word.toLowerCase(), bookMap.get(word) + 1);
95+
}
96+
else{
97+
bookMap.put(word, 1);
98+
}
99+
}
100+
101+
}
102+
103+
/**
104+
* Output results to a log in a tabular format
105+
* @param list1
106+
* @param list2
107+
*/
108+
private static void outputToLog(List<Entry<String,Integer>>list1,
109+
List<Entry<String,Integer>>list2){
110+
111+
StringWriter sw = new StringWriter();
112+
DataExporter exporter = new TextTableExporter(sw);
113+
114+
String word1;
115+
Integer occurrence1;
116+
117+
String word2;
118+
Integer occurrence2;
119+
120+
121+
exporter.addColumn(" Verne ",
122+
" Word ",
123+
" Occurrences ",
124+
" ",
125+
" Poe ",
126+
" Word ",
127+
" Occurrences ");
128+
129+
//If list not empty, continue to loop through it
130+
while(!list1.isEmpty() || !list2.isEmpty()){
131+
132+
//if first list is empty insert blank
133+
if(list1.isEmpty()){
134+
word1 = "";
135+
occurrence1 = null;
136+
}
137+
else{ // add row
138+
Entry<String, Integer> entry = list1.get(0); //get first entry from list
139+
word1 = entry.getKey();
140+
occurrence1 = entry.getValue();
141+
list1.remove(0); //once entry has been processed, remove enty from list
142+
}
143+
//if list 2 is empty insert blank
144+
if(list2.isEmpty()){
145+
word2 = "";
146+
occurrence2 = null;
147+
}
148+
else{ // add row
149+
Entry<String, Integer> entry = list2.get(0); //get first enty from list
150+
word2 = entry.getKey();
151+
occurrence2 = entry.getValue();
152+
list2.remove(0); // remove entry from list once entry has been processed
153+
}
154+
// add row to exporter
155+
exporter.addRow(" ", word2, occurrence2," "," ", word1, occurrence1 );
156+
}
157+
158+
exporter.finishExporting();
159+
160+
161+
//print results to log file
162+
try {
163+
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("word_occurrence_count.txt", false)));
164+
out.println(sw.toString());
165+
out.close();
166+
} catch (IOException ioe) {
167+
System.out.print(ioe.getMessage());
168+
}
169+
}
170+
171+
172+
/**
173+
* Handle special punctuation in words.
174+
* E.g., ignores a.m., p.m., u.s., etc...
175+
* @param counts
176+
* @param word
177+
*/
178+
private static void checkForSpecialCases(Map<String, Integer> counts,
179+
String word) {
180+
181+
//split word that contains dot
182+
String[] otherWords = word.split("\\.");
183+
for(String text:otherWords){
184+
text = text.replaceAll("[^a-z\\sA-Z]", ""); //replace any non word charters with empty string
185+
if(StringUtils.isAllUpperCase(text)){
186+
continue;
187+
}
188+
189+
if(text.length() > 1){
190+
if(counts.containsKey(text)){
191+
counts.put(text, counts.get(text) + 1);
192+
193+
}
194+
else
195+
counts.put(text, 1);
196+
}
197+
}
198+
}
199+
200+
201+
/**
202+
* Download book in html format from url and extract book text.
203+
* @param url
204+
* @return
205+
*/
206+
private static String getBookContent(String url) {
207+
208+
String rval = null;
209+
try {
210+
Document doc = Jsoup.connect(url).userAgent(
211+
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0").get(); //http://www.gutenberg.org/files/17192/17192-h/17192-h.htm
212+
213+
String bodyText = doc.body().text(); //get all the text within body tag
214+
return bodyText;
215+
216+
}
217+
catch (IOException e) {
218+
e.printStackTrace();
219+
}
220+
return rval;
221+
}
222+
223+
224+
/**
225+
* sort results by descending order
226+
* @param unsortMap
227+
* @param order
228+
* @return
229+
*/
230+
private static List<Entry<String, Integer>> sortWordOccurrences(Map<String, Integer> unsortMap, final boolean order)
231+
{
232+
233+
234+
List<Entry<String, Integer>> list = new LinkedList<Entry<String, Integer>>(unsortMap.entrySet());
235+
236+
// Sorting words by the number of the times they occur
237+
Collections.sort(list, new Comparator<Entry<String, Integer>>()
238+
{
239+
public int compare(Entry<String, Integer> o1,Entry<String, Integer> o2)
240+
{
241+
if (order)
242+
{
243+
return o1.getValue().compareTo(o2.getValue());
244+
}
245+
else
246+
{
247+
return o2.getValue().compareTo(o1.getValue());
248+
249+
}
250+
}
251+
});
252+
253+
return list;
254+
}
255+
}

0 commit comments

Comments
 (0)