book parser example

Tim Ng · Tim Ng · commit ecddb40708a5 · 2013-02-23T16:37:03.000-08:00
diff --git a/.classpath b/.classpath
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry including="**/*.java" kind="src" output="target/test-classes" path="src/test/java"/>
+	<classpathentry including="**/*.java" kind="src" path="src/main/java"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="var" path="M2_REPO/org/jsoup/jsoup/1.7.2/jsoup-1.7.2.jar"/>
+	<classpathentry kind="var" path="M2_REPO/commons-lang/commons-lang/2.4/commons-lang-2.4.jar"/>
+	<!-- Project-relative paths: both jars are checked in at the project root, so no machine-specific absolute path is needed. -->
+	<classpathentry kind="lib" path="commons-lang3-3.1.jar"/>
+	<classpathentry kind="lib" path="data-exporter-1.0.2.jar"/>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+target/classes/
+target/test-classes/
+bin/
diff --git a/.project b/.project
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+  <name>BookParser</name>
+  <comment>NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse.</comment>
+  <projects/>
+  <buildSpec>
+    <buildCommand>
+      <name>org.eclipse.jdt.core.javabuilder</name>
+    </buildCommand>
+  </buildSpec>
+  <natures>
+    <nature>org.eclipse.jdt.core.javanature</nature>
+  </natures>
+</projectDescription>
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,12 @@
+#Thu Feb 21 16:56:50 PST 2013
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.6
diff --git a/commons-lang3-3.1.jar b/commons-lang3-3.1.jar
diff --git a/data-exporter-1.0.2.jar b/data-exporter-1.0.2.jar
diff --git a/pom.xml b/pom.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?><project>
+  <!-- Maven build descriptor for the BookParser example: project coordinates plus the libraries it is built against. -->
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>BookParser</groupId>
+  <artifactId>BookParser</artifactId>
+  <version>1.0-SNAPSHOT</version>
+ <dependencies>
+ <!-- jsoup: BookParser.java imports org.jsoup.Jsoup and org.jsoup.nodes.Document from it. -->
+ <dependency>
+	<groupId>org.jsoup</groupId>
+	<artifactId>jsoup</artifactId>
+	<version>1.7.2</version>
+  </dependency>
+  
+  <!-- NOTE(review): BookParser.java only imports the lang3 StringUtils, so this commons-lang 2.x dependency may be unused; confirm before removing. -->
+  <dependency>
+        <groupId>commons-lang</groupId>
+        <artifactId>commons-lang</artifactId>
+        <version>2.4</version>
+    </dependency>
+  
+  <!-- data-exporter: BookParser.java imports the com.brsanthu.dataexporter classes from it. -->
+  <!-- NOTE(review): version 1.0.0 is declared here, but the checked-in jar and .classpath reference data-exporter-1.0.2; confirm which version is intended. -->
+  	<dependency>
+                <groupId>com.brsanthu</groupId>
+                <artifactId>data-exporter</artifactId>
+                <version>1.0.0</version>
+        </dependency>
+    
+    <!-- commons-lang3: source of the org.apache.commons.lang3.StringUtils import in BookParser.java. -->
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+  	<artifactId>commons-lang3</artifactId>
+  	<version>3.1</version>
+    </dependency>
+  </dependencies>
+</project>
diff --git a/src/test/java/BookParser.java b/src/test/java/BookParser.java
@@ -0,0 +1,255 @@
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import com.brsanthu.dataexporter.DataExporter;
+import com.brsanthu.dataexporter.output.texttable.TextTableExporter;
+
+
+/**
+ * Downloads two books from gutenberg.org, counts how often each word occurs in
+ * each of them and writes both frequency lists, side by side, into a text
+ * table stored in {@code word_occurrence_count.txt}.
+ *
+ * @author tim ng
+ */
+
+public class BookParser {
+	final static String BOOK_ONE_URL = "http://www.gutenberg.org/files/17192/17192-h/17192-h.htm";
+	final static String BOOK_TWO_URL = "http://www.gutenberg.org/cache/epub/3748/pg3748.html";
+	/** Value passed as the {@code order} flag of the sort: {@code false} puts the highest count first. */
+	final static boolean DESC = false;
+
+	/** File the result table is written to (overwritten on every run). */
+	private static final String OUTPUT_FILE = "word_occurrence_count.txt";
+
+	/**
+	 * Separators between raw tokens. Inside the character class the '+' is a
+	 * literal, so a plus sign splits tokens just like whitespace, ',' and ';'.
+	 */
+	private static final Pattern TOKEN_DELIMITERS = Pattern.compile("[\\s+,;]+");
+
+	/** Skips leading non-word characters, then captures everything up to the first digit. */
+	private static final Pattern LEADING_JUNK = Pattern.compile("^\\W*([^\\d]*)");
+
+	/** Everything that is neither an ASCII letter nor whitespace. */
+	private static final Pattern NON_LETTERS = Pattern.compile("[^a-z\\sA-Z]");
+
+	/**
+	 * Fetches both books, counts and sorts their words and writes the result table.
+	 *
+	 * @param args ignored
+	 */
+	public static void main(String[] args) {
+		Map<String, Integer> book1Map = new HashMap<String, Integer>();
+		Map<String, Integer> book2Map = new HashMap<String, Integer>();
+
+		String book1 = getBookContent(BOOK_ONE_URL);
+		String book2 = getBookContent(BOOK_TWO_URL);
+		countWordOccurrences(book1Map, book1);
+		countWordOccurrences(book2Map, book2);
+
+		List<Entry<String, Integer>> sortedBook1ListDesc = sortWordOccurrences(book1Map, DESC);
+		List<Entry<String, Integer>> sortedBook2ListDesc = sortWordOccurrences(book2Map, DESC);
+
+		// output results to log file
+		outputToLog(sortedBook1ListDesc, sortedBook2ListDesc);
+	}
+
+	/**
+	 * Splits the book text into tokens and adds the occurrences of every word to
+	 * {@code bookMap}. Tokens that look like URLs or e-mail addresses, tokens
+	 * that are entirely upper case and tokens that are blank after trimming are
+	 * skipped. Words are stored in lower case.
+	 *
+	 * @param bookMap map from word to occurrence count; updated in place
+	 * @param s       the book text; {@code null} (a failed download) is treated
+	 *                as an empty book instead of raising a NullPointerException
+	 */
+	private static void countWordOccurrences(Map<String, Integer> bookMap,
+			String s) {
+		if (s == null) {
+			return;
+		}
+
+		for (String token : TOKEN_DELIMITERS.split(s)) {
+			if (looksLikeAddress(token)) {
+				continue;
+			}
+
+			String word = stripEdges(token);
+
+			// The upper-case test must run before lower-casing, otherwise it could never match.
+			if (StringUtils.isAllUpperCase(word) || StringUtils.isBlank(word)) {
+				continue;
+			}
+			if (word.contains(".")) {
+				checkForSpecialCases(bookMap, word); // e.g. two words glued together by a missing space
+				continue;
+			}
+			increment(bookMap, word.toLowerCase(Locale.ROOT));
+		}
+	}
+
+	/**
+	 * Tells whether a raw token should be ignored because it resembles a web
+	 * address (contains a dot together with "http" or "www") or contains '@'.
+	 */
+	private static boolean looksLikeAddress(String token) {
+		boolean webAddress = token.contains(".")
+				&& (token.contains("http") || token.contains("www"));
+		return webAddress || token.contains("@");
+	}
+
+	/**
+	 * Trims a raw token on both sides: the token is reversed so that the
+	 * leading-junk pattern can be applied to its end, then reversed back and
+	 * the same pattern is applied to its start.
+	 */
+	private static String stripEdges(String token) {
+		String endTrimmed = leadingCapture(StringUtils.reverse(token));
+		return leadingCapture(StringUtils.reverse(endTrimmed));
+	}
+
+	/** Applies {@link #LEADING_JUNK} to {@code text} and returns its captured group. */
+	private static String leadingCapture(String text) {
+		Matcher m = LEADING_JUNK.matcher(text);
+		// Both parts of the pattern may match the empty string, so find() succeeds
+		// for any input; the fallback only guards against future pattern changes.
+		return m.find() ? m.group(1) : "";
+	}
+
+	/** Adds one to the count stored under {@code key}, starting at 1 for a new key. */
+	private static void increment(Map<String, Integer> counts, String key) {
+		Integer current = counts.get(key);
+		counts.put(key, current == null ? 1 : current + 1);
+	}
+
+	/**
+	 * Writes both sorted lists side by side into a text table and stores it in
+	 * {@link #OUTPUT_FILE}. Entries of {@code list2} fill the word/occurrence
+	 * columns that follow the "Verne" header, entries of {@code list1} those
+	 * that follow the "Poe" header; when one list is shorter its cells are left
+	 * blank. The lists are only read, not modified.
+	 *
+	 * NOTE(review): presumably BOOK_TWO_URL is the Verne text and BOOK_ONE_URL
+	 * the Poe text - confirm that the headers match the books.
+	 *
+	 * @param list1 sorted word counts of the first book
+	 * @param list2 sorted word counts of the second book
+	 */
+	private static void outputToLog(List<Entry<String,Integer>>list1,
+										List<Entry<String,Integer>>list2){
+
+		StringWriter sw = new StringWriter();
+		DataExporter exporter = new TextTableExporter(sw);
+
+		exporter.addColumn("  Verne   ",
+				"        Word        ",
+				"  Occurrences  ",
+				"     ",
+				"  Poe  ",
+				"        Word        ",
+				"  Occurrences  ");
+
+		// Walk both lists in lock step until the longer one is exhausted.
+		Iterator<Entry<String, Integer>> it1 = list1.iterator();
+		Iterator<Entry<String, Integer>> it2 = list2.iterator();
+		while (it1.hasNext() || it2.hasNext()) {
+			Entry<String, Integer> entry1 = it1.hasNext() ? it1.next() : null;
+			Entry<String, Integer> entry2 = it2.hasNext() ? it2.next() : null;
+
+			String word1 = entry1 == null ? "" : entry1.getKey();
+			Integer occurrence1 = entry1 == null ? null : entry1.getValue();
+			String word2 = entry2 == null ? "" : entry2.getKey();
+			Integer occurrence2 = entry2 == null ? null : entry2.getValue();
+
+			// add row to exporter
+			exporter.addRow("    ", word2, occurrence2,"    ","      ", word1, occurrence1 );
+		}
+
+		exporter.finishExporting();
+
+		// print results to log file (FileWriter encodes with the platform default charset)
+		try {
+			PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(OUTPUT_FILE, false)));
+			try {
+				out.println(sw.toString());
+				// PrintWriter never throws on write errors; it has to be asked.
+				if (out.checkError()) {
+					System.err.println("Error while writing " + OUTPUT_FILE);
+				}
+			} finally {
+				out.close();
+			}
+		} catch (IOException ioe) {
+			System.err.println("Could not open " + OUTPUT_FILE + ": " + ioe.getMessage());
+		}
+	}
+
+	/**
+	 * Handles a word that still contains a dot after trimming. The word is
+	 * split at the dots and every fragment is counted on its own, in lower
+	 * case, so that it lands on the same key as the regular counting path.
+	 * Fragments that are entirely upper case or shorter than two letters are
+	 * dropped, which discards the pieces of abbreviations such as a.m., p.m.
+	 * or u.s.
+	 *
+	 * @param counts map from word to occurrence count; updated in place
+	 * @param word   trimmed word containing at least one dot
+	 */
+	private static void checkForSpecialCases(Map<String, Integer> counts,
+			String word) {
+
+		for (String fragment : word.split("\\.")) {
+			String letters = NON_LETTERS.matcher(fragment).replaceAll("");
+			if (StringUtils.isAllUpperCase(letters) || letters.length() <= 1) {
+				continue;
+			}
+			increment(counts, letters.toLowerCase(Locale.ROOT));
+		}
+	}
+
+	/**
+	 * Downloads the HTML page at {@code url} and returns the text of its body.
+	 *
+	 * @param url address of the book in HTML format
+	 * @return the body text, or {@code null} when the download fails with an
+	 *         IOException (the failure is reported on standard error)
+	 */
+	private static String getBookContent(String url) {
+		try {
+			Document doc = Jsoup.connect(url).userAgent(
+					"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0").get();
+
+			return doc.body().text(); // all the text within the body tag
+		} catch (IOException e) {
+			System.err.println("Could not download " + url);
+			e.printStackTrace();
+			return null;
+		}
+	}
+
+	/**
+	 * Returns the entries of {@code unsortMap} sorted by occurrence count.
+	 * Entries with the same count are ordered alphabetically by word so that
+	 * the result does not depend on the iteration order of the map.
+	 *
+	 * @param unsortMap map from word to occurrence count
+	 * @param order     {@code true} for ascending counts, {@code false} for descending
+	 * @return a new list holding the map's entries in sorted order
+	 */
+	private static List<Entry<String, Integer>> sortWordOccurrences(Map<String, Integer> unsortMap, final boolean order)
+	{
+		List<Entry<String, Integer>> list = new LinkedList<Entry<String, Integer>>(unsortMap.entrySet());
+
+		Collections.sort(list, new Comparator<Entry<String, Integer>>()
+		{
+			@Override
+			public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2)
+			{
+				int byCount = order
+						? o1.getValue().compareTo(o2.getValue())
+						: o2.getValue().compareTo(o1.getValue());
+				if (byCount != 0)
+				{
+					return byCount;
+				}
+				return o1.getKey().compareTo(o2.getKey());
+			}
+		});
+
+		return list;
+	}
+}
diff --git a/word_occurrence_count.txt b/word_occurrence_count.txt

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+target/classes/`
	`2`	`+target/test-classes/`
	`3`	`+bin/`