|
| 1 | + |
| 2 | +import java.io.BufferedWriter; |
| 3 | +import java.io.FileWriter; |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.PrintWriter; |
| 6 | +import java.io.StringWriter; |
| 7 | +import java.util.Collections; |
| 8 | +import java.util.Comparator; |
| 9 | +import java.util.HashMap; |
| 10 | +import java.util.LinkedList; |
| 11 | +import java.util.List; |
| 12 | +import java.util.Map; |
| 13 | +import java.util.Map.Entry; |
| 14 | +import java.util.regex.Matcher; |
| 15 | +import java.util.regex.Pattern; |
| 16 | + |
| 17 | +import org.apache.commons.lang3.StringUtils; |
| 18 | +import org.jsoup.Jsoup; |
| 19 | +import org.jsoup.nodes.Document; |
| 20 | + |
| 21 | +import com.brsanthu.dataexporter.DataExporter; |
| 22 | +import com.brsanthu.dataexporter.output.texttable.TextTableExporter; |
| 23 | + |
| 24 | + |
| 25 | +/** |
| 26 | + * |
| 27 | + * @author tim ng |
| 28 | + * |
| 29 | + */ |
| 30 | + |
| 31 | +public class BookParser { |
| 32 | + final static String BOOK_ONE_URL = "http://www.gutenberg.org/files/17192/17192-h/17192-h.htm"; |
| 33 | + final static String BOOK_TWO_URL = "http://www.gutenberg.org/cache/epub/3748/pg3748.html"; |
| 34 | + final static boolean DESC = false; |
| 35 | + |
| 36 | + |
| 37 | + |
| 38 | + public static void main(String[] args) { |
| 39 | + Map<String, Integer> book1Map = new HashMap<String,Integer>(); |
| 40 | + Map<String, Integer> book2Map = new HashMap<String,Integer>(); |
| 41 | + |
| 42 | + |
| 43 | + String book1 = getBookContent(BOOK_ONE_URL); |
| 44 | + String book2 = getBookContent(BOOK_TWO_URL); |
| 45 | + countWordOccurrences(book1Map, book1); |
| 46 | + countWordOccurrences(book2Map,book2); |
| 47 | + |
| 48 | + List<Entry<String, Integer>> sortedBook1ListDesc = sortWordOccurrences(book1Map,DESC); |
| 49 | + List<Entry<String, Integer>> sortedBook2ListDesc = sortWordOccurrences(book2Map,DESC); |
| 50 | + |
| 51 | + //output results to log file |
| 52 | + outputToLog(sortedBook1ListDesc,sortedBook2ListDesc); |
| 53 | + } |
| 54 | + |
| 55 | + |
| 56 | + /** |
| 57 | + * Split book content into a string array, then put each word occurrences into a map |
| 58 | + * @param bookMap |
| 59 | + * @param s |
| 60 | + */ |
| 61 | + private static void countWordOccurrences(Map<String, Integer> bookMap, |
| 62 | + String s) { |
| 63 | + |
| 64 | + String regex = "^\\W*([^\\d]*)"; //matching any punctuation from beginning for each word |
| 65 | + Pattern pattern = Pattern.compile(regex); |
| 66 | + |
| 67 | + String[] words = s.split("[\\s+,;]+"); //split out words within comma, semicolon and space. |
| 68 | + |
| 69 | + for(String st:words){ |
| 70 | + |
| 71 | + //ignore any word contains http or @ |
| 72 | + if(st.contains(".") && (st.contains("http") || st.contains("www")) || st.contains("@")){ |
| 73 | + continue; |
| 74 | + } |
| 75 | + |
| 76 | + //setup match pattern |
| 77 | + Matcher m = pattern.matcher(StringUtils.reverse(st)); //reverse string to strip out punctuation,space,symbols and tabs at the end of each word |
| 78 | + m.find(); |
| 79 | + String word = m.group(1); |
| 80 | + m = pattern.matcher(StringUtils.reverse(word)); // reverse string back to normal then strip out punctuation,space,symbols and tabs at the beginning of each word |
| 81 | + m.find(); |
| 82 | + word = m.group(1); |
| 83 | + |
| 84 | + //ignore any word that is Uppercase or blank space |
| 85 | + if( StringUtils.isAllUpperCase(word) || StringUtils.isBlank(word)){ |
| 86 | + continue; |
| 87 | + } |
| 88 | + if(word.contains(".")){ |
| 89 | + checkForSpecialCases(bookMap, word); //last checking to cover input errors in the book |
| 90 | + continue; |
| 91 | + } |
| 92 | + word = word.toLowerCase(); //force all the word to lower case |
| 93 | + if (bookMap.containsKey(word)){ |
| 94 | + bookMap.put(word.toLowerCase(), bookMap.get(word) + 1); |
| 95 | + } |
| 96 | + else{ |
| 97 | + bookMap.put(word, 1); |
| 98 | + } |
| 99 | + } |
| 100 | + |
| 101 | + } |
| 102 | + |
| 103 | + /** |
| 104 | + * Output results to a log in a tabular format |
| 105 | + * @param list1 |
| 106 | + * @param list2 |
| 107 | + */ |
| 108 | + private static void outputToLog(List<Entry<String,Integer>>list1, |
| 109 | + List<Entry<String,Integer>>list2){ |
| 110 | + |
| 111 | + StringWriter sw = new StringWriter(); |
| 112 | + DataExporter exporter = new TextTableExporter(sw); |
| 113 | + |
| 114 | + String word1; |
| 115 | + Integer occurrence1; |
| 116 | + |
| 117 | + String word2; |
| 118 | + Integer occurrence2; |
| 119 | + |
| 120 | + |
| 121 | + exporter.addColumn(" Verne ", |
| 122 | + " Word ", |
| 123 | + " Occurrences ", |
| 124 | + " ", |
| 125 | + " Poe ", |
| 126 | + " Word ", |
| 127 | + " Occurrences "); |
| 128 | + |
| 129 | + //If list not empty, continue to loop through it |
| 130 | + while(!list1.isEmpty() || !list2.isEmpty()){ |
| 131 | + |
| 132 | + //if first list is empty insert blank |
| 133 | + if(list1.isEmpty()){ |
| 134 | + word1 = ""; |
| 135 | + occurrence1 = null; |
| 136 | + } |
| 137 | + else{ // add row |
| 138 | + Entry<String, Integer> entry = list1.get(0); //get first entry from list |
| 139 | + word1 = entry.getKey(); |
| 140 | + occurrence1 = entry.getValue(); |
| 141 | + list1.remove(0); //once entry has been processed, remove enty from list |
| 142 | + } |
| 143 | + //if list 2 is empty insert blank |
| 144 | + if(list2.isEmpty()){ |
| 145 | + word2 = ""; |
| 146 | + occurrence2 = null; |
| 147 | + } |
| 148 | + else{ // add row |
| 149 | + Entry<String, Integer> entry = list2.get(0); //get first enty from list |
| 150 | + word2 = entry.getKey(); |
| 151 | + occurrence2 = entry.getValue(); |
| 152 | + list2.remove(0); // remove entry from list once entry has been processed |
| 153 | + } |
| 154 | + // add row to exporter |
| 155 | + exporter.addRow(" ", word2, occurrence2," "," ", word1, occurrence1 ); |
| 156 | + } |
| 157 | + |
| 158 | + exporter.finishExporting(); |
| 159 | + |
| 160 | + |
| 161 | + //print results to log file |
| 162 | + try { |
| 163 | + PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter("word_occurrence_count.txt", false))); |
| 164 | + out.println(sw.toString()); |
| 165 | + out.close(); |
| 166 | + } catch (IOException ioe) { |
| 167 | + System.out.print(ioe.getMessage()); |
| 168 | + } |
| 169 | + } |
| 170 | + |
| 171 | + |
| 172 | + /** |
| 173 | + * Handle special punctuation in words. |
| 174 | + * E.g., ignores a.m., p.m., u.s., etc... |
| 175 | + * @param counts |
| 176 | + * @param word |
| 177 | + */ |
| 178 | + private static void checkForSpecialCases(Map<String, Integer> counts, |
| 179 | + String word) { |
| 180 | + |
| 181 | + //split word that contains dot |
| 182 | + String[] otherWords = word.split("\\."); |
| 183 | + for(String text:otherWords){ |
| 184 | + text = text.replaceAll("[^a-z\\sA-Z]", ""); //replace any non word charters with empty string |
| 185 | + if(StringUtils.isAllUpperCase(text)){ |
| 186 | + continue; |
| 187 | + } |
| 188 | + |
| 189 | + if(text.length() > 1){ |
| 190 | + if(counts.containsKey(text)){ |
| 191 | + counts.put(text, counts.get(text) + 1); |
| 192 | + |
| 193 | + } |
| 194 | + else |
| 195 | + counts.put(text, 1); |
| 196 | + } |
| 197 | + } |
| 198 | + } |
| 199 | + |
| 200 | + |
| 201 | + /** |
| 202 | + * Download book in html format from url and extract book text. |
| 203 | + * @param url |
| 204 | + * @return |
| 205 | + */ |
| 206 | + private static String getBookContent(String url) { |
| 207 | + |
| 208 | + String rval = null; |
| 209 | + try { |
| 210 | + Document doc = Jsoup.connect(url).userAgent( |
| 211 | + "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0").get(); //http://www.gutenberg.org/files/17192/17192-h/17192-h.htm |
| 212 | + |
| 213 | + String bodyText = doc.body().text(); //get all the text within body tag |
| 214 | + return bodyText; |
| 215 | + |
| 216 | + } |
| 217 | + catch (IOException e) { |
| 218 | + e.printStackTrace(); |
| 219 | + } |
| 220 | + return rval; |
| 221 | + } |
| 222 | + |
| 223 | + |
| 224 | + /** |
| 225 | + * sort results by descending order |
| 226 | + * @param unsortMap |
| 227 | + * @param order |
| 228 | + * @return |
| 229 | + */ |
| 230 | + private static List<Entry<String, Integer>> sortWordOccurrences(Map<String, Integer> unsortMap, final boolean order) |
| 231 | + { |
| 232 | + |
| 233 | + |
| 234 | + List<Entry<String, Integer>> list = new LinkedList<Entry<String, Integer>>(unsortMap.entrySet()); |
| 235 | + |
| 236 | + // Sorting words by the number of the times they occur |
| 237 | + Collections.sort(list, new Comparator<Entry<String, Integer>>() |
| 238 | + { |
| 239 | + public int compare(Entry<String, Integer> o1,Entry<String, Integer> o2) |
| 240 | + { |
| 241 | + if (order) |
| 242 | + { |
| 243 | + return o1.getValue().compareTo(o2.getValue()); |
| 244 | + } |
| 245 | + else |
| 246 | + { |
| 247 | + return o2.getValue().compareTo(o1.getValue()); |
| 248 | + |
| 249 | + } |
| 250 | + } |
| 251 | + }); |
| 252 | + |
| 253 | + return list; |
| 254 | + } |
| 255 | +} |
0 commit comments