From eb75c1dd86b07e476b39b17cb49f11b3521cfad8 Mon Sep 17 00:00:00 2001 From: RENAUD-GOUD Arnaud Date: Sat, 28 May 2016 17:33:33 +0200 Subject: [PATCH] Revert back for content detection, as it doesn't work ... #205 --- .../sparss/service/FetcherService.java | 81 ++--- .../sparss/utils/ArticleTextExtractor.java | 319 ++---------------- .../utils/ArticleTextExtractorTest.java | 2 +- 3 files changed, 49 insertions(+), 353 deletions(-) diff --git a/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java b/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java index c56010ad5..1298f69ad 100755 --- a/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java +++ b/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java @@ -60,7 +60,8 @@ import android.net.Uri; import android.os.Handler; import android.os.SystemClock; -import android.util.Log; +import android.text.Html; +import android.text.TextUtils; import android.util.Xml; import android.widget.Toast; @@ -78,9 +79,6 @@ import net.etuldan.sparss.utils.NetworkUtils; import net.etuldan.sparss.utils.PrefUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; - import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; @@ -88,9 +86,10 @@ import java.io.InputStreamReader; import java.io.StringReader; import java.io.UnsupportedEncodingException; +import java.net.Authenticator; import java.net.HttpURLConnection; +import java.net.PasswordAuthentication; import java.net.URL; -import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Date; import java.util.concurrent.Callable; @@ -104,7 +103,6 @@ import java.util.regex.Pattern; public class FetcherService extends IntentService { - private static final String TAG = "FetcherService"; public static final String ACTION_REFRESH_FEEDS = "net.etuldan.sparss.REFRESH"; public static final String ACTION_MOBILIZE_FEEDS = "net.etuldan.sparss.MOBILIZE_FEEDS"; @@ -190,7 +188,6 @@ public void run() { } }); } - Log.d(TAG, "onHandleIntent: "+intent.getAction()+" aborted due to connectivity problem"); return; } @@ -198,23 +195,22 @@ public void run() { && networkInfo.getType() != ConnectivityManager.TYPE_WIFI; // We need to skip the fetching process, so we quit if (skipFetch) { - Log.d(TAG, "onHandleIntent: abort intent action: " + intent.getAction() + " due to connectivity settings"); return; } - Log.d(TAG, "onHandleIntent: intent action: " + intent.getAction()); - PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, true); if (ACTION_MOBILIZE_FEEDS.equals(intent.getAction())) { mobilizeAllEntries(); downloadAllImages(); } else if (ACTION_DOWNLOAD_IMAGES.equals(intent.getAction())) { downloadAllImages(); } else { // == Constants.ACTION_REFRESH_FEEDS + PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, true); + if (isFromAutoRefresh) { PrefUtils.putLong(PrefUtils.LAST_SCHEDULED_REFRESH, SystemClock.elapsedRealtime()); } - long keepTime = Long.parseLong(PrefUtils.getString(PrefUtils.KEEP_TIME, "4")) * 86400000L; + long keepTime = Long.parseLong(PrefUtils.getString(PrefUtils.KEEP_TIME, "4")) * 86400000l; long keepDateBorderTime = keepTime > 0 ? System.currentTimeMillis() - keepTime : 0; deleteOldEntries(keepDateBorderTime); @@ -272,8 +268,9 @@ public void run() { mobilizeAllEntries(); downloadAllImages(); + + PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, false); } - PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, false); } private void mobilizeAllEntries() { @@ -298,10 +295,7 @@ private void mobilizeAllEntries() { if (entryCursor.moveToFirst()) { if (entryCursor.isNull(entryCursor.getColumnIndex(EntryColumns.MOBILIZED_HTML))) { // If we didn't already mobilized it - Log.d(TAG, "mobilizeAllEntries: mobilizing entry " + entryId); - int linkPos = entryCursor.getColumnIndex(EntryColumns.LINK); - int titlePos = entryCursor.getColumnIndex(EntryColumns.TITLE); int abstractHtmlPos = entryCursor.getColumnIndex(EntryColumns.ABSTRACT); int feedIdPos = entryCursor.getColumnIndex(EntryColumns.FEED_ID); HttpURLConnection connection = null; @@ -321,32 +315,22 @@ private void mobilizeAllEntries() { final String httpAuthPassValue = cursorFeed.getString(httpAuthPasswordPosition); cursorFeed.close(); - String fullSummary = entryCursor.getString(abstractHtmlPos); - - String mobilizedHtml; - if(fullSummary.length() > 1000) { - //if summary is long, it is most probably full text. use it! - mobilizedHtml = fullSummary; - } else { - // Try to find a text indicator for better content extraction - Document doc = Jsoup.parse(fullSummary); - String contentIndicator = doc.text().substring(0, Math.min(doc.text().length(), 100)); - - String titleIndicator = entryCursor.getString(titlePos); - doc = Jsoup.parse(titleIndicator); - titleIndicator = doc.text(); - -// titleIndicator = Html.fromHtml(titleIndicator).toString(); -// titleIndicator = titleIndicator.replaceAll("[\\s\\u00A0]+"," "); //normalize, all whitespaces (incl char(160)) -> single space - connection = NetworkUtils.setupConnection(link, cookieName, cookieValue, httpAuthLoginValue, httpAuthPassValue); - - mobilizedHtml = ArticleTextExtractor.extractContent(HtmlUtils.decompressStream(connection.getInputStream()), contentIndicator, titleIndicator); - if(mobilizedHtml != null) { - mobilizedHtml = HtmlUtils.improveHtmlContent(mobilizedHtml, NetworkUtils.getBaseUrl(connection.getURL().toURI().toString())); + // Try to find a text indicator for better content extraction + String contentIndicator = null; + String text = entryCursor.getString(abstractHtmlPos); + if (!TextUtils.isEmpty(text)) { + text = Html.fromHtml(text).toString(); + if (text.length() > 60) { + contentIndicator = text.substring(20, 40); } } + connection = NetworkUtils.setupConnection(link,cookieName, cookieValue,httpAuthLoginValue, httpAuthPassValue); + + String mobilizedHtml = ArticleTextExtractor.extractContent(connection.getInputStream(), contentIndicator); + if (mobilizedHtml != null) { + mobilizedHtml = HtmlUtils.improveHtmlContent(mobilizedHtml, NetworkUtils.getBaseUrl(link)); ContentValues values = new ContentValues(); values.put(EntryColumns.MOBILIZED_HTML, mobilizedHtml); @@ -375,14 +359,12 @@ private void mobilizeAllEntries() { } } } catch (Throwable ignored) { - Log.e(TAG, "Exception: " + ignored.getMessage(), ignored); } finally { if (connection != null) { connection.disconnect(); } } } else { // We already mobilized it - Log.d(TAG, "mobilizeAllEntries: entry " + entryId + "@" + entryUri + " already mobilized"); success = true; operations.add(ContentProviderOperation.newDelete(TaskColumns.CONTENT_URI(taskId)).build()); } @@ -406,7 +388,6 @@ private void mobilizeAllEntries() { try { cr.applyBatch(FeedData.AUTHORITY, operations); } catch (Throwable ignored) { - Log.e(TAG, "Exception", ignored); } } } @@ -440,7 +421,6 @@ private void downloadAllImages() { values.put(TaskColumns.NUMBER_ATTEMPT, nbAttempt + 1); operations.add(ContentProviderOperation.newUpdate(TaskColumns.CONTENT_URI(taskId)).withValues(values).build()); } - Log.e(TAG, "downloadAllImages: Exception", e); } } @@ -450,7 +430,6 @@ private void downloadAllImages() { try { cr.applyBatch(FeedData.AUTHORITY, operations); } catch (Throwable ignored) { - Log.e(TAG, "Exception", ignored); } } } @@ -499,7 +478,6 @@ public Integer call() { try { result = refreshFeed(feedId, keepDateBorderTime); } catch (Exception ignored) { - Log.e(TAG, "Exception", ignored); } return result; } @@ -513,7 +491,6 @@ public Integer call() { Future f = completionService.take(); globalResult += f.get(); } catch (Exception ignored) { - Log.e(TAG, "Exception", ignored); } } @@ -557,7 +534,7 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { if (fetchMode == 0) { if (contentType != null && contentType.startsWith(CONTENT_TYPE_TEXT_HTML)) { - BufferedReader reader = new BufferedReader(new InputStreamReader(HtmlUtils.decompressStream(connection.getInputStream()))); + BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream())); String line; int posStart = -1; @@ -616,7 +593,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { Xml.findEncodingByName(index2 > -1 ? contentType.substring(index + 8, index2) : contentType.substring(index + 8)); fetchMode = FETCHMODE_DIRECT; } catch (UnsupportedEncodingException ignored) { - Log.e(TAG, "Exception", ignored); fetchMode = FETCHMODE_REENCODE; } } else { @@ -624,7 +600,7 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { } } else { - BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(HtmlUtils.decompressStream(connection.getInputStream()))); + BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream())); char[] chars = new char[20]; @@ -642,7 +618,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { Xml.findEncodingByName(xmlDescription.substring(start + 10, xmlDescription.indexOf('"', start + 11))); fetchMode = FETCHMODE_DIRECT; } catch (UnsupportedEncodingException ignored) { - Log.e(TAG, "Exception", ignored); fetchMode = FETCHMODE_REENCODE; } } else { @@ -663,19 +638,19 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { int index = contentType.indexOf(CHARSET); int index2 = contentType.indexOf(';', index); - InputStream inputStream = HtmlUtils.decompressStream(connection.getInputStream()); + InputStream inputStream = connection.getInputStream(); Xml.parse(inputStream, Xml.findEncodingByName(index2 > -1 ? contentType.substring(index + 8, index2) : contentType.substring(index + 8)), handler); } else { - InputStreamReader reader = new InputStreamReader(HtmlUtils.decompressStream(connection.getInputStream())); + InputStreamReader reader = new InputStreamReader(connection.getInputStream()); Xml.parse(reader, handler); } break; } case FETCHMODE_REENCODE: { ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); - InputStream inputStream = HtmlUtils.decompressStream(connection.getInputStream()); + InputStream inputStream = connection.getInputStream(); byte[] byteBuffer = new byte[4096]; @@ -705,7 +680,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { index + 8, index2) : contentType.substring(index + 8))); Xml.parse(reader, handler); } catch (Exception ignored) { - Log.e(TAG, "Exception", ignored); } } else { StringReader reader = new StringReader(xmlText); @@ -728,7 +702,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { values.put(FeedColumns.ERROR, getString(R.string.error_feed_error)); cr.update(FeedColumns.CONTENT_URI(id), values, null, null); } - Log.e(TAG, "refreshFeed: FileNotFoundException: ", e); } catch (Throwable e) { if (handler == null || (!handler.isDone() && !handler.isCancelled())) { ContentValues values = new ContentValues(); @@ -739,7 +712,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { values.put(FeedColumns.ERROR, e.getMessage() != null ? e.getMessage() : getString(R.string.error_feed_process)); cr.update(FeedColumns.CONTENT_URI(id), values, null, null); } - Log.e(TAG, "refreshFeed: Exception: ", e); } finally { /* check and optionally find favicon */ @@ -753,7 +725,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) { } } } catch (Throwable ignored) { - Log.d(TAG, "Exception favicon could not be retrieved."); } if (connection != null) { diff --git a/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java b/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java index 597bd19df..62dea4362 100755 --- a/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java +++ b/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java @@ -1,9 +1,6 @@ package net.etuldan.sparss.utils; -import android.util.Log; - import org.jsoup.Jsoup; -import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; @@ -22,34 +19,27 @@ * @author Peter Karich */ public class ArticleTextExtractor { - private static final String TAG = "ArticleTextExtractor"; // Interesting nodes - private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section|main", Pattern.CASE_INSENSITIVE); //"main" is used by Joomla CMS + private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section"); // Unlikely candidates - private static final Pattern UNLIKELY = Pattern.compile("^(com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|" + private static final Pattern UNLIKELY = Pattern.compile("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|" + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor" + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|" - + "login|si(debar|gn|ngle))", Pattern.CASE_INSENSITIVE); - - // Most likely positive candidates for id, class, and attributes of matching node - private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt" - + "|(news)?arti(cle|kel)|instapaper_body))", Pattern.CASE_INSENSITIVE); - - // Most likely positive tag name for matching node - private static final Pattern POSITIVE_TAG_NAME = Pattern.compile("(^(article|main)$)", Pattern.CASE_INSENSITIVE); + + "login|si(debar|gn|ngle)"); - // Most likely positive attribute for children of a matching parent node - private static final Pattern POSITIVE_CHILD_ATTRIBUTE = Pattern.compile("(^(paragra(ph|fo)|section))", Pattern.CASE_INSENSITIVE); + // Most likely positive candidates + private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))" + + "|arti(cle|kel)|instapaper_body"); // Most likely negative candidates private static final Pattern NEGATIVE = Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|" + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|" - + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|paragra(ph|fo)", Pattern.CASE_INSENSITIVE); + + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard"); private static final Pattern NEGATIVE_STYLE = - Pattern.compile("hidden|display: ?none|font-size: ?small", Pattern.CASE_INSENSITIVE); + Pattern.compile("hidden|display: ?none|font-size: ?small"); /** * @param input extracts article text from given html string. wasn't tested @@ -57,11 +47,11 @@ public class ArticleTextExtractor { * @param contentIndicator a text which should be included into the extracted content, or null * @return extracted article, all HTML tags stripped */ - public static String extractContent(InputStream input, String contentIndicator, String titleIndicator) throws Exception { - return extractContent(Jsoup.parse(input, null, ""), contentIndicator, titleIndicator); + public static String extractContent(InputStream input, String contentIndicator) throws Exception { + return extractContent(Jsoup.parse(input, null, ""), contentIndicator); } - private static String extractContent(Document doc, String contentIndicator, String titleIndicator) { + public static String extractContent(Document doc, String contentIndicator) { if (doc == null) throw new NullPointerException("missing document"); @@ -70,199 +60,10 @@ private static String extractContent(Document doc, String contentIndicator, Stri // init elements Collection nodes = getNodes(doc); - Element bestMatchElement = null; - - log(TAG, "======================================================"); - log(TAG, "extractContent: " + titleIndicator + ""); - if(doc.text().contains(titleIndicator)) { //newMatching can only work if title exists within text. - bestMatchElement = newMatching(nodes, contentIndicator, titleIndicator); - } - - if(bestMatchElement != null) { - log(TAG, "extractContent: new method worked. <"+bestMatchElement.tagName() + " " + - bestMatchElement.attributes().toString() + " length:" + bestMatchElement.text().length()); - } - - if(bestMatchElement == null) { - if(contentIndicator != null && !contentIndicator.isEmpty()) { - bestMatchElement = conventionalMatching(nodes, contentIndicator, true); - if(bestMatchElement != null) { - log(TAG, "extractContent: conventionalMatching worked, withContentFilter==true <" - +bestMatchElement.tagName() + " " + bestMatchElement.attributes().toString() + " length:" + bestMatchElement.text().length()); - } - } - if (bestMatchElement == null) { - bestMatchElement = conventionalMatching(nodes, contentIndicator, false); - if(bestMatchElement != null) { - log(TAG, "extractContent: conventionalMatching worked, withContentFilter==false <" - +bestMatchElement.tagName() + " " + bestMatchElement.attributes().toString() + " length:" + bestMatchElement.text().length()); - } - } - } - - if (bestMatchElement == null || bestMatchElement.text().isEmpty()) { - Log.e(TAG, "extractContent failed. Returning document body."); - bestMatchElement = doc.select("body").first(); - } - -// log(TAG, "extractContent: -----------------------------------------------------"); -// log(TAG, bestMatchElement.text()); -// log(TAG, "extractContent: -----------------------------------------------------"); - - addImageSiblings(bestMatchElement); - removeUnwantedElements(bestMatchElement); - fixVideoTags(bestMatchElement); - fixImageTags(bestMatchElement); - - return bestMatchElement.toString(); - } - - private static void fixImageTags(Element bestMatchElement) { - //search for img and remove lazy-loading -// IF IMAGE TAG LOOKS LIKE THIS: -//
-// -// ... -//
-// TRANSFORM TO THIS: -// - for (Element img : bestMatchElement.getElementsByTag("img")) { - String src = null; - if(img.parent() != null && img.parent().tag().getName().equals("figure")) { - Element parent = img.parent(); - String json = parent.attr("data-lazy-image"); -// JSONObject obj = new JSONObject(json); //does not work. -// src = obj.getString("src"); //WHY? - if(json.length() > 7 && json.substring(2, 5).equals("src")) { - json = json.substring(6);//remove "src" - int first = json.indexOf("\"") + 1; - int last = json.indexOf("\"", first); - src = json.substring(first, last); - } - } - if(src == null && img.hasAttr("data-src")) { - src = img.attr("data-src"); - } - if(src != null) { - img.attr("src", src); - log(TAG, "extractContent: removed lazy-load " + src); - } - } - } - - private static void fixVideoTags(Element bestMatchElement) { - //search for video tags and fix them if necessary -// IF VIDEO TAG LOOKS LIKE THIS: -// -// TRANSFORM TO THIS: -// - for (Element video : bestMatchElement.getElementsByTag("video")) { - String thumb = null; - String url = null; - for (Element meta : video.getElementsByTag("meta")) { - if(meta.attr("itemprop").equals("thumbnailUrl")) { - thumb = meta.attr("content"); - } - if(meta.attr("itemprop").equals("contentURL")) { - url = meta.attr("content"); - } - } - if(thumb != null && url != null) { - video.attr("controls", true); - video.attr("poster", thumb); - video.appendElement("source").attr("src", url); - log(TAG, "extractContent: fixed video " + url); - } - } - } - private static final Pattern UNWANTED_TAGS = Pattern.compile("^(aside)$", Pattern.CASE_INSENSITIVE); - private static final Pattern UNWANTED_CLASSES = Pattern.compile("^(msgCenter|correlat(ed|i)|breadcrumb|TopNews)$", Pattern.CASE_INSENSITIVE); - private static final Pattern UNWANTED_IDS = Pattern.compile("^(commentiMsgCenter|disclaimer|(comment-)?navigation)$", Pattern.CASE_INSENSITIVE); - - private static void removeUnwantedElements(Element bestMatchElement) { - for (Element child : bestMatchElement.children()) { - if (UNWANTED_CLASSES.matcher(child.className()).find()) { - child.remove(); - continue; - } - if(UNWANTED_TAGS.matcher(child.tagName()).find()) { - child.remove(); - continue; - } - if(UNWANTED_IDS.matcher(child.id()).find()) { - child.remove(); - continue; - } - } - } - - private static void addImageSiblings(Element bestMatchElement) { - //check siblings for images and add them if any available - Element previousSibling = bestMatchElement.previousElementSibling(); - while(previousSibling != null) { - if (previousSibling.select("img").size() != 0 && - previousSibling.children().size() <= 3 //only img and optional two descriptions - && previousSibling.text().length() < 200 //only short description - ) { - bestMatchElement.prependChild(previousSibling); - log(TAG, "extractContent: prepended image " + previousSibling); - previousSibling = bestMatchElement.previousElementSibling(); - - } else { - previousSibling = previousSibling.previousElementSibling(); - } - } - Element nextSibling = bestMatchElement.nextElementSibling(); - while(nextSibling != null) { - if (nextSibling.select("img").size() != 0 && - nextSibling.children().size() <= 2 //only img and optional description - && nextSibling.text().length() < 160 //only short description - ) { - bestMatchElement.appendChild(nextSibling); - log(TAG, "extractContent: appended image <" + nextSibling.tagName() + " " + nextSibling.attributes()); - nextSibling = bestMatchElement.nextElementSibling(); - - } else { - nextSibling = nextSibling.nextElementSibling(); - } - } - } - - private static void log(String tag, String s) { - Log.d(tag, s); - System.out.println(tag + ": " + s); - } - - /** - * Conventional matching algorithm. - * @param nodes All HTML elements to be considered. - * @param contentIndicator Only required if withContentFilter==true - * @param withContentFilter If true only nodes containing contentIndicator are considered - * @return Best matching node or null - */ - private static Element conventionalMatching(Collection nodes, String contentIndicator, boolean withContentFilter) { int maxWeight = 0; Element bestMatchElement = null; + for (Element entry : nodes) { - String text = entry.text(); - text = text.substring(0, Math.min(200, text.length())).replaceAll("[\\s\\u00A0]+"," "); //normalized beginning of text - //only consider entries which contain the contentIndicator if withContentFilter - if (withContentFilter && !text.contains(contentIndicator)) { - continue; - } -// if(entry.tagName().equals("article") || entry.className().equals("NewsArticle")) -// { -// maxWeight++;maxWeight--; -// } -// if(entry.attr("itemprop").equals("articleBody")) { -// maxWeight++; maxWeight--; -// } int currentWeight = getWeight(entry, contentIndicator); if (currentWeight > maxWeight) { maxWeight = currentWeight; @@ -273,62 +74,14 @@ private static Element conventionalMatching(Collection nodes, String co } } } - if (withContentFilter && maxWeight < 70) { - bestMatchElement = null; - } - return bestMatchElement; - } - - /** - * New matching algorithm. Find largest node which contains content but not title. - * @param nodes - * @param contentIndicator - * @param titleIndicator - * @return - */ - private static Element newMatching(Collection nodes, String contentIndicator, String titleIndicator) { - int maxWeight = 0; - Element bestMatchElement = null; - if(contentIndicator != null && !contentIndicator.isEmpty()) { - //first largest node which contains content but not title. that is the content we want. - for (Element entry : nodes) { -// if(entry.attr("itemprop").equals("articleBody")) { -// maxWeight++; maxWeight--; -// } -// if(entry.tagName().equals("article") || entry.className().equals("NewsArticle")) -// { -// maxWeight++;maxWeight--; -// } - String text = entry.text().replaceAll("\u00A0", ""); //entry may contain   characters which need to be filtered first. - text = Jsoup.parse(text).text(); //now text is normalized (like description from rss feed) - if(text.contains(contentIndicator)) { - if(!text.contains(titleIndicator)) { - if(entry.text().length() > 200) { //ignore very small tags - float factor = 1; - if (POSITIVE.matcher(entry.className()).find()) - factor *= 1.4; - if (POSITIVE.matcher(entry.id()).find()) - factor *= 1.4; - for (Attribute a : entry.attributes()) { - if (POSITIVE.matcher(a.getValue()).find()) - factor *= 1.4; - } - int weight = (int) ((float) entry.text().length() * factor); - if (maxWeight < weight) { //use whole content length here! - maxWeight = weight; - bestMatchElement = entry; - } - if (POSITIVE_TAG_NAME.matcher(entry.tagName()).find()) - maxWeight += 50; - } - } - } - } + if (bestMatchElement != null) { + return bestMatchElement.toString(); } - return bestMatchElement; + + return null; } - + /** * Weights current element. By matching it with positive candidates and * weighting child nodes. Since it's impossible to predict which exactly @@ -340,8 +93,7 @@ private static Element newMatching(Collection nodes, String contentIndi */ private static int getWeight(Element e, String contentIndicator) { int weight = calcWeight(e); - //often the wanted elements consists only tags, no text. so this is not needed. - //weight += (int) Math.round(e.ownText().length() / 100.0 * 10); + weight += (int) Math.round(e.ownText().length() / 100.0 * 10); weight += weightChildNodes(e, contentIndicator); return weight; } @@ -364,31 +116,20 @@ private static int weightChildNodes(Element rootEl, String contentIndicator) { Element caption = null; List pEls = new ArrayList<>(5); for (Element child : rootEl.children()) { - //if child contains only (!) a single child, get that sub-child instead (recursively!) - while(child.children().size() == 1 && child.text().length() == 0) { - child = child.child(0); - } String text = child.text(); int textLength = text.length(); if (textLength < 20) { continue; } - //this is not reliable. there are many tags (tree hierarchy) which contain contentIndicator, - //at this point we cannot be certain that this is the tag we actually want. - //if (contentIndicator != null && text.contains(contentIndicator)) { - // weight += 100; // We certainly found the item - //} - - for (Attribute a : child.attributes() ) { - if (POSITIVE_CHILD_ATTRIBUTE.matcher(a.getValue()).find()) - weight += 30; + if (contentIndicator != null && text.contains(contentIndicator)) { + weight += 100; // We certainly found the item } String ownText = child.ownText(); int ownTextLength = ownText.length(); if (ownTextLength > 200) { - weight += 20; + weight += Math.max(50, ownTextLength / 10); } if (child.tagName().equals("h1") || child.tagName().equals("h2")) { @@ -425,24 +166,12 @@ private static int calcWeightForChild(String text) { private static int calcWeight(Element e) { int weight = 0; - - if (POSITIVE_TAG_NAME.matcher(e.tagName()).find()) - weight += 35; - if (POSITIVE.matcher(e.className()).find()) weight += 35; if (POSITIVE.matcher(e.id()).find()) weight += 40; - //also allow custom HTML attributes, e.g. like Joomla uses: itemprop="articleBody" - for (Attribute a : e.attributes() ) { - if(a.getKey().equals("class") || a.getKey().equals("id") ) - continue; //already accounted for above. - if (POSITIVE.matcher(a.getValue()).find()) - weight += 35; - } - if (UNLIKELY.matcher(e.className()).find()) weight -= 20; @@ -497,13 +226,9 @@ private static Document removeScriptsAndStyles(Document doc) { item.remove(); } - //some websites include images inside noscript tags, e.g. https://www.nrdc.org/rss.xml Elements noscripts = doc.getElementsByTag("noscript"); for (Element item : noscripts) { - if(item.getElementsByTag("img").size() == 0) { - //only remove if