From eb75c1dd86b07e476b39b17cb49f11b3521cfad8 Mon Sep 17 00:00:00 2001
From: RENAUD-GOUD Arnaud <arnaud.renaudgoud@gmail.com>
Date: Sat, 28 May 2016 17:33:33 +0200
Subject: [PATCH] Revert content detection changes, as they don't work (#205)

---
 .../sparss/service/FetcherService.java        |  81 ++---
 .../sparss/utils/ArticleTextExtractor.java    | 319 ++----------------
 .../utils/ArticleTextExtractorTest.java       |   2 +-
 3 files changed, 49 insertions(+), 353 deletions(-)

diff --git a/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java b/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java
index c56010ad5..1298f69ad 100755
--- a/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java
+++ b/mobile/src/main/java/net/etuldan/sparss/service/FetcherService.java
@@ -60,7 +60,8 @@
 import android.net.Uri;
 import android.os.Handler;
 import android.os.SystemClock;
-import android.util.Log;
+import android.text.Html;
+import android.text.TextUtils;
 import android.util.Xml;
 import android.widget.Toast;
 
@@ -78,9 +79,6 @@
 import net.etuldan.sparss.utils.NetworkUtils;
 import net.etuldan.sparss.utils.PrefUtils;
 
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-
 import java.io.BufferedReader;
 import java.io.ByteArrayOutputStream;
 import java.io.FileNotFoundException;
@@ -88,9 +86,10 @@
 import java.io.InputStreamReader;
 import java.io.StringReader;
 import java.io.UnsupportedEncodingException;
+import java.net.Authenticator;
 import java.net.HttpURLConnection;
+import java.net.PasswordAuthentication;
 import java.net.URL;
-import java.net.UnknownHostException;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.concurrent.Callable;
@@ -104,7 +103,6 @@
 import java.util.regex.Pattern;
 
 public class FetcherService extends IntentService {
-    private static final String TAG = "FetcherService";
 
     public static final String ACTION_REFRESH_FEEDS = "net.etuldan.sparss.REFRESH";
     public static final String ACTION_MOBILIZE_FEEDS = "net.etuldan.sparss.MOBILIZE_FEEDS";
@@ -190,7 +188,6 @@ public void run() {
                     }
                 });
             }
-            Log.d(TAG, "onHandleIntent: "+intent.getAction()+" aborted due to connectivity problem");
             return;
         }
 
@@ -198,23 +195,22 @@ public void run() {
                 && networkInfo.getType() != ConnectivityManager.TYPE_WIFI;
         // We need to skip the fetching process, so we quit
         if (skipFetch) {
-            Log.d(TAG, "onHandleIntent: abort intent action: " + intent.getAction() + " due to connectivity settings");
             return;
         }
 
-        Log.d(TAG, "onHandleIntent: intent action: " + intent.getAction());
-        PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, true);
         if (ACTION_MOBILIZE_FEEDS.equals(intent.getAction())) {
             mobilizeAllEntries();
             downloadAllImages();
         } else if (ACTION_DOWNLOAD_IMAGES.equals(intent.getAction())) {
             downloadAllImages();
         } else { // == Constants.ACTION_REFRESH_FEEDS
+            PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, true);
+
             if (isFromAutoRefresh) {
                 PrefUtils.putLong(PrefUtils.LAST_SCHEDULED_REFRESH, SystemClock.elapsedRealtime());
             }
 
-            long keepTime = Long.parseLong(PrefUtils.getString(PrefUtils.KEEP_TIME, "4")) * 86400000L;
+            long keepTime = Long.parseLong(PrefUtils.getString(PrefUtils.KEEP_TIME, "4")) * 86400000l;
             long keepDateBorderTime = keepTime > 0 ? System.currentTimeMillis() - keepTime : 0;
 
             deleteOldEntries(keepDateBorderTime);
@@ -272,8 +268,9 @@ public void run() {
 
             mobilizeAllEntries();
             downloadAllImages();
+
+            PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, false);
         }
-        PrefUtils.putBoolean(PrefUtils.IS_REFRESHING, false);
     }
 
     private void mobilizeAllEntries() {
@@ -298,10 +295,7 @@ private void mobilizeAllEntries() {
 
             if (entryCursor.moveToFirst()) {
                 if (entryCursor.isNull(entryCursor.getColumnIndex(EntryColumns.MOBILIZED_HTML))) { // If we didn't already mobilized it
-                    Log.d(TAG, "mobilizeAllEntries: mobilizing entry " + entryId);
-
                     int linkPos = entryCursor.getColumnIndex(EntryColumns.LINK);
-                    int titlePos = entryCursor.getColumnIndex(EntryColumns.TITLE);
                     int abstractHtmlPos = entryCursor.getColumnIndex(EntryColumns.ABSTRACT);
                     int feedIdPos = entryCursor.getColumnIndex(EntryColumns.FEED_ID);
                     HttpURLConnection connection = null;
@@ -321,32 +315,22 @@ private void mobilizeAllEntries() {
                         final String httpAuthPassValue = cursorFeed.getString(httpAuthPasswordPosition);
                         cursorFeed.close();
 
-                        String fullSummary = entryCursor.getString(abstractHtmlPos);
-
-                        String mobilizedHtml;
-                        if(fullSummary.length() > 1000) {
-                            //if summary is long, it is most probably full text. use it!
-                            mobilizedHtml = fullSummary;
-                        } else {
-                            // Try to find a text indicator for better content extraction
-                            Document doc = Jsoup.parse(fullSummary);
-                            String contentIndicator = doc.text().substring(0, Math.min(doc.text().length(), 100));
-
-                            String titleIndicator = entryCursor.getString(titlePos);
-                            doc = Jsoup.parse(titleIndicator);
-                            titleIndicator = doc.text();
-
-//                        titleIndicator = Html.fromHtml(titleIndicator).toString();
-//                        titleIndicator = titleIndicator.replaceAll("[\\s\\u00A0]+"," "); //normalize, all whitespaces (incl char(160)) -> single space
-                            connection = NetworkUtils.setupConnection(link, cookieName, cookieValue, httpAuthLoginValue, httpAuthPassValue);
-                            
-                            mobilizedHtml = ArticleTextExtractor.extractContent(HtmlUtils.decompressStream(connection.getInputStream()), contentIndicator, titleIndicator);
-                            if(mobilizedHtml != null) {
-                                mobilizedHtml = HtmlUtils.improveHtmlContent(mobilizedHtml, NetworkUtils.getBaseUrl(connection.getURL().toURI().toString()));
+                        // Try to find a text indicator for better content extraction
+                        String contentIndicator = null;
+                        String text = entryCursor.getString(abstractHtmlPos);
+                        if (!TextUtils.isEmpty(text)) {
+                            text = Html.fromHtml(text).toString();
+                            if (text.length() > 60) {
+                                contentIndicator = text.substring(20, 40);
                             }
                         }
 
+                        connection = NetworkUtils.setupConnection(link,cookieName, cookieValue,httpAuthLoginValue, httpAuthPassValue);
+
+                        String mobilizedHtml = ArticleTextExtractor.extractContent(connection.getInputStream(), contentIndicator);
+
                         if (mobilizedHtml != null) {
+                            mobilizedHtml = HtmlUtils.improveHtmlContent(mobilizedHtml, NetworkUtils.getBaseUrl(link));
                             ContentValues values = new ContentValues();
                             values.put(EntryColumns.MOBILIZED_HTML, mobilizedHtml);
 
@@ -375,14 +359,12 @@ private void mobilizeAllEntries() {
                             }
                         }
                     } catch (Throwable ignored) {
-                        Log.e(TAG, "Exception: " + ignored.getMessage(), ignored);
                     } finally {
                         if (connection != null) {
                             connection.disconnect();
                         }
                     }
                 } else { // We already mobilized it
-                    Log.d(TAG, "mobilizeAllEntries: entry " + entryId + "@" + entryUri + " already mobilized");
                     success = true;
                     operations.add(ContentProviderOperation.newDelete(TaskColumns.CONTENT_URI(taskId)).build());
                 }
@@ -406,7 +388,6 @@ private void mobilizeAllEntries() {
             try {
                 cr.applyBatch(FeedData.AUTHORITY, operations);
             } catch (Throwable ignored) {
-                Log.e(TAG, "Exception", ignored);
             }
         }
     }
@@ -440,7 +421,6 @@ private void downloadAllImages() {
                     values.put(TaskColumns.NUMBER_ATTEMPT, nbAttempt + 1);
                     operations.add(ContentProviderOperation.newUpdate(TaskColumns.CONTENT_URI(taskId)).withValues(values).build());
                 }
-                Log.e(TAG, "downloadAllImages: Exception", e);
             }
         }
 
@@ -450,7 +430,6 @@ private void downloadAllImages() {
             try {
                 cr.applyBatch(FeedData.AUTHORITY, operations);
             } catch (Throwable ignored) {
-                Log.e(TAG, "Exception", ignored);
             }
         }
     }
@@ -499,7 +478,6 @@ public Integer call() {
                     try {
                         result = refreshFeed(feedId, keepDateBorderTime);
                     } catch (Exception ignored) {
-                        Log.e(TAG, "Exception", ignored);
                     }
                     return result;
                 }
@@ -513,7 +491,6 @@ public Integer call() {
                 Future<Integer> f = completionService.take();
                 globalResult += f.get();
             } catch (Exception ignored) {
-                Log.e(TAG, "Exception", ignored);
             }
         }
 
@@ -557,7 +534,7 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
 
                 if (fetchMode == 0) {
                     if (contentType != null && contentType.startsWith(CONTENT_TYPE_TEXT_HTML)) {
-                        BufferedReader reader = new BufferedReader(new InputStreamReader(HtmlUtils.decompressStream(connection.getInputStream())));
+                        BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
 
                         String line;
                         int posStart = -1;
@@ -616,7 +593,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                                 Xml.findEncodingByName(index2 > -1 ? contentType.substring(index + 8, index2) : contentType.substring(index + 8));
                                 fetchMode = FETCHMODE_DIRECT;
                             } catch (UnsupportedEncodingException ignored) {
-                                Log.e(TAG, "Exception", ignored);
                                 fetchMode = FETCHMODE_REENCODE;
                             }
                         } else {
@@ -624,7 +600,7 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                         }
 
                     } else {
-                        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(HtmlUtils.decompressStream(connection.getInputStream())));
+                        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
 
                         char[] chars = new char[20];
 
@@ -642,7 +618,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                                 Xml.findEncodingByName(xmlDescription.substring(start + 10, xmlDescription.indexOf('"', start + 11)));
                                 fetchMode = FETCHMODE_DIRECT;
                             } catch (UnsupportedEncodingException ignored) {
-                                Log.e(TAG, "Exception", ignored);
                                 fetchMode = FETCHMODE_REENCODE;
                             }
                         } else {
@@ -663,19 +638,19 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                             int index = contentType.indexOf(CHARSET);
                             int index2 = contentType.indexOf(';', index);
 
-                            InputStream inputStream = HtmlUtils.decompressStream(connection.getInputStream());
+                            InputStream inputStream = connection.getInputStream();
                             Xml.parse(inputStream,
                                     Xml.findEncodingByName(index2 > -1 ? contentType.substring(index + 8, index2) : contentType.substring(index + 8)),
                                     handler);
                         } else {
-                            InputStreamReader reader = new InputStreamReader(HtmlUtils.decompressStream(connection.getInputStream()));
+                            InputStreamReader reader = new InputStreamReader(connection.getInputStream());
                             Xml.parse(reader, handler);
                         }
                         break;
                     }
                     case FETCHMODE_REENCODE: {
                         ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
-                        InputStream inputStream = HtmlUtils.decompressStream(connection.getInputStream());
+                        InputStream inputStream = connection.getInputStream();
 
                         byte[] byteBuffer = new byte[4096];
 
@@ -705,7 +680,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                                                 index + 8, index2) : contentType.substring(index + 8)));
                                         Xml.parse(reader, handler);
                                     } catch (Exception ignored) {
-                                        Log.e(TAG, "Exception", ignored);
                                     }
                                 } else {
                                     StringReader reader = new StringReader(xmlText);
@@ -728,7 +702,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                     values.put(FeedColumns.ERROR, getString(R.string.error_feed_error));
                     cr.update(FeedColumns.CONTENT_URI(id), values, null, null);
                 }
-                Log.e(TAG, "refreshFeed: FileNotFoundException: ", e);
             } catch (Throwable e) {
                 if (handler == null || (!handler.isDone() && !handler.isCancelled())) {
                     ContentValues values = new ContentValues();
@@ -739,7 +712,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                     values.put(FeedColumns.ERROR, e.getMessage() != null ? e.getMessage() : getString(R.string.error_feed_process));
                     cr.update(FeedColumns.CONTENT_URI(id), values, null, null);
                 }
-                Log.e(TAG, "refreshFeed: Exception: ", e);
             } finally {
 
 				/* check and optionally find favicon */
@@ -753,7 +725,6 @@ private int refreshFeed(String feedId, long keepDateBorderTime) {
                         }
                     }
                 } catch (Throwable ignored) {
-                    Log.d(TAG, "Exception favicon could not be retrieved.");
                 }
 
                 if (connection != null) {
diff --git a/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java b/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java
index 597bd19df..62dea4362 100755
--- a/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java
+++ b/mobile/src/main/java/net/etuldan/sparss/utils/ArticleTextExtractor.java
@@ -1,9 +1,6 @@
 package net.etuldan.sparss.utils;
 
-import android.util.Log;
-
 import org.jsoup.Jsoup;
-import org.jsoup.nodes.Attribute;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
@@ -22,34 +19,27 @@
  * @author Peter Karich
  */
 public class ArticleTextExtractor {
-    private static final String TAG = "ArticleTextExtractor";
 
     // Interesting nodes
-    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section|main", Pattern.CASE_INSENSITIVE); //"main" is used by Joomla CMS
+    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
 
     // Unlikely candidates
-    private static final Pattern UNLIKELY = Pattern.compile("^(com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
+    private static final Pattern UNLIKELY = Pattern.compile("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|"
             + "header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsor"
             + "a(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|"
-            + "login|si(debar|gn|ngle))", Pattern.CASE_INSENSITIVE);
-
-    // Most likely positive candidates for id, class, and attributes of matching node
-    private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt"
-            + "|(news)?arti(cle|kel)|instapaper_body))", Pattern.CASE_INSENSITIVE);
-
-    // Most likely positive tag name for matching node
-    private static final Pattern POSITIVE_TAG_NAME = Pattern.compile("(^(article|main)$)", Pattern.CASE_INSENSITIVE);
+            + "login|si(debar|gn|ngle)");
 
-    // Most likely positive attribute for children of a matching parent node
-    private static final Pattern POSITIVE_CHILD_ATTRIBUTE = Pattern.compile("(^(paragra(ph|fo)|section))", Pattern.CASE_INSENSITIVE);
+    // Most likely positive candidates
+    private static final Pattern POSITIVE = Pattern.compile("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
+            + "|arti(cle|kel)|instapaper_body");
 
     // Most likely negative candidates
     private static final Pattern NEGATIVE = Pattern.compile("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
             + "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
-            + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|paragra(ph|fo)", Pattern.CASE_INSENSITIVE);
+            + "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
 
     private static final Pattern NEGATIVE_STYLE =
-            Pattern.compile("hidden|display: ?none|font-size: ?small", Pattern.CASE_INSENSITIVE);
+            Pattern.compile("hidden|display: ?none|font-size: ?small");
 
     /**
      * @param input            extracts article text from given html string. wasn't tested
@@ -57,11 +47,11 @@ public class ArticleTextExtractor {
      * @param contentIndicator a text which should be included into the extracted content, or null
      * @return extracted article, all HTML tags stripped
      */
-    public static String extractContent(InputStream input, String contentIndicator, String titleIndicator) throws Exception {
-        return extractContent(Jsoup.parse(input, null, ""), contentIndicator, titleIndicator);
+    public static String extractContent(InputStream input, String contentIndicator) throws Exception {
+        return extractContent(Jsoup.parse(input, null, ""), contentIndicator);
     }
 
-    private static String extractContent(Document doc, String contentIndicator, String titleIndicator) {
+    public static String extractContent(Document doc, String contentIndicator) {
         if (doc == null)
             throw new NullPointerException("missing document");
 
@@ -70,199 +60,10 @@ private static String extractContent(Document doc, String contentIndicator, Stri
 
         // init elements
         Collection<Element> nodes = getNodes(doc);
-        Element bestMatchElement = null;
-
-        log(TAG, "======================================================");
-        log(TAG, "extractContent: " + titleIndicator + "");
-        if(doc.text().contains(titleIndicator)) { //newMatching can only work if title exists within text. 
-            bestMatchElement = newMatching(nodes, contentIndicator, titleIndicator);
-        }
-
-        if(bestMatchElement != null) {
-            log(TAG, "extractContent: new method worked. <"+bestMatchElement.tagName() + " " + 
-                    bestMatchElement.attributes().toString() + " length:" + bestMatchElement.text().length());
-        }
-
-        if(bestMatchElement == null) {
-            if(contentIndicator != null && !contentIndicator.isEmpty()) {
-                bestMatchElement = conventionalMatching(nodes, contentIndicator, true);
-                if(bestMatchElement != null) {
-                    log(TAG, "extractContent: conventionalMatching worked, withContentFilter==true <"
-                            +bestMatchElement.tagName() + " " + bestMatchElement.attributes().toString() + " length:" + bestMatchElement.text().length());
-                }
-            }
-            if (bestMatchElement == null) {
-                bestMatchElement = conventionalMatching(nodes, contentIndicator, false);
-                if(bestMatchElement != null) {
-                    log(TAG, "extractContent: conventionalMatching worked, withContentFilter==false <"
-                            +bestMatchElement.tagName() + " " + bestMatchElement.attributes().toString() + " length:" + bestMatchElement.text().length());
-                }
-            }
-        }
-
-        if (bestMatchElement == null || bestMatchElement.text().isEmpty()) {
-            Log.e(TAG, "extractContent failed. Returning document body.");
-            bestMatchElement = doc.select("body").first();
-        }
-
-//        log(TAG, "extractContent: -----------------------------------------------------");
-//        log(TAG, bestMatchElement.text());
-//        log(TAG, "extractContent: -----------------------------------------------------");
-        
-        addImageSiblings(bestMatchElement);
-        removeUnwantedElements(bestMatchElement);
-        fixVideoTags(bestMatchElement);
-        fixImageTags(bestMatchElement);
-
-        return bestMatchElement.toString();
-    }
-
-    private static void fixImageTags(Element bestMatchElement) {
-        //search for img and remove lazy-loading
-//        IF IMAGE TAG LOOKS LIKE THIS:
-//        <figure class="NewsArticle__ChapterImage LazyImage mt-sm" data-lazy-image="{&quot;src&quot;: &quot;/ii/4/5/4/7/2/9/8/8/d51292db9620e5ed.jpeg&quot; }" data-lazy-image-text="Bild lädt...">
-//        <img src="data:image/svg+xml;charset=utf-8,%3Csvg xmlns%3D'http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg' viewBox%3D'0 0 4 3'%2F%3E">
-//        ...
-//        </figure>
-//        TRANSFORM TO THIS:
-//        <img src="/ii/4/5/4/7/2/9/8/8/d51292db9620e5ed.jpeg">
-        for (Element img : bestMatchElement.getElementsByTag("img")) {
-            String src = null;
-            if(img.parent() != null && img.parent().tag().getName().equals("figure")) {
-                Element parent = img.parent();
-                String json = parent.attr("data-lazy-image");
-//                    JSONObject obj = new JSONObject(json); //does not work.
-//                    src = obj.getString("src");            //WHY?
-                if(json.length() > 7 && json.substring(2, 5).equals("src")) {
-                    json = json.substring(6);//remove "src"
-                    int first = json.indexOf("\"") + 1;
-                    int last = json.indexOf("\"", first);
-                    src = json.substring(first, last);
-                }
-            }
-            if(src == null && img.hasAttr("data-src")) {
-                src = img.attr("data-src");
-            }
-            if(src != null) {
-                img.attr("src", src);
-                log(TAG, "extractContent: removed lazy-load " + src);
-            }
-        }
-    }
-
-    private static void fixVideoTags(Element bestMatchElement) {
-        //search for video tags and fix them if necessary
-//        IF VIDEO TAG LOOKS LIKE THIS:
-//        <video style="position: absolute; top: 0px; display: none; width: 100%; padding-top: 56.25%;">
-//        ...
-//        <meta itemprop="thumbnailUrl" content="http://...">
-//        <meta itemprop="contentURL" content="http://...">
-//        </video>
-//        TRANSFORM TO THIS:
-//        <video controls poster="http://...">
-//        <source src="http://...">
-//        </video>
-        for (Element video : bestMatchElement.getElementsByTag("video")) {
-            String thumb = null;
-            String url = null;
-            for (Element meta : video.getElementsByTag("meta")) {
-                if(meta.attr("itemprop").equals("thumbnailUrl")) {
-                    thumb = meta.attr("content");
-                }
-                if(meta.attr("itemprop").equals("contentURL")) {
-                    url = meta.attr("content");
-                }
-            }
-            if(thumb != null && url != null) {
-                video.attr("controls", true);
-                video.attr("poster", thumb);
-                video.appendElement("source").attr("src", url);
-                log(TAG, "extractContent: fixed video " + url);
-            }
-        }
-    }
-    private static final Pattern UNWANTED_TAGS = Pattern.compile("^(aside)$", Pattern.CASE_INSENSITIVE);
-    private static final Pattern UNWANTED_CLASSES = Pattern.compile("^(msgCenter|correlat(ed|i)|breadcrumb|TopNews)$", Pattern.CASE_INSENSITIVE);
-    private static final Pattern UNWANTED_IDS = Pattern.compile("^(commentiMsgCenter|disclaimer|(comment-)?navigation)$", Pattern.CASE_INSENSITIVE);
-    
-    private static void removeUnwantedElements(Element bestMatchElement) {
-        for (Element child : bestMatchElement.children()) {
-            if (UNWANTED_CLASSES.matcher(child.className()).find()) {
-                child.remove();
-                continue;
-            }
-            if(UNWANTED_TAGS.matcher(child.tagName()).find()) {
-                child.remove();
-                continue;
-            }
-            if(UNWANTED_IDS.matcher(child.id()).find()) {
-                child.remove();
-                continue;
-            }
-        }
-    }
-
-    private static void addImageSiblings(Element bestMatchElement) {
-        //check siblings for images and add them if any available
-        Element previousSibling = bestMatchElement.previousElementSibling();
-        while(previousSibling != null) {
-            if (previousSibling.select("img").size() != 0 &&
-                    previousSibling.children().size() <= 3 //only img and optional two descriptions
-                        && previousSibling.text().length() < 200 //only short description 
-                        ) {
-                    bestMatchElement.prependChild(previousSibling);
-                    log(TAG, "extractContent: prepended image " + previousSibling);
-                    previousSibling = bestMatchElement.previousElementSibling();
-                
-            } else {
-                previousSibling = previousSibling.previousElementSibling();
-            }
-        }
-        Element nextSibling = bestMatchElement.nextElementSibling();
-        while(nextSibling != null) {
-            if (nextSibling.select("img").size() != 0 &&
-                    nextSibling.children().size() <= 2 //only img and optional description
-                        && nextSibling.text().length() < 160 //only short description 
-                        ) {
-                    bestMatchElement.appendChild(nextSibling);
-                    log(TAG, "extractContent: appended image <" + nextSibling.tagName() + " " + nextSibling.attributes());
-                    nextSibling = bestMatchElement.nextElementSibling();
-                
-            } else {
-                nextSibling = nextSibling.nextElementSibling();
-            }
-        }
-    }
-
-    private static void log(String tag, String s) {
-        Log.d(tag, s);
-        System.out.println(tag + ": " + s);
-    }
-
-    /**
-     * Conventional matching algorithm. 
-     * @param nodes All HTML elements to be considered.
-     * @param contentIndicator Only required if withContentFilter==true
-     * @param withContentFilter If true only nodes containing contentIndicator are considered
-     * @return Best matching node or null
-     */
-    private static Element conventionalMatching(Collection<Element> nodes, String contentIndicator, boolean withContentFilter) {
         int maxWeight = 0;
         Element bestMatchElement = null;
+
         for (Element entry : nodes) {
-            String text = entry.text();
-            text = text.substring(0, Math.min(200, text.length())).replaceAll("[\\s\\u00A0]+"," "); //normalized beginning of text
-            //only consider entries which contain the contentIndicator if withContentFilter 
-            if (withContentFilter && !text.contains(contentIndicator)) {
-                continue;
-            }
-//            if(entry.tagName().equals("article") || entry.className().equals("NewsArticle"))
-//            {
-//                maxWeight++;maxWeight--;
-//            }
-//            if(entry.attr("itemprop").equals("articleBody")) {
-//                maxWeight++; maxWeight--;
-//            }
             int currentWeight = getWeight(entry, contentIndicator);
             if (currentWeight > maxWeight) {
                 maxWeight = currentWeight;
@@ -273,62 +74,14 @@ private static Element conventionalMatching(Collection<Element> nodes, String co
                 }
             }
         }
-        if (withContentFilter && maxWeight < 70) {
-            bestMatchElement = null;
-        }
-        return bestMatchElement;
-    }
-
-    /**
-     * New matching algorithm. Find largest node which contains content but not title.
-     * @param nodes
-     * @param contentIndicator
-     * @param titleIndicator
-     * @return
-     */
-    private static Element newMatching(Collection<Element> nodes, String contentIndicator, String titleIndicator) {
-        int maxWeight = 0;
-        Element bestMatchElement = null;
 
-        if(contentIndicator != null && !contentIndicator.isEmpty()) {
-            //first largest node which contains content but not title. that is the content we want.
-            for (Element entry : nodes) {
-//                if(entry.attr("itemprop").equals("articleBody")) {
-//                    maxWeight++; maxWeight--;
-//                }
-//                if(entry.tagName().equals("article") || entry.className().equals("NewsArticle"))
-//                {
-//                    maxWeight++;maxWeight--;
-//                }
-                String text = entry.text().replaceAll("\u00A0", ""); //entry may contain &nbsp; characters which need to be filtered first.
-                text = Jsoup.parse(text).text(); //now text is normalized (like description from rss feed)
-                if(text.contains(contentIndicator)) {
-                    if(!text.contains(titleIndicator)) {
-                        if(entry.text().length() > 200) { //ignore very small tags
-                            float factor = 1;
-                            if (POSITIVE.matcher(entry.className()).find())
-                                factor *= 1.4;
-                            if (POSITIVE.matcher(entry.id()).find())
-                                factor *= 1.4;
-                            for (Attribute a : entry.attributes()) {
-                                if (POSITIVE.matcher(a.getValue()).find())
-                                    factor *= 1.4;
-                            }
-                            int weight = (int) ((float) entry.text().length() * factor);
-                            if (maxWeight < weight) { //use whole content length here!
-                                maxWeight = weight;
-                                bestMatchElement = entry;
-                            }
-                            if (POSITIVE_TAG_NAME.matcher(entry.tagName()).find())
-                                maxWeight += 50;
-                        }
-                    }
-                }
-            }
+        if (bestMatchElement != null) {
+            return bestMatchElement.toString();
         }
-        return bestMatchElement;
+
+        return null;
     }
-    
+
     /**
      * Weights current element. By matching it with positive candidates and
      * weighting child nodes. Since it's impossible to predict which exactly
@@ -340,8 +93,7 @@ private static Element newMatching(Collection<Element> nodes, String contentIndi
      */
     private static int getWeight(Element e, String contentIndicator) {
         int weight = calcWeight(e);
-        //often the wanted elements consists only tags, no text. so this is not needed.
-        //weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
+        weight += (int) Math.round(e.ownText().length() / 100.0 * 10);
         weight += weightChildNodes(e, contentIndicator);
         return weight;
     }
@@ -364,31 +116,20 @@ private static int weightChildNodes(Element rootEl, String contentIndicator) {
         Element caption = null;
         List<Element> pEls = new ArrayList<>(5);
         for (Element child : rootEl.children()) {
-            //if child contains only (!) a single child, get that sub-child instead (recursively!)
-            while(child.children().size() == 1 && child.text().length() == 0) {
-                child = child.child(0);
-            }
             String text = child.text();
             int textLength = text.length();
             if (textLength < 20) {
                 continue;
             }
 
-            //this is not reliable. there are many tags (tree hierarchy) which contain contentIndicator,
-            //at this point we cannot be certain that this is the tag we actually want.
-            //if (contentIndicator != null && text.contains(contentIndicator)) {
-            //    weight += 100; // We certainly found the item
-            //}
-
-            for (Attribute a : child.attributes()                ) {
-                if (POSITIVE_CHILD_ATTRIBUTE.matcher(a.getValue()).find())
-                    weight += 30;
+            if (contentIndicator != null && text.contains(contentIndicator)) {
+                weight += 100; // We certainly found the item
             }
 
             String ownText = child.ownText();
             int ownTextLength = ownText.length();
             if (ownTextLength > 200) {
-                weight += 20;
+                weight += Math.max(50, ownTextLength / 10);
             }
 
             if (child.tagName().equals("h1") || child.tagName().equals("h2")) {
@@ -425,24 +166,12 @@ private static int calcWeightForChild(String text) {
 
     private static int calcWeight(Element e) {
         int weight = 0;
-
-        if (POSITIVE_TAG_NAME.matcher(e.tagName()).find())
-            weight += 35;
-        
         if (POSITIVE.matcher(e.className()).find())
             weight += 35;
 
         if (POSITIVE.matcher(e.id()).find())
             weight += 40;
 
-        //also allow custom HTML attributes, e.g. like Joomla uses: itemprop="articleBody"
-        for (Attribute a : e.attributes()                ) {
-            if(a.getKey().equals("class") || a.getKey().equals("id") )
-                continue; //already accounted for above.
-            if (POSITIVE.matcher(a.getValue()).find())
-                weight += 35;            
-        }
-
         if (UNLIKELY.matcher(e.className()).find())
             weight -= 20;
 
@@ -497,13 +226,9 @@ private static Document removeScriptsAndStyles(Document doc) {
             item.remove();
         }
 
-        //some websites include images inside noscript tags, e.g. https://www.nrdc.org/rss.xml
         Elements noscripts = doc.getElementsByTag("noscript");
         for (Element item : noscripts) {
-            if(item.getElementsByTag("img").size() == 0) {
-                //only remove if <noscript> does not contain image
-                item.remove();
-            }
+            item.remove();
         }
 
         Elements styles = doc.getElementsByTag("style");
diff --git a/mobile/src/test/java/net/etuldan/sparss/utils/ArticleTextExtractorTest.java b/mobile/src/test/java/net/etuldan/sparss/utils/ArticleTextExtractorTest.java
index 5c100cf6e..9a919739b 100755
--- a/mobile/src/test/java/net/etuldan/sparss/utils/ArticleTextExtractorTest.java
+++ b/mobile/src/test/java/net/etuldan/sparss/utils/ArticleTextExtractorTest.java
@@ -413,7 +413,7 @@ private void checkArticle(String articleUrl, String titleIndicator, String conte
         
         InputStream in = decompressStream(con.getInputStream());
 
-        String s = ArticleTextExtractor.extractContent(in, contentIndicator, titleIndicator);
+        String s = ArticleTextExtractor.extractContent(in, contentIndicator);
         s = HtmlUtils.improveHtmlContent(s, NetworkUtils.getBaseUrl(con.getURL().toURI().toString())); //make image paths absolute
         Document doc = Jsoup.parse(s);