From d7ddd0971624faf33a2341266bd13b5cb0e0edff Mon Sep 17 00:00:00 2001
From: Burton DeWilde <burton@chartbeat.com>
Date: Tue, 20 Dec 2016 13:44:06 -0500
Subject: [PATCH] Removed arias module, features, and tests

---
 dragnet/__init__.py |   3 --
 dragnet/arias.py    | 126 --------------------------------------------
 test/test_arias.py  |  31 -----------
 3 files changed, 160 deletions(-)
 delete mode 100644 dragnet/arias.py
 delete mode 100644 test/test_arias.py

diff --git a/dragnet/__init__.py b/dragnet/__init__.py
index 05af217..9f00b6f 100644
--- a/dragnet/__init__.py
+++ b/dragnet/__init__.py
@@ -1,4 +1,3 @@
-from dragnet.arias import AriasFeatures, Arias
 from dragnet.blocks import Blockifier, PartialBlock, BlockifyError
 from dragnet.features import NormalizedFeature, CSSFeatures
 from dragnet.content_extraction_model import ContentExtractionModel
@@ -22,8 +21,6 @@ def get(key, *args, **kwargs):
             return NormalizedFeature(kohlschuetter_features)
         elif key == 'css':
             return CSSFeatures()
-        elif key == 'arias':
-            return AriasFeatures(*args, **kwargs)
         elif key == 'weninger':
             return weninger_features_kmeans
         elif key == 'readability':
diff --git a/dragnet/arias.py b/dragnet/arias.py
deleted file mode 100644
index 358b8fa..0000000
--- a/dragnet/arias.py
+++ /dev/null
@@ -1,126 +0,0 @@
-#! /usr/bin/env python
-"""
-A *rough* implementation of that described by Aurias et al.:
-https://lirias.kuleuven.be/bitstream/123456789/215528/1/AriasEtAl2009.pdf
-"""
-from .blocks import Blockifier
-from .content_extraction_model import ContentExtractionModel, IdentityPredictor
-
-import numpy as np
-import scipy.weave
-
-
-class AriasFeatures(object):
-    """A global feature based on connected blocks of long text
-      inspired by Arias"""
-    nfeatures = 1
-
-    def __init__(self, percent_cutoff, window):
-        """Set parameters
-
-        percent_cutoff = we use scipy.percentile(block_lengths, percent_cutoff)
-            to determine the min length to call content
-            percent_cutoff is a float in [0, 100]
-        window = the window parameter to strip"""
-        self._percent_cutoff = percent_cutoff
-        self._window = window
-
-    def __call__(self, blocks, train=False):
-        from scipy import percentile
-        features = np.zeros((len(blocks), AriasFeatures.nfeatures))
-
-        block_lengths = np.array([len(block.text) for block in blocks])
-        index = block_lengths.argmax()
-        cutoff = int(percentile(block_lengths, self._percent_cutoff))
-        lowindex, highindex = AriasFeatures.strip(block_lengths, index, self._window, cutoff)
-        features[lowindex:(highindex + 1), 0] = 1.0
-        return features
-
-    @staticmethod
-    def strip(block_lengths, index, window, cutoff):
-        """Strip a list of blocks down to the content.
-
-        Starting at some block index, expand outward to left and right until we
-        encounter `window` consecutive blocks with length less then `cutoff`.
-
-        block_lengths = 1D numpy array of length of text in blocks
-            in document
-        index = the starting index for the determination
-        window = we need this many consecutive blocks <= cutoff to terminate
-        cutoff = min block length to be considered content
-
-        returns start/end block indices of the content
-        """
-        ret = np.zeros(2, np.int)
-        nblock = len(block_lengths)
-        c_code = """
-            // First we'll work backwards to find the beginning index, and then we'll
-            // work forward to find the ending index, and then we'll just take that
-            // slice to be our content
-            int lowindex  = index;
-            int lastindex = index;
-            while (lowindex > 0)
-            {
-                if (lastindex - lowindex > window)
-                    break;
-                if (block_lengths(lowindex) >= cutoff)
-                    lastindex = lowindex;
-                lowindex--;
-            }
-            ret(0) = lastindex;
-
-            // Like above, except we're looking in the forward direction
-            int highindex = index;
-            lastindex = index;
-            while (highindex < nblock)
-            {
-                if (highindex - lastindex > window)
-                    break;
-                if (block_lengths(highindex) >= cutoff)
-                    lastindex = highindex;
-                highindex++;
-            }
-            ret(1) = lastindex;
-        """
-        scipy.weave.inline(
-            c_code,
-            ['ret', 'nblock', 'index', 'window', 'cutoff', 'block_lengths'],
-            type_converters=scipy.weave.converters.blitz)
-        return ret
-
-
-class Arias(ContentExtractionModel):
-    def __init__(self, percentile_cutoff=25, window=3, blockifier=Blockifier, **kwargs):
-        """A Arias model.
-
-        percentile_cutoff, window = passed into AriasFeatures to determine the model
-        blockifier = something that implements Blockify
-        **kwags = any kwargs to pass into ContentExtractionModel.__init__"""
-        features = [AriasFeatures(percentile_cutoff, window)]
-        ContentExtractionModel.__init__(self, blockifier, features, IdentityPredictor, **kwargs)
-
-    @staticmethod
-    def plot(L, name, low, hi, cutoff):
-        '''
-        Helper method to plot the document (like in the paper)
-        '''
-        import numpy as np
-        import matplotlib.pyplot as plt
-
-        # First, plot up to the low point in blue...
-        plt.bar(np.arange(low), [len(l) for l in L[0:low]], linewidth=0.0)
-        # And now from low-high in red
-        plt.bar(
-            np.arange(low, hi + 1), [len(l) for l in L[low:hi + 1]],
-            linewidth=0.0, color='r')
-        # And from then on in blue
-        plt.bar(
-            np.arange(hi + 1, len(L)), [len(l) for l in L[hi + 1:]],
-            linewidth=0.0)
-        # Lastly, apply a line across the board at the cutoff
-        plt.plot([0, len(L)], [cutoff, cutoff], 'g-')
-
-        plt.xlabel('Order')
-        plt.ylabel('Length')
-        plt.title(name)
-        plt.show()
diff --git a/test/test_arias.py b/test/test_arias.py
deleted file mode 100644
index f0fa3a2..0000000
--- a/test/test_arias.py
+++ /dev/null
@@ -1,31 +0,0 @@
-
-import unittest
-import numpy as np
-from scipy import percentile
-
-from html_for_testing import big_html_doc
-from dragnet import Blockifier, Arias
-
-
-class TestArias(unittest.TestCase):
-    def test_arias_model(self):
-        cutoff_percent = 60
-        window = 2
-
-        a = Arias(cutoff_percent, window)
-        content_arias = a.analyze(big_html_doc)
-
-        # now compute the actual content
-        blocks = Blockifier.blockify(big_html_doc)
-        actual_content_indices = [1, 2, 3]
-        actual_content = ' '.join([blocks[k].text for k in actual_content_indices])
-
-        self.assertEqual(actual_content, content_arias)
-
-
-if __name__ == "__main__":
-    unittest.main()
-
-
-
-