From d7ddd0971624faf33a2341266bd13b5cb0e0edff Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Tue, 20 Dec 2016 13:44:06 -0500 Subject: [PATCH] Removed arias module, features, and tests --- dragnet/__init__.py | 3 -- dragnet/arias.py | 126 -------------------------------------------- test/test_arias.py | 31 ----------- 3 files changed, 160 deletions(-) delete mode 100644 dragnet/arias.py delete mode 100644 test/test_arias.py diff --git a/dragnet/__init__.py b/dragnet/__init__.py index 05af217..9f00b6f 100644 --- a/dragnet/__init__.py +++ b/dragnet/__init__.py @@ -1,4 +1,3 @@ -from dragnet.arias import AriasFeatures, Arias from dragnet.blocks import Blockifier, PartialBlock, BlockifyError from dragnet.features import NormalizedFeature, CSSFeatures from dragnet.content_extraction_model import ContentExtractionModel @@ -22,8 +21,6 @@ def get(key, *args, **kwargs): return NormalizedFeature(kohlschuetter_features) elif key == 'css': return CSSFeatures() - elif key == 'arias': - return AriasFeatures(*args, **kwargs) elif key == 'weninger': return weninger_features_kmeans elif key == 'readability': diff --git a/dragnet/arias.py b/dragnet/arias.py deleted file mode 100644 index 358b8fa..0000000 --- a/dragnet/arias.py +++ /dev/null @@ -1,126 +0,0 @@ -#! /usr/bin/env python -""" -A *rough* implementation of that described by Aurias et al.: -https://lirias.kuleuven.be/bitstream/123456789/215528/1/AriasEtAl2009.pdf -""" -from .blocks import Blockifier -from .content_extraction_model import ContentExtractionModel, IdentityPredictor - -import numpy as np -import scipy.weave - - -class AriasFeatures(object): - """A global feature based on connected blocks of long text - inspired by Arias""" - nfeatures = 1 - - def __init__(self, percent_cutoff, window): - """Set parameters - - percent_cutoff = we use scipy.percentile(block_lengths, percent_cutoff) - to determine the min length to call content - percent_cutoff is a float in [0, 100] - window = the window parameter to strip""" - self._percent_cutoff = percent_cutoff - self._window = window - - def __call__(self, blocks, train=False): - from scipy import percentile - features = np.zeros((len(blocks), AriasFeatures.nfeatures)) - - block_lengths = np.array([len(block.text) for block in blocks]) - index = block_lengths.argmax() - cutoff = int(percentile(block_lengths, self._percent_cutoff)) - lowindex, highindex = AriasFeatures.strip(block_lengths, index, self._window, cutoff) - features[lowindex:(highindex + 1), 0] = 1.0 - return features - - @staticmethod - def strip(block_lengths, index, window, cutoff): - """Strip a list of blocks down to the content. - - Starting at some block index, expand outward to left and right until we - encounter `window` consecutive blocks with length less then `cutoff`. - - block_lengths = 1D numpy array of length of text in blocks - in document - index = the starting index for the determination - window = we need this many consecutive blocks <= cutoff to terminate - cutoff = min block length to be considered content - - returns start/end block indices of the content - """ - ret = np.zeros(2, np.int) - nblock = len(block_lengths) - c_code = """ - // First we'll work backwards to find the beginning index, and then we'll - // work forward to find the ending index, and then we'll just take that - // slice to be our content - int lowindex = index; - int lastindex = index; - while (lowindex > 0) - { - if (lastindex - lowindex > window) - break; - if (block_lengths(lowindex) >= cutoff) - lastindex = lowindex; - lowindex--; - } - ret(0) = lastindex; - - // Like above, except we're looking in the forward direction - int highindex = index; - lastindex = index; - while (highindex < nblock) - { - if (highindex - lastindex > window) - break; - if (block_lengths(highindex) >= cutoff) - lastindex = highindex; - highindex++; - } - ret(1) = lastindex; - """ - scipy.weave.inline( - c_code, - ['ret', 'nblock', 'index', 'window', 'cutoff', 'block_lengths'], - type_converters=scipy.weave.converters.blitz) - return ret - - -class Arias(ContentExtractionModel): - def __init__(self, percentile_cutoff=25, window=3, blockifier=Blockifier, **kwargs): - """A Arias model. - - percentile_cutoff, window = passed into AriasFeatures to determine the model - blockifier = something that implements Blockify - **kwags = any kwargs to pass into ContentExtractionModel.__init__""" - features = [AriasFeatures(percentile_cutoff, window)] - ContentExtractionModel.__init__(self, blockifier, features, IdentityPredictor, **kwargs) - - @staticmethod - def plot(L, name, low, hi, cutoff): - ''' - Helper method to plot the document (like in the paper) - ''' - import numpy as np - import matplotlib.pyplot as plt - - # First, plot up to the low point in blue... - plt.bar(np.arange(low), [len(l) for l in L[0:low]], linewidth=0.0) - # And now from low-high in red - plt.bar( - np.arange(low, hi + 1), [len(l) for l in L[low:hi + 1]], - linewidth=0.0, color='r') - # And from then on in blue - plt.bar( - np.arange(hi + 1, len(L)), [len(l) for l in L[hi + 1:]], - linewidth=0.0) - # Lastly, apply a line across the board at the cutoff - plt.plot([0, len(L)], [cutoff, cutoff], 'g-') - - plt.xlabel('Order') - plt.ylabel('Length') - plt.title(name) - plt.show() diff --git a/test/test_arias.py b/test/test_arias.py deleted file mode 100644 index f0fa3a2..0000000 --- a/test/test_arias.py +++ /dev/null @@ -1,31 +0,0 @@ - -import unittest -import numpy as np -from scipy import percentile - -from html_for_testing import big_html_doc -from dragnet import Blockifier, Arias - - -class TestArias(unittest.TestCase): - def test_arias_model(self): - cutoff_percent = 60 - window = 2 - - a = Arias(cutoff_percent, window) - content_arias = a.analyze(big_html_doc) - - # now compute the actual content - blocks = Blockifier.blockify(big_html_doc) - actual_content_indices = [1, 2, 3] - actual_content = ' '.join([blocks[k].text for k in actual_content_indices]) - - self.assertEqual(actual_content, content_arias) - - -if __name__ == "__main__": - unittest.main() - - - -