Merge pull request #34 from bdewilde/pep8-and-py23

Stylistic cleanup and (start of) Py2/3 compatibility
dragnet-org · Dec 23, 2016 · fd59e21 · fd59e21
2 parents e8800d3 + 5ecf283
commit fd59e21
Show file tree

Hide file tree

Showing 17 changed files with 427 additions and 416 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 *.pyc
 *.so
 build/*
+dist/*
 dragnet.egg-info
 
 # cython temporary files
@@ -22,4 +23,3 @@ output/*
 *.swp
 
 .vagrant
-
diff --git a/dragnet/__init__.py b/dragnet/__init__.py
@@ -1,19 +1,17 @@
-#! /usr/bin/env python
-
-from .arias import AriasFeatures, Arias
-from .blocks import Blockifier, PartialBlock, BlockifyError
-from .features import NormalizedFeature, CSSFeatures
-from .content_extraction_model import ContentExtractionModel
-from .kohlschuetter import kohlschuetter_features, kohlschuetter
-from .util import evaluation_metrics
-from .weninger import weninger_features_kmeans
-from .readability import readability_features
-from .models import content_extractor, content_comments_extractor
+from dragnet.arias import AriasFeatures, Arias
+from dragnet.blocks import Blockifier, PartialBlock, BlockifyError
+from dragnet.features import NormalizedFeature, CSSFeatures
+from dragnet.content_extraction_model import ContentExtractionModel
+from dragnet.kohlschuetter import kohlschuetter_features, kohlschuetter
+from dragnet.util import evaluation_metrics
+from dragnet.weninger import weninger_features_kmeans
+from dragnet.readability import readability_features
+from dragnet.models import content_extractor, content_comments_extractor
 
 
 class AllFeatures(object):
     """Easy access to feature instances.
-    
+
     We need a way to get instances of the feature classes.
     Since these classes are potentially mutated by clients,
     we create a new instance on each access"""
@@ -32,4 +30,3 @@ def get(key, *args, **kwargs):
             return NormalizedFeature(readability_features)
         else:
             raise KeyError
-
diff --git a/dragnet/arias.py b/dragnet/arias.py
@@ -1,14 +1,15 @@
 #! /usr/bin/env python
-
-# A /rough/ implementation of that described by Aurias et al.:
-#    https://lirias.kuleuven.be/bitstream/123456789/215528/1/AriasEtAl2009.pdf
-
+"""
+A *rough* implementation of that described by Aurias et al.:
+https://lirias.kuleuven.be/bitstream/123456789/215528/1/AriasEtAl2009.pdf
+"""
 from .blocks import Blockifier
 from .content_extraction_model import ContentExtractionModel, IdentityPredictor
 
 import numpy as np
 import scipy.weave
 
+
 class AriasFeatures(object):
     """A global feature based on connected blocks of long text
       inspired by Arias"""
@@ -24,7 +25,6 @@ def __init__(self, percent_cutoff, window):
         self._percent_cutoff = percent_cutoff
         self._window = window
 
-
     def __call__(self, blocks, train=False):
         from scipy import percentile
         features = np.zeros((len(blocks), AriasFeatures.nfeatures))
@@ -36,7 +36,6 @@ def __call__(self, blocks, train=False):
         features[lowindex:(highindex + 1), 0] = 1.0
         return features
 
-
     @staticmethod
     def strip(block_lengths, index, window, cutoff):
         """Strip a list of blocks down to the content.
@@ -83,9 +82,10 @@ def strip(block_lengths, index, window, cutoff):
             }
             ret(1) = lastindex;
         """
-        scipy.weave.inline(c_code,
-                ['ret', 'nblock', 'index', 'window', 'cutoff', 'block_lengths'],
-                type_converters=scipy.weave.converters.blitz)
+        scipy.weave.inline(
+            c_code,
+            ['ret', 'nblock', 'index', 'window', 'cutoff', 'block_lengths'],
+            type_converters=scipy.weave.converters.blitz)
         return ret
 
 
@@ -108,18 +108,19 @@ def plot(L, name, low, hi, cutoff):
         import matplotlib.pyplot as plt
 
         # First, plot up to the low point in blue...
-        p1 = plt.bar(np.arange(low), [len(l) for l in L[0:low]], linewidth=0.0)
+        plt.bar(np.arange(low), [len(l) for l in L[0:low]], linewidth=0.0)
         # And now from low-high in red
-        p2 = plt.bar(np.arange(low, hi+1), [len(l) for l in L[low:hi+1]],
+        plt.bar(
+            np.arange(low, hi + 1), [len(l) for l in L[low:hi + 1]],
             linewidth=0.0, color='r')
         # And from then on in blue
-        p3 = plt.bar(np.arange(hi+1, len(L)), [len(l) for l in L[hi+1:]],
+        plt.bar(
+            np.arange(hi + 1, len(L)), [len(l) for l in L[hi + 1:]],
             linewidth=0.0)
         # Lastly, apply a line across the board at the cutoff
-        line = plt.plot([0, len(L)], [cutoff, cutoff], 'g-')
+        plt.plot([0, len(L)], [cutoff, cutoff], 'g-')
 
         plt.xlabel('Order')
         plt.ylabel('Length')
         plt.title(name)
         plt.show()
-
diff --git a/dragnet/blocks.pyx b/dragnet/blocks.pyx
@@ -51,23 +51,21 @@ cdef inline int int_min(int a, int b): return a if a <= b else b
 
 # tags we'll ignore completely
 cdef cpp_set[string] BLACKLIST
-BLACKLIST = set([
-    'applet', 'area', 'base', 'basefont', 'bdo', 'button', 
-    'caption', 'fieldset', 'fram', 'frameset', 
-    'iframe', 'img', 'input', 'legend', 'link', 'menu', 'meta', 
-    'noframes', 'noscript', 'object', 'optgroup', 'option', 'param', 
-    'script', 'select', 'style', 'textarea', 'var', 'xmp',
-    'like', 'like-box', 'plusone',
+BLACKLIST = {
+    b'applet', b'area', b'base', b'basefont', b'bdo', b'button',
+    b'caption', b'fieldset', b'fram', b'frameset',
+    b'iframe', b'img', b'input', b'legend', b'link', b'menu', b'meta',
+    b'noframes', b'noscript', b'object', b'optgroup', b'option', b'param',
+    b'script', b'select', b'style', b'textarea', b'var', b'xmp',
+    b'like', b'like-box', b'plusone',
     # HTML5 vector image tags and math tags
-    'svg', 'math'
-])
+    b'svg', b'math'
+    }
 
 
 # tags defining the blocks we'll extract
 cdef cpp_set[string] BLOCKS
-BLOCKS = set([
-    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'table', 'map',
-])
+BLOCKS = {b'h1', b'h2', b'h3', b'h4', b'h5', b'h6', b'p', b'div', b'table', b'map'}
 
 # define some commonly used strings here, otherwise Cython will always add
 # a little python overhead when using them even though they are constat
@@ -87,14 +85,13 @@ re_readability_positive = re.compile('article|body|content|entry|hentry|main|pag
 cdef string DIV = <string>'div'
 
 cdef cpp_set[string] READABILITY_PLUS3
-READABILITY_PLUS3 = set(["pre", "td", "blockquote"])
+READABILITY_PLUS3 = {b'pre', b'td', b'blockquote'}
 
 cdef cpp_set[string] READABILITY_MINUS3
-READABILITY_MINUS3 = set(
-    ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"])
+READABILITY_MINUS3 = {b'address', b'ol', b'ul', b'dl', b'dd', b'dt', b'li', b'form'}
 
 cdef cpp_set[string] READABILITY_MINUS5
-READABILITY_MINUS5 = set(["h1", "h2", "h3", "h4", "h5", "h6", "th"])
+READABILITY_MINUS5 = {b'h1', b'h2', b'h3', b'h4', b'h5', b'h6', b'th'}
 
 
 cdef cpp_set[char] WHITESPACE = set([<char>' ', <char>'\t', <char>'\n',
@@ -571,7 +568,7 @@ cdef class PartialBlock:
         # finally store it
         self.class_weights.push_back(pair[uint32_t, int](self.tag_id, weight))
         self.class_weights_written.insert(self.tag_id)
-    
+
     cdef void reinit_readability(self):
         self.ancestors_write = self.ancestors
 
@@ -695,12 +692,12 @@ cdef class TagCountPB(PartialBlock):
     # Since we don't output empty blocks, we also keep track of the
     # tag count since the last block we output as an additional feature
     #
-    
+
     # _tc = tag count in the current block, since the last <div>, <p>, etc.
     # _tc_lb = tag count since last block.  This is the tag count in prior
     # empty blocks, accumulated since the last block was output, excluding
     # the current block
-    
+
     # so tc gets updated with each tag
     # tc is reset on block formation, even for empty blocks
     #
@@ -776,7 +773,7 @@ xml_re = re.compile('<\?\s*xml[^>]*encoding\s*=\s*"{0,1}\s*([a-zA-Z0-9-]+)', re.
 def guess_encoding(s, default='utf-8'):
     """Try to guess the encoding of s -- check the XML declaration
     and the HTML meta tag
-    
+
     if default=CHARDET then use chardet to guess the default"""
     mo = xml_re.search(s)
     if mo:
@@ -820,7 +817,7 @@ class Blockifier(object):
         partial_block.add_block_to_results(results)
 
         return results
-    
+
 
     @staticmethod
     def blockify(s, encoding=None,
@@ -878,5 +875,3 @@ class TagCountNoCSSReadabilityBlockifier(Blockifier):
         return Blockifier.blockify(s, encoding, pb=TagCountPB,
             do_css=False, do_readability=True,
             parse_callback=parse_callback)
-
-
diff --git a/dragnet/compat.py b/dragnet/compat.py
@@ -0,0 +1,14 @@
+import sys
+
+PY2 = int(sys.version[0]) == 2
+
+if PY2:
+    range_ = xrange
+    bytes_ = str
+    unicode_ = unicode
+    string_ = (str, unicode)
+else:
+    range_ = range
+    bytes_ = bytes
+    unicode_ = str
+    string_ = (bytes, str)
diff --git a/dragnet/content_extraction_model.py b/dragnet/content_extraction_model.py
@@ -1,10 +1,9 @@
 #! /usr/bin/env python
 # -*- coding: utf-8 -*-
-
-import re
 import numpy as np
 from .blocks import Blockifier
 
+
 class IdentityPredictor(object):
     """Mock out the machine learning model with an identity model."""
     @staticmethod
@@ -15,6 +14,7 @@ def predict(x):
     def fit(*args, **kargs):
         pass
 
+
 class BaselinePredictor(object):
     """Always predict content"""
     @staticmethod
@@ -25,16 +25,19 @@ def predict(x):
     def fit(*args, **kwargs):
         pass
 
+
 def nofeatures(blocks, *args, **kwargs):
     return np.zeros((len(blocks), 1))
+
 nofeatures.nfeatures = 1
 
+
 class ContentExtractionModel(object):
     """Content extraction model
 
     Encapsulates a blockifier, some feature generators and a
     machine learing block model
-    
+
     Implements analyze, make_features"""
 
     def __init__(self, blockifier, features, block_model, threshold=0.5):
@@ -55,7 +58,7 @@ def __init__(self, blockifier, features, block_model, threshold=0.5):
 
     def set_threshold(self, thres):
         """Set the threshold
-        
+
         0<= thres <= 1.0"""
         self._threshold = thres
 
@@ -68,8 +71,8 @@ def analyze(self, s, blocks=False, encoding=None, parse_callback=None):
             If None, then try to guess it.
         """
         # blockify html into blocks
-        blocks_ = self._blockifier.blockify(s, encoding=encoding,
-            parse_callback=parse_callback)
+        blocks_ = self._blockifier.blockify(
+            s, encoding=encoding, parse_callback=parse_callback)
 
         # make features, run model and return content
         return self.analyze_from_blocks(blocks_, return_blocks=blocks)
@@ -112,13 +115,13 @@ def make_features(self, s, train=False, encoding=None, parse_callback=None):
 
            raises BlockifyError if there is an error parsing the doc
            and None if doc is too short (< 3 blocks)
-           
+
            train = if true, then passes it into feature maker
         """
         # note: this method is not longer needed by ContentExtractionModel
         # but is kept for now for backward compatibilty with training code
-        blocks = self._blockifier.blockify(s, encoding=encoding,
-            parse_callback=parse_callback)
+        blocks = self._blockifier.blockify(
+            s, encoding=encoding, parse_callback=parse_callback)
         return self.make_features_from_blocks(blocks, train), blocks
 
     @staticmethod
@@ -133,20 +136,19 @@ def plot(blocks, content_mask):
         block_lengths_no_content = block_lengths.copy()
         block_lengths_no_content[content_mask] = 0.0
 
-        ret = plt.bar(np.arange(len(blocks)), block_lengths_no_content, 0.5)
-        ret = plt.bar(np.arange(len(blocks)), block_lengths_content, 0.5)
+        plt.bar(np.arange(len(blocks)), block_lengths_no_content, 0.5)
+        plt.bar(np.arange(len(blocks)), block_lengths_content, 0.5)
 
         fig.show()
 
 
 class ContentCommentsExtractionModel(ContentExtractionModel):
-    '''
+    """
     Run two models: a content only and a content + comments model
     on a document and return the output of both
-    '''
+    """
     def __init__(self, blockifier, features,
-        content_model, content_comments_model, threshold=0.5):
-
+                 content_model, content_comments_model, threshold=0.5):
         self._blockifier = blockifier
         self._features = features
         self._content_model = content_model
@@ -203,7 +205,8 @@ def analyze_from_blocks(self, blocks_, return_blocks=False):
             return (
                 ' '.join(blk.text for blk in blocks_content),
                 ' '.join(blk.text for blk in blocks_content_comments)
-            )
+                )
+
 
 class SklearnWrapper(object):
     def __init__(self, skmodel):
@@ -216,4 +219,3 @@ def predict(self, X):
         return self._skmodel.predict_proba(X)[:, self._positive_idx]
 
 baseline_model = ContentExtractionModel(Blockifier, [nofeatures], BaselinePredictor)
-
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,7 @@ @@
     *.pyc
     *.so
     build/*
+    dist/*
     dragnet.egg-info
     # cython temporary files
@@ Expand All / @@ -22,4 +23,3 @@ output/* @@
     *.swp
     .vagrant