Skip to content

Commit

Permalink
Merge pull request #34 from bdewilde/pep8-and-py23
Browse files Browse the repository at this point in the history
Stylistic cleanup and (start of) Py2/3 compatibility
  • Loading branch information
matt-peters authored Dec 23, 2016
2 parents e8800d3 + 5ecf283 commit fd59e21
Show file tree
Hide file tree
Showing 17 changed files with 427 additions and 416 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
*.pyc
*.so
build/*
dist/*
dragnet.egg-info

# cython temporary files
Expand All @@ -22,4 +23,3 @@ output/*
*.swp

.vagrant

23 changes: 10 additions & 13 deletions dragnet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
#! /usr/bin/env python

from .arias import AriasFeatures, Arias
from .blocks import Blockifier, PartialBlock, BlockifyError
from .features import NormalizedFeature, CSSFeatures
from .content_extraction_model import ContentExtractionModel
from .kohlschuetter import kohlschuetter_features, kohlschuetter
from .util import evaluation_metrics
from .weninger import weninger_features_kmeans
from .readability import readability_features
from .models import content_extractor, content_comments_extractor
from dragnet.arias import AriasFeatures, Arias
from dragnet.blocks import Blockifier, PartialBlock, BlockifyError
from dragnet.features import NormalizedFeature, CSSFeatures
from dragnet.content_extraction_model import ContentExtractionModel
from dragnet.kohlschuetter import kohlschuetter_features, kohlschuetter
from dragnet.util import evaluation_metrics
from dragnet.weninger import weninger_features_kmeans
from dragnet.readability import readability_features
from dragnet.models import content_extractor, content_comments_extractor


class AllFeatures(object):
"""Easy access to feature instances.
We need a way to get instances of the feature classes.
Since these classes are potentially mutated by clients,
we create a new instance on each access"""
Expand All @@ -32,4 +30,3 @@ def get(key, *args, **kwargs):
return NormalizedFeature(readability_features)
else:
raise KeyError

29 changes: 15 additions & 14 deletions dragnet/arias.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#! /usr/bin/env python

# A /rough/ implementation of the approach described by Arias et al.:
# https://lirias.kuleuven.be/bitstream/123456789/215528/1/AriasEtAl2009.pdf

"""
A *rough* implementation of the approach described by Arias et al.:
https://lirias.kuleuven.be/bitstream/123456789/215528/1/AriasEtAl2009.pdf
"""
from .blocks import Blockifier
from .content_extraction_model import ContentExtractionModel, IdentityPredictor

import numpy as np
import scipy.weave


class AriasFeatures(object):
"""A global feature based on connected blocks of long text
inspired by Arias"""
Expand All @@ -24,7 +25,6 @@ def __init__(self, percent_cutoff, window):
self._percent_cutoff = percent_cutoff
self._window = window


def __call__(self, blocks, train=False):
from scipy import percentile
features = np.zeros((len(blocks), AriasFeatures.nfeatures))
Expand All @@ -36,7 +36,6 @@ def __call__(self, blocks, train=False):
features[lowindex:(highindex + 1), 0] = 1.0
return features


@staticmethod
def strip(block_lengths, index, window, cutoff):
"""Strip a list of blocks down to the content.
Expand Down Expand Up @@ -83,9 +82,10 @@ def strip(block_lengths, index, window, cutoff):
}
ret(1) = lastindex;
"""
scipy.weave.inline(c_code,
['ret', 'nblock', 'index', 'window', 'cutoff', 'block_lengths'],
type_converters=scipy.weave.converters.blitz)
scipy.weave.inline(
c_code,
['ret', 'nblock', 'index', 'window', 'cutoff', 'block_lengths'],
type_converters=scipy.weave.converters.blitz)
return ret


Expand All @@ -108,18 +108,19 @@ def plot(L, name, low, hi, cutoff):
import matplotlib.pyplot as plt

# First, plot up to the low point in blue...
p1 = plt.bar(np.arange(low), [len(l) for l in L[0:low]], linewidth=0.0)
plt.bar(np.arange(low), [len(l) for l in L[0:low]], linewidth=0.0)
# And now from low-high in red
p2 = plt.bar(np.arange(low, hi+1), [len(l) for l in L[low:hi+1]],
plt.bar(
np.arange(low, hi + 1), [len(l) for l in L[low:hi + 1]],
linewidth=0.0, color='r')
# And from then on in blue
p3 = plt.bar(np.arange(hi+1, len(L)), [len(l) for l in L[hi+1:]],
plt.bar(
np.arange(hi + 1, len(L)), [len(l) for l in L[hi + 1:]],
linewidth=0.0)
# Lastly, apply a line across the board at the cutoff
line = plt.plot([0, len(L)], [cutoff, cutoff], 'g-')
plt.plot([0, len(L)], [cutoff, cutoff], 'g-')

plt.xlabel('Order')
plt.ylabel('Length')
plt.title(name)
plt.show()

41 changes: 18 additions & 23 deletions dragnet/blocks.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -51,23 +51,21 @@ cdef inline int int_min(int a, int b): return a if a <= b else b

# tags we'll ignore completely
cdef cpp_set[string] BLACKLIST
BLACKLIST = set([
'applet', 'area', 'base', 'basefont', 'bdo', 'button',
'caption', 'fieldset', 'fram', 'frameset',
'iframe', 'img', 'input', 'legend', 'link', 'menu', 'meta',
'noframes', 'noscript', 'object', 'optgroup', 'option', 'param',
'script', 'select', 'style', 'textarea', 'var', 'xmp',
'like', 'like-box', 'plusone',
BLACKLIST = {
b'applet', b'area', b'base', b'basefont', b'bdo', b'button',
b'caption', b'fieldset', b'fram', b'frameset',
b'iframe', b'img', b'input', b'legend', b'link', b'menu', b'meta',
b'noframes', b'noscript', b'object', b'optgroup', b'option', b'param',
b'script', b'select', b'style', b'textarea', b'var', b'xmp',
b'like', b'like-box', b'plusone',
# HTML5 vector image tags and math tags
'svg', 'math'
])
b'svg', b'math'
}


# tags defining the blocks we'll extract
cdef cpp_set[string] BLOCKS
BLOCKS = set([
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'table', 'map',
])
BLOCKS = {b'h1', b'h2', b'h3', b'h4', b'h5', b'h6', b'p', b'div', b'table', b'map'}

# define some commonly used strings here, otherwise Cython will always add
# a little python overhead when using them even though they are constant
Expand All @@ -87,14 +85,13 @@ re_readability_positive = re.compile('article|body|content|entry|hentry|main|pag
cdef string DIV = <string>'div'

cdef cpp_set[string] READABILITY_PLUS3
READABILITY_PLUS3 = set(["pre", "td", "blockquote"])
READABILITY_PLUS3 = {b'pre', b'td', b'blockquote'}

cdef cpp_set[string] READABILITY_MINUS3
READABILITY_MINUS3 = set(
["address", "ol", "ul", "dl", "dd", "dt", "li", "form"])
READABILITY_MINUS3 = {b'address', b'ol', b'ul', b'dl', b'dd', b'dt', b'li', b'form'}

cdef cpp_set[string] READABILITY_MINUS5
READABILITY_MINUS5 = set(["h1", "h2", "h3", "h4", "h5", "h6", "th"])
READABILITY_MINUS5 = {b'h1', b'h2', b'h3', b'h4', b'h5', b'h6', b'th'}


cdef cpp_set[char] WHITESPACE = set([<char>' ', <char>'\t', <char>'\n',
Expand Down Expand Up @@ -571,7 +568,7 @@ cdef class PartialBlock:
# finally store it
self.class_weights.push_back(pair[uint32_t, int](self.tag_id, weight))
self.class_weights_written.insert(self.tag_id)

cdef void reinit_readability(self):
self.ancestors_write = self.ancestors

Expand Down Expand Up @@ -695,12 +692,12 @@ cdef class TagCountPB(PartialBlock):
# Since we don't output empty blocks, we also keep track of the
# tag count since the last block we output as an additional feature
#

# _tc = tag count in the current block, since the last <div>, <p>, etc.
# _tc_lb = tag count since last block. This is the tag count in prior
# empty blocks, accumulated since the last block was output, excluding
# the current block

# so tc gets updated with each tag
# tc is reset on block formation, even for empty blocks
#
Expand Down Expand Up @@ -776,7 +773,7 @@ xml_re = re.compile('<\?\s*xml[^>]*encoding\s*=\s*"{0,1}\s*([a-zA-Z0-9-]+)', re.
def guess_encoding(s, default='utf-8'):
"""Try to guess the encoding of s -- check the XML declaration
and the HTML meta tag
if default=CHARDET then use chardet to guess the default"""
mo = xml_re.search(s)
if mo:
Expand Down Expand Up @@ -820,7 +817,7 @@ class Blockifier(object):
partial_block.add_block_to_results(results)

return results


@staticmethod
def blockify(s, encoding=None,
Expand Down Expand Up @@ -878,5 +875,3 @@ class TagCountNoCSSReadabilityBlockifier(Blockifier):
return Blockifier.blockify(s, encoding, pb=TagCountPB,
do_css=False, do_readability=True,
parse_callback=parse_callback)


14 changes: 14 additions & 0 deletions dragnet/compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import sys

# Interpreter major-version flag. ``sys.version_info`` is the structured,
# documented way to query the version; indexing the free-form ``sys.version``
# string inspects only its first character and would misreport any major
# version written with more than one digit.
PY2 = sys.version_info[0] == 2

# Version-neutral aliases, bound once at import time:
#   range_   -- lazy integer range (``xrange`` on Py2, ``range`` on Py3)
#   bytes_   -- the byte-string type
#   unicode_ -- the text (unicode) string type
#   string_  -- tuple of both string types, usable with ``isinstance``
if PY2:
    # ``xrange`` and ``unicode`` exist only on Python 2; this branch is never
    # executed on Python 3, so the undefined names are harmless there.
    range_ = xrange  # noqa: F821
    bytes_ = str
    unicode_ = unicode  # noqa: F821
    string_ = (str, unicode)  # noqa: F821
else:
    range_ = range
    bytes_ = bytes
    unicode_ = str
    string_ = (bytes, str)
36 changes: 19 additions & 17 deletions dragnet/content_extraction_model.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import re
import numpy as np
from .blocks import Blockifier


class IdentityPredictor(object):
"""Mock out the machine learning model with an identity model."""
@staticmethod
Expand All @@ -15,6 +14,7 @@ def predict(x):
def fit(*args, **kargs):
pass


class BaselinePredictor(object):
"""Always predict content"""
@staticmethod
Expand All @@ -25,16 +25,19 @@ def predict(x):
def fit(*args, **kwargs):
pass


def nofeatures(blocks, *args, **kwargs):
    """Return an all-zero feature matrix for ``blocks``.

    The result has one row per element of ``blocks`` and a single column.
    Any extra positional or keyword arguments are accepted and ignored.
    """
    row_count = len(blocks)
    zero_features = np.zeros((row_count, 1))
    return zero_features


# Width of the matrix returned by ``nofeatures`` (a single zero column).
nofeatures.nfeatures = 1


class ContentExtractionModel(object):
"""Content extraction model
Encapsulates a blockifier, some feature generators and a
machine learning block model
Implements analyze, make_features"""

def __init__(self, blockifier, features, block_model, threshold=0.5):
Expand All @@ -55,7 +58,7 @@ def __init__(self, blockifier, features, block_model, threshold=0.5):

def set_threshold(self, thres):
"""Set the threshold
0<= thres <= 1.0"""
self._threshold = thres

Expand All @@ -68,8 +71,8 @@ def analyze(self, s, blocks=False, encoding=None, parse_callback=None):
If None, then try to guess it.
"""
# blockify html into blocks
blocks_ = self._blockifier.blockify(s, encoding=encoding,
parse_callback=parse_callback)
blocks_ = self._blockifier.blockify(
s, encoding=encoding, parse_callback=parse_callback)

# make features, run model and return content
return self.analyze_from_blocks(blocks_, return_blocks=blocks)
Expand Down Expand Up @@ -112,13 +115,13 @@ def make_features(self, s, train=False, encoding=None, parse_callback=None):
raises BlockifyError if there is an error parsing the doc
and None if doc is too short (< 3 blocks)
train = if true, then passes it into feature maker
"""
# note: this method is no longer needed by ContentExtractionModel
# but is kept for now for backward compatibility with training code
blocks = self._blockifier.blockify(s, encoding=encoding,
parse_callback=parse_callback)
blocks = self._blockifier.blockify(
s, encoding=encoding, parse_callback=parse_callback)
return self.make_features_from_blocks(blocks, train), blocks

@staticmethod
Expand All @@ -133,20 +136,19 @@ def plot(blocks, content_mask):
block_lengths_no_content = block_lengths.copy()
block_lengths_no_content[content_mask] = 0.0

ret = plt.bar(np.arange(len(blocks)), block_lengths_no_content, 0.5)
ret = plt.bar(np.arange(len(blocks)), block_lengths_content, 0.5)
plt.bar(np.arange(len(blocks)), block_lengths_no_content, 0.5)
plt.bar(np.arange(len(blocks)), block_lengths_content, 0.5)

fig.show()


class ContentCommentsExtractionModel(ContentExtractionModel):
'''
"""
Run two models: a content only and a content + comments model
on a document and return the output of both
'''
"""
def __init__(self, blockifier, features,
content_model, content_comments_model, threshold=0.5):

content_model, content_comments_model, threshold=0.5):
self._blockifier = blockifier
self._features = features
self._content_model = content_model
Expand Down Expand Up @@ -203,7 +205,8 @@ def analyze_from_blocks(self, blocks_, return_blocks=False):
return (
' '.join(blk.text for blk in blocks_content),
' '.join(blk.text for blk in blocks_content_comments)
)
)


class SklearnWrapper(object):
def __init__(self, skmodel):
Expand All @@ -216,4 +219,3 @@ def predict(self, X):
return self._skmodel.predict_proba(X)[:, self._positive_idx]

baseline_model = ContentExtractionModel(Blockifier, [nofeatures], BaselinePredictor)

Loading

0 comments on commit fd59e21

Please sign in to comment.