From a383c56c34131915189ce8a26db668846c5e4c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=B1=9F=E4=BC=9F?= Date: Tue, 20 Mar 2018 20:24:49 +0800 Subject: [PATCH] fix-bug: missing-pick-sentences --- .gitignore | 1 + breadability/readable.py | 4 ++-- breadability/scoring.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index d1e5761..68ae303 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ lib/ local/ man/ share/ +.idea/ diff --git a/breadability/readable.py b/breadability/readable.py index 1b83ae8..3883a24 100644 --- a/breadability/readable.py +++ b/breadability/readable.py @@ -21,7 +21,6 @@ ) from .utils import cached_property, shrink_text - html_cleaner = Cleaner( scripts=True, javascript=True, comments=True, style=True, links=True, meta=False, add_nofollow=False, @@ -30,7 +29,6 @@ annoying_tags=False, remove_tags=None, kill_tags=("noscript", "iframe"), remove_unknown_tags=False, safe_attrs_only=False) - SCORABLE_TAGS = ("div", "p", "td", "pre", "article") ANNOTATION_TAGS = ( "a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite", @@ -193,6 +191,7 @@ def clean_document(node): logger.debug("Dropping <%s>, it's insignificant", n.tag) to_drop.append(n) + """ modified by guojw # drop block element without content and children if n.tag in ("div", "p"): text_content = shrink_text(n.text_content()) @@ -204,6 +203,7 @@ def clean_document(node): # finally try out the conditional cleaning of the target node if clean_conditionally(n): to_drop.append(n) + """ drop_nodes_with_parents(to_drop) diff --git a/breadability/scoring.py b/breadability/scoring.py index a042af2..795de37 100644 --- a/breadability/scoring.py +++ b/breadability/scoring.py @@ -13,7 +13,6 @@ from ._compat import to_bytes from .utils import normalize_whitespace - # A series of sets of attributes we check to help in determining if a node is # a potential candidate or not. CLS_UNLIKELY = re.compile( @@ -140,7 +139,9 @@ def is_unlikely_node(node): def score_candidates(nodes): """Given a list of potential nodes, find some initial scores to start""" - MIN_HIT_LENTH = 25 + # guojw + # MIN_HIT_LENTH = 25 + MIN_HIT_LENTH = 1 candidates = {} for node in nodes: