Skip to content

Commit

Permalink
fix-bug: missing-pick-sentences
Browse files: browse the repository at this point in the history
  • Loading branch information
guojiangwei2 committed Mar 20, 2018
1 parent d912366 commit a383c56
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ lib/
local/
man/
share/
.idea/
4 changes: 2 additions & 2 deletions breadability/readable.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
)
from .utils import cached_property, shrink_text


html_cleaner = Cleaner(
scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
Expand All @@ -30,7 +29,6 @@
annoying_tags=False, remove_tags=None, kill_tags=("noscript", "iframe"),
remove_unknown_tags=False, safe_attrs_only=False)


SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
ANNOTATION_TAGS = (
"a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite",
Expand Down Expand Up @@ -193,6 +191,7 @@ def clean_document(node):
logger.debug("Dropping <%s>, it's insignificant", n.tag)
to_drop.append(n)

""" modified by guojw
# drop block element without content and children
if n.tag in ("div", "p"):
text_content = shrink_text(n.text_content())
Expand All @@ -204,6 +203,7 @@ def clean_document(node):
# finally try out the conditional cleaning of the target node
if clean_conditionally(n):
to_drop.append(n)
"""

drop_nodes_with_parents(to_drop)

Expand Down
5 changes: 3 additions & 2 deletions breadability/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from ._compat import to_bytes
from .utils import normalize_whitespace


# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = re.compile(
Expand Down Expand Up @@ -140,7 +139,9 @@ def is_unlikely_node(node):

def score_candidates(nodes):
"""Given a list of potential nodes, find some initial scores to start"""
MIN_HIT_LENTH = 25
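# guojw: lowered the minimum hit length from 25 to 1, presumably so that
# short sentences are no longer skipped (see commit title).
# Previous value: MIN_HIT_LENTH = 25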
MIN_HIT_LENTH = 1
candidates = {}

for node in nodes:
Expand Down

1 comment on commit a383c56

@guojiangwei2
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.