From ef910678c8fe1d2326ccdd5ae836890730a6c7d3 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Mon, 12 Jan 2015 17:44:43 +0200 Subject: [PATCH 1/2] Fix title extraction if title is same as site_name --- .gitignore | 2 ++ goose/extractors/title.py | 3 +++ .../title/test_title_opengraph_empty.html | 14 ++++++++++++++ .../title/test_title_opengraph_empty.json | 6 ++++++ tests/extractors/title.py | 5 +++++ 5 files changed, 30 insertions(+) create mode 100644 tests/data/extractors/title/test_title_opengraph_empty.html create mode 100644 tests/data/extractors/title/test_title_opengraph_empty.json diff --git a/.gitignore b/.gitignore index 4bfadf57..4a5b33ff 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ env/ *.egg venv/ goose_extractor.egg-info/ +.env +.directory diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 8104c52b..f253f5b8 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -51,6 +51,9 @@ def clean_title(self, title): # my wonderfull article | TechCrunch title_words = title.split() + if not title_words: + return str() + # check if first letter is in TITLE_SPLITTERS # if so remove it if title_words[0] in TITLE_SPLITTERS: diff --git a/tests/data/extractors/title/test_title_opengraph_empty.html b/tests/data/extractors/title/test_title_opengraph_empty.html new file mode 100644 index 00000000..5d270152 --- /dev/null +++ b/tests/data/extractors/title/test_title_opengraph_empty.html @@ -0,0 +1,14 @@ + + + + + Wrong article title - website + + +
+

+ TextNode 1 - The Scala-supported IDE is one of the few pain points for developers who want to start using Scala in their Java project. On an existing long-term project developed by a team, it's hard to step in and introduce a new language that is not supported by the existing IDE. One way to go about it is to hide the fact that you use Scala from the Java world by using one-way dependency injection. Still, if you wish to truly absorb Scala into your existing Java environment, then you'll soon introduce cross-language dependencies. +

+
+ + diff --git a/tests/data/extractors/title/test_title_opengraph_empty.json b/tests/data/extractors/title/test_title_opengraph_empty.json new file mode 100644 index 00000000..376fc189 --- /dev/null +++ b/tests/data/extractors/title/test_title_opengraph_empty.json @@ -0,0 +1,6 @@ +{ + "url": "http://exemple.com/test_opengraphcontent", + "expected": { + "title": "" + } +} diff --git a/tests/extractors/title.py b/tests/extractors/title.py index 36bee9a2..79d5f9c5 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -30,3 +30,8 @@ def test_title_opengraph(self): article = self.getArticle() fields = ['title'] self.runArticleAssertions(article=article, fields=fields) + + def test_title_opengraph_empty(self): + article = self.getArticle() + fields = ['title'] + self.runArticleAssertions(article=article, fields=fields) From 7575a5a73436c3de426bfc2127fe9ddabe16dbdc Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 14 Jan 2015 15:27:38 +0200 Subject: [PATCH 2/2] Clean gitignore --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index 4a5b33ff..4bfadf57 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,3 @@ env/ *.egg venv/ goose_extractor.egg-info/ -.env -.directory