From 7217a462ee94ae90cfea5248fd788d55c9476d06 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Tue, 13 Jan 2015 12:27:33 +0200 Subject: [PATCH 1/3] Add old title cleaning method --- goose/extractors/title.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 8104c52b..e1fd6c57 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -64,6 +64,21 @@ def clean_title(self, title): # rebuild the title title = u" ".join(title_words).strip() + # Check if clean was bad, then apply old clean method + title_pieces = tuple() + for splitter in TITLE_SPLITTERS: + if splitter in title: + title_pieces = title.split(splitter) + break + + largest_text_length = 0 + # find the largest title piece + for current in title_pieces: + current_len = len(current) + if current_len > largest_text_length: + largest_text_length = current_len + title = current + return title def get_title(self): From 5aba30f211921b4e455d47d25bfd6523c8421e81 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Tue, 13 Jan 2015 12:56:09 +0200 Subject: [PATCH 2/3] Small fix --- goose/extractors/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index e1fd6c57..29eb3652 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -79,7 +79,7 @@ def clean_title(self, title): largest_text_length = current_len title = current - return title + return title.strip().rstrip() def get_title(self): """\ From 924a199bbccd58d9f0c74ee3100b2bcafe756abc Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 14 Jan 2015 14:15:49 +0200 Subject: [PATCH 3/3] Fix tests --- tests/data/extractors/content/test_allnewlyrics1.json | 2 +- tests/data/extractors/content/test_businessWeek1.json | 2 +- tests/data/extractors/content/test_cnn1.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data/extractors/content/test_allnewlyrics1.json b/tests/data/extractors/content/test_allnewlyrics1.json index 53cd1cf8..7e013690 100644 --- a/tests/data/extractors/content/test_allnewlyrics1.json +++ b/tests/data/extractors/content/test_allnewlyrics1.json @@ -10,7 +10,7 @@ "PJ Morton", "Stevie Wonder" ], - "title": "\u201cOnly One\u201d Lyrics : PJ Morton (Ft. Stevie Wonder)", + "title": "PJ Morton (Ft. Stevie Wonder)", "meta_favicon": "", "meta_lang": "en" } diff --git a/tests/data/extractors/content/test_businessWeek1.json b/tests/data/extractors/content/test_businessWeek1.json index f6918584..0db59c3d 100644 --- a/tests/data/extractors/content/test_businessWeek1.json +++ b/tests/data/extractors/content/test_businessWeek1.json @@ -6,7 +6,7 @@ "final_url": "http://www.businessweek.com/magazine/content/10_34/b4192066630779.htm", "meta_keywords": "Olivia Munn, Attack of the Show, Jon Stewart, Daily Show, G4", "cleaned_text": "Six years ago, Olivia Munn arrived in Hollywood with fading ambitions of making it as a sports reporter and set about deploying", - "title": "Olivia Munn: Queen of the Uncool", + "title": "Queen of the Uncool", "meta_favicon": "", "meta_lang": "en" } diff --git a/tests/data/extractors/content/test_cnn1.json b/tests/data/extractors/content/test_cnn1.json index ced9eb91..280fd31b 100644 --- a/tests/data/extractors/content/test_cnn1.json +++ b/tests/data/extractors/content/test_cnn1.json @@ -6,7 +6,7 @@ "final_url": "http://www.cnn.com/2010/POLITICS/08/13/democrats.social.security/index.html", "meta_keywords": "", "cleaned_text": "Washington (CNN) -- Democrats pledged ", - "title": "Democrats to use Social Security against GOP this fall - CNN.com", + "title": "Democrats to use Social Security against GOP this fall", "meta_favicon": "http://i.cdn.turner.com/cnn/.element/img/3.0/global/misc/apple-touch-icon.png", "meta_lang": "en" }