Merge pull request #279 from Medno/fix_wrong_type
Encode empty "html_title" if it's not found
fhamborg authored Jul 10, 2024
2 parents 623ea12 + 7404aa7 commit 04dca08
Showing 1 changed file with 56 additions and 46 deletions.
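
For context: before this change, article["html_title"] was a bytes value when the page had a <title> tag but a plain str ('') when it did not, so downstream consumers received an inconsistent type. A minimal sketch of the before/after behavior (simplified stand-in functions for illustration, not repository code):

# Sketch of the type inconsistency fixed by this commit (illustrative only).

# Before: bytes when a <title> exists, str when it does not.
def html_title_before(extracted_title):
    return extracted_title.encode("utf-8") if extracted_title is not None else ''

# After: always bytes, because the empty fallback is encoded too.
def html_title_after(extracted_title):
    if extracted_title is None:
        extracted_title = ""
    return extracted_title.encode("utf-8")

assert type(html_title_before("News")) is bytes  # b'News'
assert type(html_title_before(None)) is str      # ''  <- the wrong type
assert type(html_title_after(None)) is bytes     # b'' <- consistent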
102 changes: 56 additions & 46 deletions newsplease/helper_classes/parse_crawler.py
@@ -1,20 +1,24 @@
 """
 This is a helper class for the crawler's parse methods
 """
 
 import logging
 import re
 import time
+from typing import Optional
 
 import scrapy
+from scrapy.http import Response
 
 # to improve performance, regex statements are compiled only once per module
-re_html = re.compile('text/html')
+re_html = re.compile("text/html")
 
 
 class ParseCrawler(object):
     """
     Helper class for the crawler's parse methods.
     """
 
     helper = None
     log = None
 
@@ -23,11 +27,11 @@ def __init__(self, helper):
         self.log = logging.getLogger(__name__)
 
     def pass_to_pipeline_if_article(
-            self,
-            response,
-            source_domain,
-            original_url,
-            rss_title=None
+        self,
+        response: Response,
+        source_domain: str,
+        original_url: str,
+        rss_title: Optional[str] = None,
     ):
         """
         Responsible for passing a NewscrawlerItem to the pipeline if the
@@ -40,51 +44,53 @@ def pass_to_pipeline_if_article(
         :return NewscrawlerItem: NewscrawlerItem to pass to the pipeline
         """
         if self.helper.heuristics.is_article(response, original_url):
-            return self.pass_to_pipeline(
-                response, source_domain, rss_title=None)
+            return self.pass_to_pipeline(response, source_domain, rss_title=rss_title)
 
     def pass_to_pipeline(
-            self,
-            response,
-            source_domain,
-            rss_title=None
+        self, response: Response, source_domain: str, rss_title: Optional[str] = None
     ):
-        timestamp = time.strftime('%Y-%m-%d %H:%M:%S',
-                                  time.gmtime(time.time()))
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
 
-        relative_local_path = self.helper.savepath_parser \
-            .get_savepath(response.url)
+        relative_local_path = self.helper.savepath_parser.get_savepath(response.url)
 
         # Instantiate the crawler item class defined in the configuration
         article = self.helper.crawler_item_class()
-        article['local_path'] = self.helper.savepath_parser \
-            .get_formatted_relative_path(relative_local_path)
-        article['filename'] = self.helper.savepath_parser.get_filename(article['local_path'])
-        article['abs_local_path'] = self.helper.savepath_parser \
-            .get_abs_path(relative_local_path)
-        article['modified_date'] = timestamp
-        article['download_date'] = timestamp
-        article['source_domain'] = source_domain.encode("utf-8")
-        article['url'] = response.url
-        extracted_title = response.selector.xpath('//title/text()').extract_first()
-        article['html_title'] = extracted_title.encode("utf-8") if extracted_title is not None else ''
+        article["local_path"] = self.helper.savepath_parser.get_formatted_relative_path(
+            relative_local_path
+        )
+        article["filename"] = self.helper.savepath_parser.get_filename(
+            article["local_path"]
+        )
+        article["abs_local_path"] = self.helper.savepath_parser.get_abs_path(
+            relative_local_path
+        )
+        article["modified_date"] = timestamp
+        article["download_date"] = timestamp
+        article["source_domain"] = source_domain.encode("utf-8")
+        article["url"] = response.url
+        extracted_title = response.selector.xpath("//title/text()").extract_first()
+        if extracted_title is None:
+            extracted_title = ""
+        article["html_title"] = extracted_title.encode("utf-8")
 
         if rss_title is None:
-            article['rss_title'] = 'NULL'
+            article["rss_title"] = "NULL"
         else:
-            article['rss_title'] = rss_title.encode("utf-8")
-        article['spider_response'] = response
-        article['article_title'] = 'NULL'
-        article['article_description'] = 'NULL'
-        article['article_text'] = 'NULL'
-        article['article_image'] = 'NULL'
-        article['article_author'] = 'NULL'
-        article['article_publish_date'] = 'NULL'
-        article['article_language'] = 'NULL'
+            article["rss_title"] = rss_title
+        article["spider_response"] = response
+        article["article_title"] = "NULL"
+        article["article_description"] = "NULL"
+        article["article_text"] = "NULL"
+        article["article_image"] = "NULL"
+        article["article_author"] = "NULL"
+        article["article_publish_date"] = "NULL"
+        article["article_language"] = "NULL"
         return article
 
     @staticmethod
-    def recursive_requests(response, spider, ignore_regex='',
-                           ignore_file_extensions='pdf'):
+    def recursive_requests(
+        response, spider, ignore_regex="", ignore_file_extensions="pdf"
+    ):
         """
         Manages recursive requests.
         Determines urls to recursivly crawl if they do not match certain file
@@ -103,10 +109,13 @@ def recursive_requests(response, spider, ignore_regex='',
         # or contain any of the given ignore_regex regexes
         return [
            scrapy.Request(response.urljoin(href), callback=spider.parse)
-            for href in response.css("a::attr('href')").extract() if re.match(
-                r'.*\.' + ignore_file_extensions +
-                r'$', response.urljoin(href), re.IGNORECASE
-            ) is None
+            for href in response.css("a::attr('href')").extract()
+            if re.match(
+                r".*\." + ignore_file_extensions + r"$",
+                response.urljoin(href),
+                re.IGNORECASE,
+            )
+            is None
             and len(re.match(ignore_regex, response.urljoin(href)).group(0)) == 0
         ]
 
@@ -117,10 +126,11 @@ def content_type(self, response):
         :param obj response: The scrapy response
         :return bool: Determines wether the response is of the correct type
         """
-        if not re_html.match(response.headers.get('Content-Type').decode('utf-8')):
+        if not re_html.match(response.headers.get("Content-Type").decode("utf-8")):
             self.log.warn(
-                "Dropped: %s's content is not of type "
-                "text/html but %s", response.url, response.headers.get('Content-Type')
+                "Dropped: %s's content is not of type " "text/html but %s",
+                response.url,
+                response.headers.get("Content-Type"),
             )
             return False
         else:
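
The diff also fixes pass_to_pipeline_if_article, which previously hard-coded rss_title=None in its delegated call and so silently dropped the caller's RSS title; it now forwards the argument. A minimal sketch of that forwarding fix (simplified stand-ins for illustration, not repository code):

# Sketch of the argument-forwarding fix (illustrative only).
def pass_to_pipeline(response, source_domain, rss_title=None):
    return rss_title

# Before: the caller's rss_title was discarded.
def if_article_before(response, source_domain, original_url, rss_title=None):
    return pass_to_pipeline(response, source_domain, rss_title=None)

# After: the caller's rss_title is forwarded.
def if_article_after(response, source_domain, original_url, rss_title=None):
    return pass_to_pipeline(response, source_domain, rss_title=rss_title)

assert if_article_before(None, "example.com", "url", rss_title="Feed title") is None
assert if_article_after(None, "example.com", "url", rss_title="Feed title") == "Feed title"

The remaining hunks are formatting-only: double-quoted strings, black-style line wrapping, and the new type annotations on the two pipeline methods.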