Merge pull request #279 from Medno/fix_wrong_type
Encode empty "html_title" if it's not found
fhamborg authored Jul 10, 2024
2 parents 623ea12 + 7404aa7 commit 04dca08
Showing 1 changed file with 56 additions and 46 deletions.
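
For context: before this change, article["html_title"] was a bytes value when the page had a <title> tag but a plain str ('') when it did not, so downstream consumers received an inconsistent type. A minimal sketch of the before/after behavior (simplified stand-in functions for illustration, not repository code):

# Sketch of the type inconsistency fixed by this commit (illustrative only).

# Before: bytes when a <title> exists, str when it does not.
def html_title_before(extracted_title):
    return extracted_title.encode("utf-8") if extracted_title is not None else ''

# After: always bytes, because the empty fallback is encoded too.
def html_title_after(extracted_title):
    if extracted_title is None:
        extracted_title = ""
    return extracted_title.encode("utf-8")

assert type(html_title_before("News")) is bytes  # b'News'
assert type(html_title_before(None)) is str      # ''  <- the wrong type
assert type(html_title_after(None)) is bytes     # b'' <- consistent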
102 changes: 56 additions & 46 deletions newsplease/helper_classes/parse_crawler.py
@@ -1,20 +1,24 @@
 """
 This is a helper class for the crawler's parse methods
 """
 
 import logging
 import re
 import time
+from typing import Optional
 
 import scrapy
+from scrapy.http import Response
 
 # to improve performance, regex statements are compiled only once per module
-re_html = re.compile('text/html')
+re_html = re.compile("text/html")
 
 
 class ParseCrawler(object):
     """
     Helper class for the crawler's parse methods.
     """
 
     helper = None
     log = None
 
@@ -23,11 +27,11 @@ def __init__(self, helper):
         self.log = logging.getLogger(__name__)
 
     def pass_to_pipeline_if_article(
-            self,
-            response,
-            source_domain,
-            original_url,
-            rss_title=None
+        self,
+        response: Response,
+        source_domain: str,
+        original_url: str,
+        rss_title: Optional[str] = None,
     ):
         """
         Responsible for passing a NewscrawlerItem to the pipeline if the
@@ -40,51 +44,53 @@ def pass_to_pipeline_if_article(
         :return NewscrawlerItem: NewscrawlerItem to pass to the pipeline
         """
         if self.helper.heuristics.is_article(response, original_url):
-            return self.pass_to_pipeline(
-                response, source_domain, rss_title=None)
+            return self.pass_to_pipeline(response, source_domain, rss_title=rss_title)
 
     def pass_to_pipeline(
-            self,
-            response,
-            source_domain,
-            rss_title=None
+        self, response: Response, source_domain: str, rss_title: Optional[str] = None
     ):
-        timestamp = time.strftime('%Y-%m-%d %H:%M:%S',
-                                  time.gmtime(time.time()))
+        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time()))
 
-        relative_local_path = self.helper.savepath_parser \
-            .get_savepath(response.url)
+        relative_local_path = self.helper.savepath_parser.get_savepath(response.url)
 
         # Instantiate the crawler item class defined in the configuration
         article = self.helper.crawler_item_class()
-        article['local_path'] = self.helper.savepath_parser \
-            .get_formatted_relative_path(relative_local_path)
-        article['filename'] = self.helper.savepath_parser.get_filename(article['local_path'])
-        article['abs_local_path'] = self.helper.savepath_parser \
-            .get_abs_path(relative_local_path)
-        article['modified_date'] = timestamp
-        article['download_date'] = timestamp
-        article['source_domain'] = source_domain.encode("utf-8")
-        article['url'] = response.url
-        extracted_title = response.selector.xpath('//title/text()').extract_first()
-        article['html_title'] = extracted_title.encode("utf-8") if extracted_title is not None else ''
+        article["local_path"] = self.helper.savepath_parser.get_formatted_relative_path(
+            relative_local_path
+        )
+        article["filename"] = self.helper.savepath_parser.get_filename(
+            article["local_path"]
+        )
+        article["abs_local_path"] = self.helper.savepath_parser.get_abs_path(
+            relative_local_path
+        )
+        article["modified_date"] = timestamp
+        article["download_date"] = timestamp
+        article["source_domain"] = source_domain.encode("utf-8")
+        article["url"] = response.url
+        extracted_title = response.selector.xpath("//title/text()").extract_first()
+        if extracted_title is None:
+            extracted_title = ""
+        article["html_title"] = extracted_title.encode("utf-8")
 
         if rss_title is None:
-            article['rss_title'] = 'NULL'
+            article["rss_title"] = "NULL"
         else:
-            article['rss_title'] = rss_title.encode("utf-8")
-        article['spider_response'] = response
-        article['article_title'] = 'NULL'
-        article['article_description'] = 'NULL'
-        article['article_text'] = 'NULL'
-        article['article_image'] = 'NULL'
-        article['article_author'] = 'NULL'
-        article['article_publish_date'] = 'NULL'
-        article['article_language'] = 'NULL'
+            article["rss_title"] = rss_title
+        article["spider_response"] = response
+        article["article_title"] = "NULL"
+        article["article_description"] = "NULL"
+        article["article_text"] = "NULL"
+        article["article_image"] = "NULL"
+        article["article_author"] = "NULL"
+        article["article_publish_date"] = "NULL"
+        article["article_language"] = "NULL"
         return article
 
     @staticmethod
-    def recursive_requests(response, spider, ignore_regex='',
-                           ignore_file_extensions='pdf'):
+    def recursive_requests(
+        response, spider, ignore_regex="", ignore_file_extensions="pdf"
+    ):
         """
         Manages recursive requests.
         Determines urls to recursivly crawl if they do not match certain file
@@ -103,10 +109,13 @@ def recursive_requests(response, spider, ignore_regex='',
         # or contain any of the given ignore_regex regexes
         return [
            scrapy.Request(response.urljoin(href), callback=spider.parse)
-            for href in response.css("a::attr('href')").extract() if re.match(
-                r'.*\.' + ignore_file_extensions +
-                r'$', response.urljoin(href), re.IGNORECASE
-            ) is None
+            for href in response.css("a::attr('href')").extract()
+            if re.match(
+                r".*\." + ignore_file_extensions + r"$",
+                response.urljoin(href),
+                re.IGNORECASE,
+            )
+            is None
             and len(re.match(ignore_regex, response.urljoin(href)).group(0)) == 0
         ]
 
@@ -117,10 +126,11 @@ def content_type(self, response):
         :param obj response: The scrapy response
         :return bool: Determines wether the response is of the correct type
         """
-        if not re_html.match(response.headers.get('Content-Type').decode('utf-8')):
+        if not re_html.match(response.headers.get("Content-Type").decode("utf-8")):
             self.log.warn(
-                "Dropped: %s's content is not of type "
-                "text/html but %s", response.url, response.headers.get('Content-Type')
+                "Dropped: %s's content is not of type " "text/html but %s",
+                response.url,
+                response.headers.get("Content-Type"),
             )
             return False
         else:
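
The diff also fixes pass_to_pipeline_if_article, which previously hard-coded rss_title=None in its delegated call and so silently dropped the caller's RSS title; it now forwards the argument. A minimal sketch of that forwarding fix (simplified stand-ins for illustration, not repository code):

# Sketch of the argument-forwarding fix (illustrative only).
def pass_to_pipeline(response, source_domain, rss_title=None):
    return rss_title

# Before: the caller's rss_title was discarded.
def if_article_before(response, source_domain, original_url, rss_title=None):
    return pass_to_pipeline(response, source_domain, rss_title=None)

# After: the caller's rss_title is forwarded.
def if_article_after(response, source_domain, original_url, rss_title=None):
    return pass_to_pipeline(response, source_domain, rss_title=rss_title)

assert if_article_before(None, "example.com", "url", rss_title="Feed title") is None
assert if_article_after(None, "example.com", "url", rss_title="Feed title") == "Feed title"

The remaining hunks are formatting-only: double-quoted strings, black-style line wrapping, and the new type annotations on the two pipeline methods.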