change: async parse method
Signed-off-by: criamos <[email protected]>
Criamos committed Nov 29, 2023
1 parent 37a5044 commit ce6a5d8
Showing 7 changed files with 14 additions and 14 deletions.
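All seven files apply the same two-line change: the Playwright-backed callback becomes "async def", and the WebTools.getUrlData() call gains an "await", so the headless-browser round-trip no longer blocks the crawl while a page renders. Scrapy has supported coroutine callbacks natively since 2.0, so the scrapy.Request(..., callback=...) call sites stay untouched. A minimal, self-contained sketch of the pattern (the WebTools stub below is a hypothetical stand-in for the repository's helper, not its real implementation):

    import asyncio

    import scrapy


    class WebTools:
        # hypothetical stub standing in for this repository's web-tools helper;
        # the real helper drives a Playwright browser instead of sleeping
        @staticmethod
        async def getUrlData(url: str) -> dict:
            await asyncio.sleep(0)
            return {"html": "<html></html>", "screenshot_bytes": b""}


    class ExampleSpider(scrapy.Spider):
        name = "example"

        async def parse(self, response: scrapy.http.Response, **kwargs):
            # awaiting keeps the event loop free while the page renders
            url_data = await WebTools.getUrlData(response.url)
            yield {"html": url_data.get("html")}

Awaiting asyncio code inside callbacks requires the asyncio reactor (TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" in settings.py), which a Playwright-based project presumably enables already.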
4 changes: 2 additions & 2 deletions converter/spiders/kmap_spider.py
@@ -55,15 +55,15 @@ def getId(self, response=None) -> str:
     def getHash(self, response=None) -> str:
         pass
 
-    def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
+    async def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
         """
         Scrapy Contracts:
         @url https://kmap.eu/app/browser/Mathematik/Exponentialfunktionen/Asymptoten
         @returns item 1
         """
         last_modified = kwargs.get("lastModified")
-        url_data_web_tools_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+        url_data_web_tools_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
         splash_html_string = url_data_web_tools_dict.get('html')
         json_ld_string: str = Selector(text=splash_html_string).xpath('//*[@id="ld"]/text()').get()
         json_ld: dict = json.loads(json_ld_string)
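The JSON-LD block with id="ld" evidently exists only in the client-side-rendered DOM, which is why the Playwright HTML, not response.body, is fed into the Selector here. A standalone sketch of the extraction step (the sample HTML is invented):

    import json

    from scrapy import Selector

    rendered_html = '<html><script id="ld" type="application/ld+json">{"name": "Asymptoten"}</script></html>'
    json_ld_string: str = Selector(text=rendered_html).xpath('//*[@id="ld"]/text()').get()
    json_ld: dict = json.loads(json_ld_string)
    print(json_ld["name"])  # -> Asymptoten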
4 changes: 2 additions & 2 deletions converter/spiders/materialnetzwerk_spider.py
@@ -84,7 +84,7 @@ def parse_start_url(self, response: scrapy.http.Response, **kwargs):
             bundle_urls.append(current_url)
             yield scrapy.Request(url=current_url, callback=self.parse_bundle_overview)
 
-    def parse_bundle_overview(self, response: scrapy.http.Response):
+    async def parse_bundle_overview(self, response: scrapy.http.Response):
         """
         Spider Contracts:
@@ -98,7 +98,7 @@ def parse_bundle_overview(self, response: scrapy.http.Response):
         bundle_dict = dict()
         bundle_dict["bundle_url"] = response.url
         # render the web page to execute js and copy to the response
-        body = WebTools.getUrlData(response.url, WebEngine.Playwright)
+        body = await WebTools.getUrlData(response.url, WebEngine.Playwright)
         response = response.replace(body=body['html'])
 
         # a typical bundle_overview looks like this: https://editor.mnweg.org/mnw/sammlung/das-menschliche-skelett-m-78
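The hand-off in the second hunk is worth spelling out: response.replace(body=...) swaps the originally downloaded body for the Playwright-rendered HTML, so every subsequent response.xpath()/response.css() call in the method queries the JavaScript-executed DOM. A self-contained demonstration of the mechanism:

    from scrapy.http import HtmlResponse

    raw = HtmlResponse(url="https://example.org", body=b"<html><head></head></html>", encoding="utf-8")
    rendered = "<html><head><title>Rendered</title></head></html>"  # stand-in for body['html']
    response = raw.replace(body=rendered.encode("utf-8"))
    print(response.xpath("//title/text()").get())  # -> Rendered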
4 changes: 2 additions & 2 deletions converter/spiders/oersi_spider.py
@@ -756,7 +756,7 @@ def split_names_if_possible_and_add_to_lifecycle(name_string: str, lifecycle_ite
         else:
             lifecycle_item_loader.add_value("firstName", name_string)
 
-    def parse(self, response: scrapy.http.Response, **kwargs):
+    async def parse(self, response: scrapy.http.Response, **kwargs):
         elastic_item: dict = kwargs.get("elastic_item")
         elastic_item_source: dict = elastic_item.get("_source")
         # _source is the original JSON body passed for the document at index time
@@ -1080,7 +1080,7 @@ def parse(self, response: scrapy.http.Response, **kwargs):
             if not thumbnail_url:
                 # only use the headless browser if we need to take a website screenshot, otherwise skip this (expensive)
                 # part of the program flow completely
-                url_data = WebTools.getUrlData(url=response.url, engine=WebEngine.Playwright)
+                url_data = await WebTools.getUrlData(url=response.url, engine=WebEngine.Playwright)
                 if "html" in url_data:
                     response_loader.add_value("html", url_data["html"])
                 if "text" in url_data:
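The guard in the second hunk is the motivation for the whole commit in miniature: the browser round-trip is only paid when the elastic item carries no thumbnail URL of its own. A condensed, hypothetical sketch of that flow (reusing the WebTools stub from the sketch under the file list; the dict-based item is a simplification of the real response_loader):

    async def enrich_item(item: dict, url: str) -> dict:
        # skip the expensive Playwright render whenever a thumbnail already exists
        if not item.get("thumbnail_url"):
            url_data = await WebTools.getUrlData(url)
            if "html" in url_data:
                item["html"] = url_data["html"]
            if "text" in url_data:
                item["text"] = url_data["text"]
        return item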
4 changes: 2 additions & 2 deletions converter/spiders/sample_spider_alternative.py
@@ -43,10 +43,10 @@ def start_requests(self):
         for start_url in self.start_urls:
             yield scrapy.Request(url=start_url, callback=self.parse)
 
-    def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
+    async def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
         # OPTIONAL: If you need to use playwright to crawl a website, this is how you can access the data provided
         # by Playwright's headless browser
-        playwright_dict: dict = WebTools.getUrlData(response.url, WebEngine.Playwright)
+        playwright_dict: dict = await WebTools.getUrlData(response.url, WebEngine.Playwright)
         html_body = playwright_dict.get("html")
         screenshot_bytes = playwright_dict.get("screenshot_bytes")  # to be used in base.screenshot_bytes
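For orientation, this is roughly what the awaited helper must hand back for the sample above to work: the rendered HTML plus raw screenshot bytes. A self-contained sketch against Playwright's async API (an illustration of the contract, not the repository's actual WebTools code):

    import asyncio

    from playwright.async_api import async_playwright


    async def fetch_url_data(url: str) -> dict:
        # renders the page in headless Chromium and returns the two fields
        # the sample spider reads: "html" and "screenshot_bytes"
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            await page.goto(url)
            html = await page.content()
            screenshot_bytes = await page.screenshot()
            await browser.close()
        return {"html": html, "screenshot_bytes": screenshot_bytes}


    print(asyncio.run(fetch_url_data("https://example.org")).keys())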
4 changes: 2 additions & 2 deletions converter/spiders/tutory_spider.py
@@ -269,7 +269,7 @@ def getLicense(self, response=None):
         license_loader.add_value("author", full_name)
         return license_loader
 
-    def getLOMGeneral(self, response=None):
+    async def getLOMGeneral(self, response=None):
         general = LomBase.getLOMGeneral(self, response)
         general.add_value("title", response.meta["item"]["name"])
         item_description = None
@@ -287,7 +287,7 @@ def getLOMGeneral(self, response=None):
             general.add_value("description", meta_og_description)
         else:
             # this is where the (expensive) calls to our headless browser start
-            playwright_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+            playwright_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
             playwright_html = playwright_dict["html"]
             # ToDo: if we need DOM data from Playwright in another method, move the call to Playwright into parse()
             #  and parametrize the result
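Unlike the other six files, the method made async here is not a Scrapy callback but a LomBase helper, so whichever method drives the loaders must now await it: calling a coroutine function without await returns a coroutine object, and the description would silently never be populated. A hypothetical caller-side fragment (the real call chain lives in LomBase):

    async def parse(self, response, **kwargs):
        # once getLOMGeneral is a coroutine, it has to be awaited by its caller
        general_loader = await self.getLOMGeneral(response)
        yield general_loader.load_item()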
4 changes: 2 additions & 2 deletions converter/spiders/zum_mathe_apps_spider.py
@@ -75,7 +75,7 @@ def parse_apollonian_subtopic(self, response: scrapy.http.Response):
             apollo_url = response.urljoin(apollo_url)
             yield scrapy.Request(url=apollo_url, callback=self.parse)
 
-    def parse(self, response: scrapy.http.Response, **kwargs):
+    async def parse(self, response: scrapy.http.Response, **kwargs):
         """
         Populates a BaseItemLoader with metadata and yields the BaseItem afterwards.
@@ -84,7 +84,7 @@ def parse(self, response: scrapy.http.Response, **kwargs):
         @returns items 1
         """
         # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
-        url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+        url_data_splash_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
         splash_html_string = url_data_splash_dict.get('html')
         page_end_element = Selector(text=splash_html_string).xpath('//p[@class="Ende"]').get()
         line_regex = re.compile(r'<br>')
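The visible tail of this method compiles a regex over <br>, presumably to split the dynamically rendered <p class="Ende"> element into its publication-date lines. A standalone sketch of such a split (the element's text content is invented):

    import re

    from scrapy import Selector

    rendered_html = '<p class="Ende">Erstellt: 01.02.2020<br>Letzte Änderung: 03.04.2021</p>'
    page_end_element = Selector(text=rendered_html).xpath('//p[@class="Ende"]').get()
    lines = re.compile(r'<br>').split(page_end_element)
    print(lines)  # ['<p class="Ende">Erstellt: 01.02.2020', 'Letzte Änderung: 03.04.2021</p>']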
4 changes: 2 additions & 2 deletions converter/spiders/zum_physik_apps_spider.py
@@ -54,7 +54,7 @@ def parse_topic_overview(self, response: scrapy.http.Response):
             topic_url = response.urljoin(topic_url)
             yield scrapy.Request(url=topic_url, callback=self.parse)
 
-    def parse(self, response: scrapy.http.Response, **kwargs):
+    async def parse(self, response: scrapy.http.Response, **kwargs):
         """
         Populates a BaseItemLoader with metadata and yields the individual BaseItem via BaseItemLoader.load_item()
         afterwards.
@@ -64,7 +64,7 @@ def parse(self, response: scrapy.http.Response, **kwargs):
         @returns item 1
         """
         # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
-        url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+        url_data_splash_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
         splash_html_string = url_data_splash_dict.get('html')
         page_end_element = Selector(text=splash_html_string).xpath('//p[@class="Ende"]').get()
         line_regex = re.compile(r'<br>')
