change: async parse method
Signed-off-by: criamos <[email protected]>
Criamos committed Nov 29, 2023
1 parent 37a5044 commit ce6a5d8
Showing 7 changed files with 14 additions and 14 deletions.
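All seven files apply the same two-line change: the Playwright-backed callback becomes "async def", and the WebTools.getUrlData() call gains an "await", so the headless-browser round-trip no longer blocks the crawl while a page renders. Scrapy has supported coroutine callbacks natively since 2.0, so the scrapy.Request(..., callback=...) call sites stay untouched. A minimal, self-contained sketch of the pattern (the WebTools stub below is a hypothetical stand-in for the repository's helper, not its real implementation):

    import asyncio

    import scrapy


    class WebTools:
        # hypothetical stub standing in for this repository's web-tools helper;
        # the real helper drives a Playwright browser instead of sleeping
        @staticmethod
        async def getUrlData(url: str) -> dict:
            await asyncio.sleep(0)
            return {"html": "<html></html>", "screenshot_bytes": b""}


    class ExampleSpider(scrapy.Spider):
        name = "example"

        async def parse(self, response: scrapy.http.Response, **kwargs):
            # awaiting keeps the event loop free while the page renders
            url_data = await WebTools.getUrlData(response.url)
            yield {"html": url_data.get("html")}

Awaiting asyncio code inside callbacks requires the asyncio reactor (TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" in settings.py), which a Playwright-based project presumably enables already.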
4 changes: 2 additions & 2 deletions converter/spiders/kmap_spider.py
@@ -55,15 +55,15 @@ def getId(self, response=None) -> str:
     def getHash(self, response=None) -> str:
         pass
 
-    def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
+    async def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
         """
         Scrapy Contracts:
         @url https://kmap.eu/app/browser/Mathematik/Exponentialfunktionen/Asymptoten
         @returns item 1
         """
         last_modified = kwargs.get("lastModified")
-        url_data_web_tools_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+        url_data_web_tools_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
         splash_html_string = url_data_web_tools_dict.get('html')
         json_ld_string: str = Selector(text=splash_html_string).xpath('//*[@id="ld"]/text()').get()
         json_ld: dict = json.loads(json_ld_string)
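The JSON-LD block with id="ld" evidently exists only in the client-side-rendered DOM, which is why the Playwright HTML, not response.body, is fed into the Selector here. A standalone sketch of the extraction step (the sample HTML is invented):

    import json

    from scrapy import Selector

    rendered_html = '<html><script id="ld" type="application/ld+json">{"name": "Asymptoten"}</script></html>'
    json_ld_string: str = Selector(text=rendered_html).xpath('//*[@id="ld"]/text()').get()
    json_ld: dict = json.loads(json_ld_string)
    print(json_ld["name"])  # -> Asymptoten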
4 changes: 2 additions & 2 deletions converter/spiders/materialnetzwerk_spider.py
@@ -84,7 +84,7 @@ def parse_start_url(self, response: scrapy.http.Response, **kwargs):
             bundle_urls.append(current_url)
             yield scrapy.Request(url=current_url, callback=self.parse_bundle_overview)
 
-    def parse_bundle_overview(self, response: scrapy.http.Response):
+    async def parse_bundle_overview(self, response: scrapy.http.Response):
         """
         Spider Contracts:
@@ -98,7 +98,7 @@ def parse_bundle_overview(self, response: scrapy.http.Response):
         bundle_dict = dict()
         bundle_dict["bundle_url"] = response.url
         # render the web page to execute js and copy to the response
-        body = WebTools.getUrlData(response.url, WebEngine.Playwright)
+        body = await WebTools.getUrlData(response.url, WebEngine.Playwright)
         response = response.replace(body=body['html'])
 
         # a typical bundle_overview looks like this: https://editor.mnweg.org/mnw/sammlung/das-menschliche-skelett-m-78
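The hand-off in the second hunk is worth spelling out: response.replace(body=...) swaps the originally downloaded body for the Playwright-rendered HTML, so every subsequent response.xpath()/response.css() call in the method queries the JavaScript-executed DOM. A self-contained demonstration of the mechanism:

    from scrapy.http import HtmlResponse

    raw = HtmlResponse(url="https://example.org", body=b"<html><head></head></html>", encoding="utf-8")
    rendered = "<html><head><title>Rendered</title></head></html>"  # stand-in for body['html']
    response = raw.replace(body=rendered.encode("utf-8"))
    print(response.xpath("//title/text()").get())  # -> Rendered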
4 changes: 2 additions & 2 deletions converter/spiders/oersi_spider.py
@@ -756,7 +756,7 @@ def split_names_if_possible_and_add_to_lifecycle(name_string: str, lifecycle_ite
         else:
             lifecycle_item_loader.add_value("firstName", name_string)
 
-    def parse(self, response: scrapy.http.Response, **kwargs):
+    async def parse(self, response: scrapy.http.Response, **kwargs):
         elastic_item: dict = kwargs.get("elastic_item")
         elastic_item_source: dict = elastic_item.get("_source")
         # _source is the original JSON body passed for the document at index time
@@ -1080,7 +1080,7 @@ def parse(self, response: scrapy.http.Response, **kwargs):
             if not thumbnail_url:
                 # only use the headless browser if we need to take a website screenshot, otherwise skip this (expensive)
                 # part of the program flow completely
-                url_data = WebTools.getUrlData(url=response.url, engine=WebEngine.Playwright)
+                url_data = await WebTools.getUrlData(url=response.url, engine=WebEngine.Playwright)
                 if "html" in url_data:
                     response_loader.add_value("html", url_data["html"])
                 if "text" in url_data:
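The guard in the second hunk is the motivation for the whole commit in miniature: the browser round-trip is only paid when the elastic item carries no thumbnail URL of its own. A condensed, hypothetical sketch of that flow (reusing the WebTools stub from the sketch under the file list; the dict-based item is a simplification of the real response_loader):

    async def enrich_item(item: dict, url: str) -> dict:
        # skip the expensive Playwright render whenever a thumbnail already exists
        if not item.get("thumbnail_url"):
            url_data = await WebTools.getUrlData(url)
            if "html" in url_data:
                item["html"] = url_data["html"]
            if "text" in url_data:
                item["text"] = url_data["text"]
        return item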
4 changes: 2 additions & 2 deletions converter/spiders/sample_spider_alternative.py
@@ -43,10 +43,10 @@ def start_requests(self):
         for start_url in self.start_urls:
             yield scrapy.Request(url=start_url, callback=self.parse)
 
-    def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
+    async def parse(self, response: scrapy.http.Response, **kwargs) -> BaseItemLoader:
         # OPTIONAL: If you need to use playwright to crawl a website, this is how you can access the data provided
         # by Playwright's headless browser
-        playwright_dict: dict = WebTools.getUrlData(response.url, WebEngine.Playwright)
+        playwright_dict: dict = await WebTools.getUrlData(response.url, WebEngine.Playwright)
         html_body = playwright_dict.get("html")
         screenshot_bytes = playwright_dict.get("screenshot_bytes")  # to be used in base.screenshot_bytes
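For orientation, this is roughly what the awaited helper must hand back for the sample above to work: the rendered HTML plus raw screenshot bytes. A self-contained sketch against Playwright's async API (an illustration of the contract, not the repository's actual WebTools code):

    import asyncio

    from playwright.async_api import async_playwright


    async def fetch_url_data(url: str) -> dict:
        # renders the page in headless Chromium and returns the two fields
        # the sample spider reads: "html" and "screenshot_bytes"
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            await page.goto(url)
            html = await page.content()
            screenshot_bytes = await page.screenshot()
            await browser.close()
        return {"html": html, "screenshot_bytes": screenshot_bytes}


    print(asyncio.run(fetch_url_data("https://example.org")).keys())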
4 changes: 2 additions & 2 deletions converter/spiders/tutory_spider.py
@@ -269,7 +269,7 @@ def getLicense(self, response=None):
         license_loader.add_value("author", full_name)
         return license_loader
 
-    def getLOMGeneral(self, response=None):
+    async def getLOMGeneral(self, response=None):
         general = LomBase.getLOMGeneral(self, response)
         general.add_value("title", response.meta["item"]["name"])
         item_description = None
@@ -287,7 +287,7 @@ def getLOMGeneral(self, response=None):
             general.add_value("description", meta_og_description)
         else:
             # this is where the (expensive) calls to our headless browser start
-            playwright_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+            playwright_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
             playwright_html = playwright_dict["html"]
             # ToDo: if we need DOM data from Playwright in another method, move the call to Playwright into parse()
             #  and parametrize the result
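Unlike the other six files, the method made async here is not a Scrapy callback but a LomBase helper, so whichever method drives the loaders must now await it: calling a coroutine function without await returns a coroutine object, and the description would silently never be populated. A hypothetical caller-side fragment (the real call chain lives in LomBase):

    async def parse(self, response, **kwargs):
        # once getLOMGeneral is a coroutine, it has to be awaited by its caller
        general_loader = await self.getLOMGeneral(response)
        yield general_loader.load_item()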
4 changes: 2 additions & 2 deletions converter/spiders/zum_mathe_apps_spider.py
@@ -75,7 +75,7 @@ def parse_apollonian_subtopic(self, response: scrapy.http.Response):
             apollo_url = response.urljoin(apollo_url)
             yield scrapy.Request(url=apollo_url, callback=self.parse)
 
-    def parse(self, response: scrapy.http.Response, **kwargs):
+    async def parse(self, response: scrapy.http.Response, **kwargs):
         """
         Populates a BaseItemLoader with metadata and yields the BaseItem afterwards.
@@ -84,7 +84,7 @@ def parse(self, response: scrapy.http.Response, **kwargs):
         @returns items 1
         """
         # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
-        url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+        url_data_splash_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
         splash_html_string = url_data_splash_dict.get('html')
         page_end_element = Selector(text=splash_html_string).xpath('//p[@class="Ende"]').get()
         line_regex = re.compile(r'<br>')
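The visible tail of this method compiles a regex over <br>, presumably to split the dynamically rendered <p class="Ende"> element into its publication-date lines. A standalone sketch of such a split (the element's text content is invented):

    import re

    from scrapy import Selector

    rendered_html = '<p class="Ende">Erstellt: 01.02.2020<br>Letzte Änderung: 03.04.2021</p>'
    page_end_element = Selector(text=rendered_html).xpath('//p[@class="Ende"]').get()
    lines = re.compile(r'<br>').split(page_end_element)
    print(lines)  # ['<p class="Ende">Erstellt: 01.02.2020', 'Letzte Änderung: 03.04.2021</p>']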
4 changes: 2 additions & 2 deletions converter/spiders/zum_physik_apps_spider.py
@@ -54,7 +54,7 @@ def parse_topic_overview(self, response: scrapy.http.Response):
             topic_url = response.urljoin(topic_url)
             yield scrapy.Request(url=topic_url, callback=self.parse)
 
-    def parse(self, response: scrapy.http.Response, **kwargs):
+    async def parse(self, response: scrapy.http.Response, **kwargs):
         """
         Populates a BaseItemLoader with metadata and yields the individual BaseItem via BaseItemLoader.load_item()
         afterwards.
@@ -64,7 +64,7 @@ def parse(self, response: scrapy.http.Response, **kwargs):
         @returns item 1
         """
         # fetching publication date and lastModified from dynamically loaded <p class="Ende">-element:
-        url_data_splash_dict = WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
+        url_data_splash_dict = await WebTools.getUrlData(response.url, engine=WebEngine.Playwright)
         splash_html_string = url_data_splash_dict.get('html')
         page_end_element = Selector(text=splash_html_string).xpath('//p[@class="Ende"]').get()
         line_regex = re.compile(r'<br>')
