diff --git a/forloop_modules/flog.py b/forloop_modules/flog.py
index fefa4fc..c97c9b1 100644
--- a/forloop_modules/flog.py
+++ b/forloop_modules/flog.py
@@ -283,7 +283,7 @@ def flog(message: str, class_name: str, color: LogColor = LogColor.COLOROFF, mes
     """
     if DEVELOPER_MODE:
-        header = f"{datetime.now().strftime('%H:%M:%S')} "
+        header = f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - "
 
         if class_name:
             header += f"{class_name}: "
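This hunk changes the developer-mode log header from a bare time to a full date-time with a separator. A minimal illustration of the two format strings (the timestamp value is hypothetical):

    from datetime import datetime

    now = datetime(2024, 5, 17, 9, 3, 27)                   # hypothetical timestamp
    old_header = f"{now.strftime('%H:%M:%S')} "             # "09:03:27 "
    new_header = f"{now.strftime('%Y-%m-%d %H:%M:%S')} - "  # "2024-05-17 09:03:27 - "

Including the date makes log lines unambiguous in long-running sessions that cross midnight.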
diff --git a/forloop_modules/function_handlers/browser_handlers.py b/forloop_modules/function_handlers/browser_handlers.py
index fdaac22..ab3bd5a 100644
--- a/forloop_modules/function_handlers/browser_handlers.py
+++ b/forloop_modules/function_handlers/browser_handlers.py
@@ -1,12 +1,10 @@
 import base64
 from io import BytesIO
-from pathlib import Path
-from typing import Union, Optional
+from typing import Union
 
 from PIL import Image
 
 import forloop_modules.queries.node_context_requests_backend as ncrb
-from docrawl.errors import SpiderFunctionError
 from forloop_modules.errors.errors import CriticalPipelineError
 from forloop_modules.flog import flog
 from forloop_modules.function_handlers.auxilliary.abstract_function_handler import (
@@ -375,7 +373,7 @@ def direct_execute(
         # start_time = time.perf_counter()
         # a_time = time.perf_counter() - start_time
 
-        output_folder = Path.cwd() / 'tmp' / 'screenshots'
+        #output_folder = Path.cwd() / 'tmp' / 'screenshots'
         # random_number = 50000000 + random.randint(1, 10000000)
 
         # initialization
@@ -433,7 +431,7 @@ def direct_execute(
         # e_time = time.perf_counter() - start_time #12.5s
         if url:
             suh.webscraping_client.load_website(url, timeout=120)
-        elements, screenshot_base64 = suh.scan_web_page_API(output_folder, scraping_options) #9 seconds
+        elements, screenshot_base64 = suh.scan_web_page_API(scraping_options) #9 seconds
 
         # # Convert PNG file to WEBP
         # img = Image.open(output_folder / "website.png")
@@ -446,7 +444,38 @@ def direct_execute(
         # Convert Base64 to WEBP
         png_img_data = BytesIO(base64.b64decode(screenshot_base64))
         webp_img_data = BytesIO()
-        Image.open(png_img_data).save(webp_img_data, 'WEBP', quality=70)
+
+        def resize_screenshot_with_max_dimension(screenshot_image, max_dimension=16383) -> tuple[Image.Image, float]:
+            """
+            Resizes an image so that no dimension exceeds the specified max_dimension.
+            The other dimension is scaled proportionally.
+
+            :param screenshot_image: PIL.Image object of the screenshot
+            :param max_dimension: maximum allowed width or height for WEBP in the Pillow lib
+            :return: resized PIL.Image object
+            :return: scaling factor used
+            """
+            width, height = screenshot_image.size
+
+            if width > max_dimension or height > max_dimension:
+                print('Current screenshot image exceeds maximum size for .webp (16383px).')
+                scaling_factor = min(max_dimension / width, max_dimension / height)
+
+                new_width = int(width * scaling_factor)
+                new_height = int(height * scaling_factor)
+
+                screenshot_image = screenshot_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                print(f'Screenshot resized: {width}x{height} -> {new_width}x{new_height}.')
+            else:
+                scaling_factor = 1.0
+
+            return screenshot_image, scaling_factor
+
+        img = Image.open(png_img_data)
+
+        # Pillow can save images of at most 16383 px per side in WEBP format
+        image_resized, scaling_factor = resize_screenshot_with_max_dimension(img)
+        image_resized.save(webp_img_data, 'WEBP', quality=70)
         webp_img_data.seek(0)
 
         screenshot = base64.b64encode(webp_img_data.getvalue()).decode("utf-8")
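The 16383 px cap is Pillow's hard per-side limit for the WEBP encoder; full-page screenshots of long pages routinely exceed it in height, and Image.save(..., 'WEBP') fails on them. A condensed, self-contained sketch of the conversion pipeline added above (the helper name is illustrative, not part of the codebase):

    import base64
    from io import BytesIO
    from PIL import Image

    def png_b64_to_webp_b64(png_b64: str, max_dim: int = 16383, quality: int = 70) -> str:
        """Decode a base64 PNG, clamp it to Pillow's WEBP size limit, re-encode as base64 WEBP."""
        img = Image.open(BytesIO(base64.b64decode(png_b64)))
        width, height = img.size
        if width > max_dim or height > max_dim:
            # Shrink proportionally so neither side exceeds the limit
            factor = min(max_dim / width, max_dim / height)
            img = img.resize((int(width * factor), int(height * factor)), Image.Resampling.LANCZOS)
        out = BytesIO()
        img.save(out, 'WEBP', quality=quality)
        return base64.b64encode(out.getvalue()).decode('utf-8')

Returning the scaling factor from resize_screenshot_with_max_dimension (unused in the lines shown) presumably lets callers map element coordinates from the original page onto the downscaled screenshot.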
diff --git a/forloop_modules/globals/scraping_utilities_handler.py b/forloop_modules/globals/scraping_utilities_handler.py
index 647380f..fb780ea 100644
--- a/forloop_modules/globals/scraping_utilities_handler.py
+++ b/forloop_modules/globals/scraping_utilities_handler.py
@@ -571,7 +571,9 @@ def get_generalized_xpaths(self, xpath):
         flog.warning('\n'.join(generalised_xpaths))
 
         results = []
-        expected_optimal = np.log(20)
+
+        # Most probable numbers of similar items (e.g. real estate listings, blog posts, table rows, ...) on a page
+        possible_optimums = [20, 50, 100]
 
         webpage_elements_history = []
 
@@ -588,14 +590,16 @@ def get_generalized_xpaths(self, xpath):
             flog.warning(f'Using XPath: {generalised_xpath}')
             flog.warning(f'Num of elements found: {num_of_elements}')
 
-            # How far is point from expected optimal
-            distance_from_optimal = abs(expected_optimal - np.log(num_of_elements))
-            flog.warning(f'Distance from optimal: {distance_from_optimal}')
-            results.append(distance_from_optimal)
+            # How far the point is from each expected optimum
+            distances_from_optimums = [abs(np.log(x) - np.log(num_of_elements)) for x in possible_optimums]
+            best_distance = min(distances_from_optimums)
+            flog.warning(f'Best distance: {best_distance} from optimums: {list(zip(possible_optimums, distances_from_optimums))}')
+
+            results.append(best_distance)
 
             webpage_elements_history.append(webpage_elements)
 
-            # If it's already the best possible XPath (number of elements is 20) or very close to optimum -> break
-            if distance_from_optimal <= 0.3:
+            # If it's already the best possible XPath (number of elements is very close to one of 20/50/100) -> break
+            if best_distance <= 0.3:
                 flog.warning('Found most probable optimum, exiting cycle')
                 break
 
@@ -611,25 +615,25 @@ def get_generalized_xpaths(self, xpath):
 
         return generalised_xpaths, optimal_xpath_index
 
-    def scan_web_page_API(self, output_folder, scraping_options: dict):
+    def scan_web_page_API(self, scraping_options: dict):
         """
         Function only to use in "Getting Started" tutorial on web app !!!
         Combines ScanWebPage (all elements) with Cookies Detection
         """
-        def generate_folder_structure(folder_name):
-            try:
-                os.mkdir(folder_name)
-            except:
-                print("skipping - "+folder_name+" folder exists already")
-
-        generate_folder_structure("tmp")
-        generate_folder_structure("tmp/screenshots")
-        generate_folder_structure("tmp/scraped_data")
+        # def generate_folder_structure(folder_name):
+        #     try:
+        #         os.mkdir(folder_name)
+        #     except:
+        #         print("skipping - "+folder_name+" folder exists already")
+        #
+        # generate_folder_structure("tmp")
+        # generate_folder_structure("tmp/screenshots")
+        # generate_folder_structure("tmp/scraped_data")
 
         xpath = self.detect_cookies_xpath_preparation()
 
         # self.webscraping_client.take_png_screenshot(str(Path(output_folder, 'website.png'))) #needs to run before the scanner so there is enough time for the parallel thread
-        self.webscraping_client.take_screenshot()
+        #self.webscraping_client.take_screenshot()
 
         self.webscraping_client.scan_web_page(**scraping_options, timeout = 60) #Duration: ~3s
 
@@ -651,7 +655,7 @@ def generate_folder_structure(folder_name):
             # Close cookies popup
             self.webscraping_client.click_xpath(button_xpath)
             # self.webscraping_client.take_png_screenshot(str(Path(output_folder, 'website.png'))) #TODO: The scanning finishes before the screenshot thread - need to either 1) refresh screenshot multiple times in FE (optimal), or 2) run this not in thread when cookies detected
-            self.webscraping_client.take_screenshot()
+            self.webscraping_client.take_screenshot()
 
         # [::-1] needed to ensure that FE rectangles are not overlapped (bigger elements do not cover smaller)
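To make the get_generalized_xpaths heuristic concrete: distances are taken in log space, so overshooting an expected count by a factor k costs the same as undershooting by k, and the 0.3 threshold accepts any element count within a factor of e^0.3 ≈ 1.35 of one of 20/50/100. A small self-contained sketch (the function name is illustrative):

    import numpy as np

    POSSIBLE_OPTIMUMS = [20, 50, 100]

    def best_log_distance(num_of_elements: int) -> float:
        """Smallest log-space distance from num_of_elements to any expected optimum."""
        return min(abs(np.log(opt) - np.log(num_of_elements)) for opt in POSSIBLE_OPTIMUMS)

    assert best_log_distance(18) <= 0.3   # |ln 20 - ln 18| ≈ 0.105 -> accepted, loop breaks early
    assert best_log_distance(7) > 0.3     # |ln 20 - ln 7| ≈ 1.05 -> keep searching

Compared to the old single optimum of 20, the candidate list avoids penalising generalized XPaths that legitimately match 50 or 100 similar items, such as long listing pages or table rows.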