Improve "Find similar items" #106

Open · wants to merge 6 commits into main
2 changes: 1 addition & 1 deletion forloop_modules/flog.py

@@ -283,7 +283,7 @@ def flog(message: str, class_name: str, color: LogColor = LogColor.COLOROFF, mes
     """

     if DEVELOPER_MODE:
-        header = f"{datetime.now().strftime('%H:%M:%S')} "
+        header = f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - "
         if class_name:
             header += f"{class_name}: "
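For reference, a minimal sketch of the log header this change produces when DEVELOPER_MODE is on (the class name "PipelineHandler" is a made-up example):

    from datetime import datetime

    # New format: full date and time followed by " - " (the old format was "%H:%M:%S" only)
    header = f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - "
    header += "PipelineHandler: "  # appended only when class_name is truthy
    print(header + "Pipeline started")
    # e.g. 2024-05-13 14:02:07 - PipelineHandler: Pipeline started
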
41 changes: 35 additions & 6 deletions forloop_modules/function_handlers/browser_handlers.py

@@ -1,12 +1,10 @@
 import base64
 from io import BytesIO
-from pathlib import Path
-from typing import Union, Optional
+from typing import Union

 from PIL import Image

 import forloop_modules.queries.node_context_requests_backend as ncrb
-from docrawl.errors import SpiderFunctionError
 from forloop_modules.errors.errors import CriticalPipelineError
 from forloop_modules.flog import flog
 from forloop_modules.function_handlers.auxilliary.abstract_function_handler import (

@@ -375,7 +373,7 @@ def direct_execute(
         # start_time = time.perf_counter()
         # a_time = time.perf_counter() - start_time

-        output_folder = Path.cwd() / 'tmp' / 'screenshots'
+        #output_folder = Path.cwd() / 'tmp' / 'screenshots'

         # random_number = 50000000 + random.randint(1, 10000000) # initialization

@@ -433,7 +431,7 @@ def direct_execute(
         # e_time = time.perf_counter() - start_time #12.5s
         if url:
             suh.webscraping_client.load_website(url, timeout=120)
-        elements, screenshot_base64 = suh.scan_web_page_API(output_folder, scraping_options) #9 seconds
+        elements, screenshot_base64 = suh.scan_web_page_API(scraping_options) #9 seconds

         # # Convert PNG file to WEBP
         # img = Image.open(output_folder / "website.png")

@@ -446,7 +444,38 @@
         # Convert Base64 to WEBP
         png_img_data = BytesIO(base64.b64decode(screenshot_base64))
         webp_img_data = BytesIO()
-        Image.open(png_img_data).save(webp_img_data, 'WEBP', quality=70)
+
+        def resize_screenshot_with_max_dimension(screenshot_image, max_dimension=16383) -> tuple[Image.Image, float]:
+            """
+            Resizes an image so that no dimension exceeds the specified max_dimension.
+            The other dimension is scaled proportionally.
+
+            :param screenshot_image: PIL.Image object of the screenshot
+            :param max_dimension: maximum width or height Pillow can write to WEBP
+            :return: resized PIL.Image object
+            :return: applied scaling factor (1 if no resize was needed)
+            """
+            width, height = screenshot_image.size
+
+            if width > max_dimension or height > max_dimension:
+                print(f'Current screenshot image exceeds the maximum size for .webp ({max_dimension}px).')
+                scaling_factor = min(max_dimension / width, max_dimension / height)
+
+                new_width = int(width * scaling_factor)
+                new_height = int(height * scaling_factor)
+
+                screenshot_image = screenshot_image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                print(f'Screenshot resized: {width}x{height} -> {new_width}x{new_height}.')
+            else:
+                scaling_factor = 1
+
+            return screenshot_image, scaling_factor
+
+        img = Image.open(png_img_data)
+
+        # Pillow can write at most 16383 px per side in the WEBP format
+        image_resized, scaling_factor = resize_screenshot_with_max_dimension(img)
+        image_resized.save(webp_img_data, 'WEBP', quality=70)
         webp_img_data.seek(0)
         screenshot = base64.b64encode(webp_img_data.getvalue()).decode("utf-8")
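To see the new resizing helper in isolation, here is a minimal, self-contained sketch; it assumes the helper is lifted out of direct_execute, and the 1200x20000 px image is synthetic, chosen only to exceed the 16383 px WEBP limit:

    from io import BytesIO

    from PIL import Image

    # Synthetic screenshot taller than Pillow's WEBP limit of 16383 px per side
    tall_screenshot = Image.new('RGB', (1200, 20000), color='white')

    resized, factor = resize_screenshot_with_max_dimension(tall_screenshot)
    print(resized.size, factor)  # (982, 16383) 0.81915

    # The clamped image now saves to WEBP without Pillow raising an error
    buffer = BytesIO()
    resized.save(buffer, 'WEBP', quality=70)
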
42 changes: 23 additions & 19 deletions forloop_modules/globals/scraping_utilities_handler.py

@@ -571,7 +571,9 @@ def get_generalized_xpaths(self, xpath):
         flog.warning('\n'.join(generalised_xpaths))

         results = []
-        expected_optimal = np.log(20)
+
+        # Most probable numbers of similar items (e.g. real estate listings, blog posts, table rows, ...) on a page
+        possible_optimums = [20, 50, 100]

         webpage_elements_history = []

@@ -588,14 +590,16 @@ def get_generalized_xpaths(self, xpath):
             flog.warning(f'Using XPath: {generalised_xpath}')
             flog.warning(f'Num of elements found: {num_of_elements}')

-            # How far is point from expected optimal
-            distance_from_optimal = abs(expected_optimal - np.log(num_of_elements))
-            flog.warning(f'Distance from optimal: {distance_from_optimal}')
-            results.append(distance_from_optimal)
+            # How far the point is from each expected optimum (log scale)
+            distances_from_optimums = [abs(np.log(x) - np.log(num_of_elements)) for x in possible_optimums]
+            best_distance = min(distances_from_optimums)
+            flog.warning(f'Best distance: {best_distance} from optimums: {list(zip(possible_optimums, distances_from_optimums))}')
+
+            results.append(best_distance)
             webpage_elements_history.append(webpage_elements)

-            # If it's already the best possible XPath (number of elements is 20) or very close to optimum -> break
-            if distance_from_optimal <= 0.3:
+            # If it's already the best possible XPath (the number of elements is very close to one of 20/50/100) -> break
+            if best_distance <= 0.3:
                 flog.warning('Found most probable optimum, exiting cycle')
                 break
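Because the distance is computed on a log scale, over- and undershooting an optimum by the same ratio score the same: 10 elements and 40 elements are equally far from the optimum of 20. A small sketch of the new scoring with hypothetical element counts:

    import numpy as np

    possible_optimums = [20, 50, 100]

    # Hypothetical numbers of elements matched by three candidate XPaths
    for num_of_elements in [3, 18, 400]:
        distances = [abs(np.log(x) - np.log(num_of_elements)) for x in possible_optimums]
        best_distance = min(distances)
        print(num_of_elements, round(best_distance, 3), best_distance <= 0.3)

    # 3   -> 1.897 False (too few elements, rejected)
    # 18  -> 0.105 True  (close to 20, the loop breaks here)
    # 400 -> 1.386 False (too many, the closest optimum is 100)
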

@@ -611,25 +615,25 @@ def get_generalized_xpaths(self, xpath):

         return generalised_xpaths, optimal_xpath_index

-    def scan_web_page_API(self, output_folder, scraping_options: dict):
+    def scan_web_page_API(self, scraping_options: dict):
         """
         Function only to use in "Getting Started" tutorial on web app !!!
         Combines ScanWebPage (all elements) with Cookies Detection
         """
-        def generate_folder_structure(folder_name):
-            try:
-                os.mkdir(folder_name)
-            except:
-                print("skipping - "+folder_name+" folder exists already")
-
-        generate_folder_structure("tmp")
-        generate_folder_structure("tmp/screenshots")
-        generate_folder_structure("tmp/scraped_data")
+        # def generate_folder_structure(folder_name):
+        #     try:
+        #         os.mkdir(folder_name)
+        #     except:
+        #         print("skipping - "+folder_name+" folder exists already")
+        #
+        # generate_folder_structure("tmp")
+        # generate_folder_structure("tmp/screenshots")
+        # generate_folder_structure("tmp/scraped_data")

         xpath = self.detect_cookies_xpath_preparation()

         # self.webscraping_client.take_png_screenshot(str(Path(output_folder, 'website.png'))) #needs to run before the scanner so there is enough time for the parallel thread
-        self.webscraping_client.take_screenshot()
+        #self.webscraping_client.take_screenshot()
         self.webscraping_client.scan_web_page(**scraping_options, timeout = 60) #Duration: ~3s
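If the commented-out folder setup ever needs to come back, os.makedirs with exist_ok replaces the whole helper and its bare except; a sketch, not part of this PR:

    import os

    # Creates missing parent folders and silently skips folders that already exist
    for folder in ("tmp", "tmp/screenshots", "tmp/scraped_data"):
        os.makedirs(folder, exist_ok=True)
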


@@ -651,7 +655,7 @@ def generate_folder_structure(folder_name):
             # Close cookies popup
             self.webscraping_client.click_xpath(button_xpath)
             # self.webscraping_client.take_png_screenshot(str(Path(output_folder, 'website.png'))) #TODO: The scanning finishes before the screenshot thread - need to either 1) refresh screenshot multiple times in FE (optimal), or 2) run this not in thread when cookies detected
-            self.webscraping_client.take_screenshot()
+            self.webscraping_client.take_screenshot()


         # [::-1] needed to ensure that FE rectangles are not overlapped (bigger elements do not cover smaller)
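With output_folder gone from scan_web_page_API, callers pass only the scraping options, as the browser_handlers.py hunk above already shows. A hypothetical call (the option keys are illustrative, not taken from this PR):

    scraping_options = {'incl_tables': True, 'incl_images': False}
    elements, screenshot_base64 = suh.scan_web_page_API(scraping_options)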