Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dom object validation & fixes #190

Merged
merged 1 commit into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/scripts/structure_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def validate(self) -> None:
if file.is_dir():
raise InvalidStructureException(f"{self._FULL_IMAGES_DIR} can only contain images, {file} is invalid!")
file_extension: str = file.name.split(".")[-1].lower()
# TODO: validate the real MIME type of the file instead of trusting its extension.
if file_extension not in ("svg", "png", "jpg", "jpeg"):
raise InvalidStructureException(f"{self._FULL_IMAGES_DIR} can only contain image formatted files, {file} is invalid!")

Expand Down
80 changes: 53 additions & 27 deletions .github/workflows/scripts/technology_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def __init__(self, msg: str):
super().__init__(msg)


class InvalidKeyException(Exception):
    """Exception type for reporting an invalid key.

    NOTE(review): no raise site for this class is visible in this excerpt;
    the purpose above is taken from the class name only.
    """

    def __init__(self, msg: str):
        # The message is handed to Exception unchanged, so str(exc) == msg.
        super().__init__(msg)


class ImageNotFoundException(Exception):
    """Error reported when a technology references an icon that is not known."""

    def __init__(self, msg: str):
        # The message is handed to Exception unchanged, so str(exc) == msg.
        super().__init__(msg)
Expand Down Expand Up @@ -94,8 +99,7 @@ def get_type(self) -> list[Type]:

class PricingValidator(AbstractValidator):
def _validate(self, tech_name: str, data: Any) -> bool:
type_validator: bool = super()._validate(tech_name, data)
if not type_validator:
if not super()._validate(tech_name, data):
return False
for price in data:
if price not in ("low", "mid", "high", "freemium", "poa", "payg", "onetime", "recurring"):
Expand All @@ -107,23 +111,16 @@ def get_type(self) -> list[Type]:
return [list]


class BoolValidator(AbstractValidator):
def get_type(self) -> list[Type]:
return [bool]


class RegexValidator(abc.ABC, AbstractValidator):
def __init__(self, contains_regex: bool = False):
    """Create the validator.

    :param contains_regex: when True, ``_validate`` additionally runs
        ``_validate_regex`` on the data after the parent's check passes.
    """
    super().__init__()
    self._contains_regex = contains_regex

def _validate(self, tech_name: str, data: Any) -> bool:
    """Run the parent validation and, when enabled, the regex validation.

    :param tech_name: name of the technology being validated.
    :param data: value of the field under validation.
    :return: False as soon as the parent check fails, or when regex checking
        is enabled (``self._contains_regex``) and ``_validate_regex`` rejects
        the data; True otherwise.
    """
    if not super()._validate(tech_name, data):
        return False
    # Regex checking is opt-in per validator instance.
    if self._contains_regex and not self._validate_regex(tech_name, data):
        return False
    return True

Expand All @@ -136,17 +133,20 @@ def _validate_regex(self, tech_name: str, data: Any) -> bool:
return False
elif type(data) is dict:
for _, val in data.items():
valid: bool = self._validate_regex(tech_name, val)
if not valid:
if not self._validate_regex(tech_name, val):
return False
elif type(data) is list:
for item in data:
valid: bool = self._validate_regex(tech_name, item)
if not valid:
if not self._validate_regex(tech_name, item):
return False
return True


class BoolValidator(AbstractValidator):
    """Validator whose only accepted type is ``bool``."""

    def get_type(self) -> list[Type]:
        """Return the list of accepted types: ``[bool]``."""
        return [bool]


class ArrayValidator(RegexValidator):
def get_type(self) -> list[Type]:
return [list]
Expand All @@ -163,8 +163,7 @@ def __init__(self, categories: list[int], required: bool = False):
self._categories: Final[list[int]] = categories

def _validate(self, tech_name: str, data: Any) -> bool:
type_validator: bool = super()._validate(tech_name, data)
if not type_validator:
if not super()._validate(tech_name, data):
return False
for category_id in data:
if category_id not in self._categories:
Expand All @@ -173,14 +172,44 @@ def _validate(self, tech_name: str, data: Any) -> bool:
return True


class DomValidator(AbstractValidator):
class DomValidator(RegexValidator):
def _validate(self, tech_name: str, data: Any) -> bool:
    """Validate the ``dom`` field of a technology.

    Two shapes are accepted:

    * a list of selector strings; only the part before the first ``\\;`` of
      each entry is parsed as a selector;
    * a dict mapping a selector to an object whose keys are limited to
      ``attributes``, ``properties``, ``text`` and ``exists``.

    Every selector is passed to ``select()`` on an empty document purely to
    have it parsed. NOTE(review): this relies on Beautiful Soup raising on a
    malformed selector; such an exception is not caught here - confirm that
    the caller expects it.

    :param tech_name: name of the technology, used in error messages.
    :param data: value of the ``dom`` field.
    :return: True when the structure is valid. False otherwise; except when
        ``data`` is neither a list nor a dict, a custom error describing the
        problem is registered through ``_set_custom_error`` (directly or by
        ``_validate_regex``).
    """
    # One empty document is enough: select() is only used to parse selectors.
    soup = BeautifulSoup("", "html.parser")
    if isinstance(data, list):
        for element in data:
            # A non-string entry has no .split(); report it instead of crashing.
            if not isinstance(element, str):
                self._set_custom_error(InvalidTypeForFieldException(f"Invalid type for dom in tech '{tech_name}', selector must be string but {type(element).__name__} was found!"))
                return False
            soup.select(element.split(r"\;")[0])
        return True
    if isinstance(data, dict):
        for selector, spec in data.items():
            soup.select(selector)
            if not isinstance(spec, dict):
                self._set_custom_error(InvalidTypeForFieldException(f"Invalid type for dom in tech '{tech_name}' object is required inside the selector!"))
                return False
            for key, val in spec.items():
                if not self._validate_dom_entry(tech_name, selector, key, val):
                    return False
        return True
    return False

def _validate_dom_entry(self, tech_name: str, selector: str, key: str, val: Any) -> bool:
    """Validate one ``key: val`` pair found inside a selector object."""
    if key in ("attributes", "properties"):
        return self._validate_dom_mapping(tech_name, selector, key, val)
    if key == "text":
        # A non-string text pattern cannot be checked as a regex; reject it.
        if not isinstance(val, str):
            self._set_custom_error(InvalidTypeForFieldException(f"Invalid type for dom in tech '{tech_name}', selector '{selector}' string is required inside '{key}' but {type(val).__name__} was found!"))
            return False
        if not self._validate_regex(tech_name, val):
            return False
        return True
    if key == "exists":
        # Guard the .split() below against non-string values.
        if not isinstance(val, str):
            self._set_custom_error(InvalidTypeForFieldException(f"Invalid type for dom in tech '{tech_name}', selector '{selector}' string is required inside '{key}' but {type(val).__name__} was found!"))
            return False
        # Only the part before the first "\;" has to be empty.
        if val.split(r"\;")[0] != "":
            self._set_custom_error(InvalidTypeForFieldException(f"Invalid value for dom in tech '{tech_name}', selector '{selector}' empty string is required inside '{key}' but {val} was found!"))
            return False
        return True
    self._set_custom_error(UnknownFieldsException(f"Invalid key for tech '{tech_name}' (attributes, text, properties, exists) are required but '{key}' was found inside of the {selector} selector!"))
    return False

def _validate_dom_mapping(self, tech_name: str, selector: str, key: str, val: Any) -> bool:
    """Validate the object stored under ``attributes`` or ``properties``."""
    if not isinstance(val, dict):
        self._set_custom_error(InvalidTypeForFieldException(f"Invalid type for dom in tech '{tech_name}', selector '{selector}' object is required inside '{key}' but {type(val).__name__} was found!"))
        return False
    for attr_name, attr_val in val.items():
        if not isinstance(attr_name, str):
            self._set_custom_error(InvalidTypeForFieldException(f"Invalid type for dom in tech '{tech_name}', selector '{selector}' '{key}' key must be string!"))
            return False
        if not isinstance(attr_val, str):
            self._set_custom_error(InvalidTypeForFieldException(f"Invalid type for dom in tech '{tech_name}', selector '{selector}' '{key}' value must be string!"))
            return False
        if not self._validate_regex(tech_name, attr_val):
            return False
    return True
Expand All @@ -192,11 +221,9 @@ def __init__(self, icons: list[str], required: bool = False):
self._icons: Final[list[str]] = icons

def _validate(self, tech_name: str, data: Any) -> bool:
    """Check that the icon named by ``data`` is one of the known icons.

    :param tech_name: name of the technology, used in the error message.
    :param data: icon value to look up in ``self._icons``.
    :return: False when the parent validation fails or when ``data`` is not
        in ``self._icons``; in the latter case an ``ImageNotFoundException``
        is registered through ``_set_custom_error``. True otherwise.
    """
    if not super()._validate(tech_name, data):
        return False
    if data in self._icons:
        return True
    self._set_custom_error(ImageNotFoundException(f"The image '{data}' for tech '{tech_name}' does not exist!"))
    return False
Expand All @@ -207,16 +234,14 @@ def __init__(self):
super().__init__()

def _validate(self, tech_name: str, data: Any) -> bool:
    """Check that ``data`` is a well-formed CPE 2.3 formatted string.

    :param tech_name: name of the technology, used in the error message.
    :param data: CPE value to check.
    :return: False when the parent validation fails or when ``data`` does not
        match the CPE 2.3 pattern; in the latter case an
        ``InvalidCPEException`` is registered through ``_set_custom_error``.
        True otherwise.
    """
    if not super()._validate(tech_name, data):
        return False
    # https://csrc.nist.gov/schema/cpe/2.3/cpe-naming_2.3.xsd
    # Adjacent raw literals keep the pattern free of line breaks: a line break
    # inside a single triple-quoted literal would become part of the
    # surrounding character classes.
    cpe_regex: str = (
        r"""cpe:2\.3:[aho\*\-](:(((\?*|\*?)([a-zA-Z0-9\-\._]|(\\[\\\*\?!"#$$%&'\(\)\+,/:;<=>@\["""
        r"""\]\^`\{\|}~]))+(\?*|\*?))|[\*\-])){5}(:(([a-zA-Z]{2,3}(-([a-zA-Z]{2}|[0-9]{3}))?)|[\*\-]))(:(((\?*|\*?)(["""
        r"""a-zA-Z0-9\-\._]|(\\[\\\*\?!"#$$%&'\(\)\+,/:;<=>@\[\]\^`\{\|}~]))+(\?*|\*?))|[\*\-])){4}"""
    )
    pattern: re.Pattern = re.compile(cpe_regex)
    # fullmatch: the whole value must be a CPE name, trailing characters
    # after an otherwise valid prefix are rejected.
    if not pattern.fullmatch(data):
        self._set_custom_error(InvalidCPEException(f"The cpe {data} for tech '{tech_name}' is invalid!"))
        return False
    return True
Expand Down Expand Up @@ -310,6 +335,7 @@ def process(self) -> None:


if __name__ == '__main__':
    # TODO validate ;confidence & ;version
    # for letter in string.ascii_lowercase + "_":
    #     TechnologiesValidator(os.getenv("TECH_FILE_NAME", f"{letter}.json")).validate()
    # Validate the file named by TECH_FILE_NAME, falling back to a.json.
    TechnologiesValidator(os.getenv("TECH_FILE_NAME", "a.json")).validate()
8 changes: 6 additions & 2 deletions src/technologies/g.json
Original file line number Diff line number Diff line change
Expand Up @@ -745,10 +745,14 @@
}
},
"img[src*='cdn.getyourguide.com/']": {
"src": ""
"attributes": {
"src": ""
}
},
"link[href*='cdn.getyourguide.com/']": {
"src": ""
"attributes": {
"src": ""
}
}
},
"icon": "GetYourGuide.svg",
Expand Down
2 changes: 1 addition & 1 deletion src/technologies/t.json
Original file line number Diff line number Diff line change
Expand Up @@ -3833,7 +3833,7 @@
],
"description": "Twenty Twenty-Two is the default WordPress theme for 2022.",
"dom": {
"link#twentytwentytwo-style-css": "",
"link#twentytwentytwo-style-css": {},
"style#webfonts-inline-css": {
"text": "/wp-content/themes/twentytwentytwo/assets/fonts/"
},
Expand Down
12 changes: 6 additions & 6 deletions src/technologies/w.json
Original file line number Diff line number Diff line change
Expand Up @@ -587,30 +587,30 @@
},
"[style*='images/']": {
"attributes": {
"style": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
"style": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w\\-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
}
},
"img[src*='images/']": {
"attributes": {
"src": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
"src": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w\\-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
}
},
"img[srcset*='images/'], source[srcset*='images/']": {
"attributes": {
"srcset": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
"srcset": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w\\-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
}
},
"meta[content*='images/']": {
"attributes": {
"content": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
"content": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w\\-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
}
},
"style, script": {
"text": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
"text": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w\\-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
},
"video[poster*='images/']": {
"attributes": {
"poster": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
"poster": "(?:\\.[a-z]+|/media)(?:/[\\w-]+)?/(?:original_images/[\\w-]+|images/[\\w\\-.]+\\.(?:(?:fill|max|min)-\\d+x\\d+(?:-c\\d+)?|(?:width|height|scale)-\\d+|original))\\."
}
}
},
Expand Down