Skip to content

Commit

Permalink
add generate_prices_configuration() to mapper
Browse files Browse the repository at this point in the history
  • Loading branch information
AKorets committed Jul 26, 2023
1 parent ad6d172 commit 7d8efea
Showing 1 changed file with 107 additions and 3 deletions.
110 changes: 107 additions & 3 deletions mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@
import glob
import os
import shutil
from pprint import pprint
from il_supermarket_scarper.scrappers_factory import ScraperFactory
import il_supermarket_scarper.scrappers as all_scrappers
from il_supermarket_scarper.utils.file_types import FileTypesFilters
from il_supermarket_scarper.main import ScarpingTask
from tools import save_conf
from store_parser import get_root, generate_store_dictionary
from store_parser import generate_store_dictionary_lower_case, save_store_conf



class Mapper:
"""class that creates mapper from original xml files to unified xml (in stores and items)"""
def __init__(self):
Expand Down Expand Up @@ -82,6 +84,20 @@ def parse_store(self, xml_path, name_dict, ignore_dict, encoding="utf-8"):
if self.smart_print(root, ignore_dict, name_dict):
print (f'The format of {xml_path} is parsed')

def check_prices_tags(self, file, encoding, tags, ignore, tags_dict):
    """Report the tags of a prices xml that are unexpected or absent.

    Every tag name collected from ``file`` is lower-cased and translated
    through ``tags_dict`` before it is compared with ``tags`` and ``ignore``.

    Args:
        file: path of the xml file to inspect.
        encoding: encoding handed to ``get_root`` when loading the file.
        tags: collection of the expected (normalized) tag names.
        ignore: collection of tag names that must not be reported as new.
        tags_dict: mapping from an alternative lower-case tag name to the
            name used in ``tags``.

    Returns:
        dict with two sorted lists:
        ``'new_tag'`` - found tags that are neither in ``tags`` nor in
        ``ignore``; ``'missing_tag'`` - entries of ``tags`` that were not
        found in the file.
    """
    root = get_root(file, encoding)
    tag_set = set()
    # presumably fills tag_set with the tag names found under root
    self.all_value_tags(root, tag_set)
    # A set (instead of a list) gives O(1) membership tests and collapses
    # different spellings that normalize to the same name.
    found_tags = {tags_dict.get(tag.lower(), tag.lower()) for tag in tag_set}
    # Sorting makes the report reproducible; set iteration order is not.
    new_tag = sorted(tag for tag in found_tags
                     if tag not in tags and tag not in ignore)
    if new_tag:
        print('new_tag', new_tag)
    missing_tag = sorted(tag for tag in tags if tag not in found_tags)
    if missing_tag:
        print('missing_tag', missing_tag)
    return {'new_tag': new_tag, 'missing_tag': missing_tag}

def delete_files(path, pattern):
"""delete files by pattern"""
for file in glob.iglob(os.path.join(path, pattern)):
Expand Down Expand Up @@ -207,11 +223,99 @@ def generate_stores_configurations(output_folder):

def generate_stores_configurations_base():
    """Download the store files and start generate_stores_configurations.

    The files are scraped into the ``data_stores`` folder (only the latest
    ones, without the scraper's db lookup), the folder is handed to
    ``generate_stores_configurations`` and removed afterwards.
    """
    output_folder = "data_stores"
    ScarpingTask(
        dump_folder_name=output_folder,
        only_latest=True,
        files_types=[FileTypesFilters.STORE_FILE.name],
        lookup_in_db=False,
    ).start()
    generate_stores_configurations(output_folder)
    # the downloaded files are only needed while the configurations are built
    shutil.rmtree(output_folder)

def get_data_file(data_files, encoding):
    """Return the first path in ``data_files`` that is not an empty price file.

    A file counts as empty when its text contains ``Items Count="0"``
    (workaround for dor alon, which publishes empty price files).  A file
    that cannot be read or decoded with ``encoding`` is returned as well,
    because it cannot be one of those empty files.

    Args:
        data_files: iterable of file paths to check, in order.
        encoding: text encoding used to open every file.

    Returns:
        The first suitable path, or ``""`` when every file is empty or
        ``data_files`` itself is empty.
    """
    for file_path in data_files:
        with open(file_path, encoding=encoding) as file:
            try:
                content = file.read()
            except (UnicodeError, OSError):
                # unreadable with this encoding -> not a dor alon empty file;
                # only I/O and decoding errors are expected here, so
                # KeyboardInterrupt and friends are no longer swallowed
                return file_path
        if 'Items Count="0"' not in content:
            return file_path
    return ""

def download_all_prices(tags, ignore, tags_dict):
    """Show the current status of the prices xmls of all providers.

    For every scraper in ``ScraperFactory`` the latest price files are
    downloaded into the ``price_data`` folder, one usable file is picked with
    ``get_data_file`` and its tags are checked with
    ``Mapper.check_prices_tags``.  A provider without a usable file is
    reported and skipped, so one failing provider neither hides the results
    of the others nor leaves the download folder behind.  At the end the
    folder is removed and the used files, the encodings and the per-provider
    results are printed.

    Args:
        tags: expected tag names, forwarded to ``check_prices_tags``.
        ignore: tag names to ignore, forwarded to ``check_prices_tags``.
        tags_dict: alternative-name mapping, forwarded to
            ``check_prices_tags``.
    """
    output_folder = "price_data"
    run_result = {}
    provider_encoding = {}
    used_files = []
    my_mapper = Mapper()
    # the file name pattern does not depend on the provider
    pattern = f'{FileTypesFilters.PRICE_FILE.value["should_contain"]}*.xml'
    for scrapper_class in ScraperFactory:
        ScarpingTask(dump_folder_name=output_folder, only_latest=True,
                     files_types=FileTypesFilters.only_price(),
                     enabled_scrapers=[scrapper_class],
                     lookup_in_db=False).start()
        scrapper = ScraperFactory.get(scrapper_class)(output_folder)
        data_files = glob.glob(os.path.join(scrapper.get_storage_path(), pattern))
        if not data_files:
            print(f'Failed to find file for scrapper {scrapper.chain}')
            continue
        encoding = provider_encoding.setdefault(scrapper.chain, 'utf-8-sig')
        print(scrapper.chain, encoding)
        data_file = get_data_file(data_files, encoding)
        if not data_file:
            # get_data_file returns "" when every downloaded file is empty
            print(f'Failed to find non empty file for scrapper {scrapper.chain}')
            continue
        used_files.append(data_file)
        run_result[scrapper.chain] = my_mapper.check_prices_tags(data_file,
                                                                 encoding,
                                                                 tags, ignore,
                                                                 tags_dict)
    shutil.rmtree(output_folder)
    pprint(used_files)
    pprint(provider_encoding)
    pprint(run_result)

def generate_prices_configuration():
    """Generate conf/all_prices.json and check the providers against it.

    The configuration holds three entries:
    ``ignore`` - tag names that are not reported as new,
    ``tags_dict`` - alternative tag names mapped to the name used in ``tags``,
    ``tags`` - the expected tag names.
    After saving it, ``download_all_prices`` is started with the same values.
    """
    # NOTE(review): 'itemtype' appears in both ignore and tags - confirm
    # which of the two is intended.
    ignore = {'lastupdatetime',
              'bikoretno',
              'itemstatus',
              'dllverno',
              'xmldocversion',
              'lastupdatedate',
              'itemid',
              'itemtype'}
    tags_dict = {'itemnm': 'itemname',
                 'manufacturername': 'manufacturename',
                 'manufactureritemdescription': 'manufactureitemdescription',
                 'unitofmeasure': 'unitmeasure',
                 'blsweighted': 'bisweighted'}
    tags = {'chainid',
            'subchainid',
            'storeid',
            'priceupdatedate',
            'itemcode',
            'itemtype',
            'itemname',
            'manufacturename',
            'manufacturecountry',
            'manufactureitemdescription',
            'unitqty',
            'quantity',
            'unitmeasure',
            'bisweighted',
            'qtyinpackage',
            'itemprice',
            'unitofmeasureprice',
            'allowdiscount'}
    # sorted() instead of list(): the iteration order of a set of strings
    # changes between runs, which would reshuffle the saved file every time
    all_prices = {'ignore': sorted(ignore),
                  'tags_dict': tags_dict,
                  'tags': sorted(tags)}
    save_conf('conf/all_prices.json', all_prices)
    download_all_prices(tags, ignore, tags_dict)

if __name__ == "__main__":
    # Script entry point: build the stores configurations first,
    # then save the prices configuration and print the tag status per provider.
    generate_stores_configurations_base()
    generate_prices_configuration()

0 comments on commit 7d8efea

Please sign in to comment.