From bb44da2c796c6ae398be9b81bf1cdbf6d324edad Mon Sep 17 00:00:00 2001 From: Avram Korets Date: Thu, 27 Jul 2023 21:40:46 +0300 Subject: [PATCH] add download_all_prices flow --- conf/all_prices.json | 38 +- mapper.py | 5 +- price_parser.py | 101 +++ store_parser.py | 13 +- testbed.ipynb | 1518 +++++++++++++++++++++++++++--------------- xml_parser.py | 17 + 6 files changed, 1116 insertions(+), 576 deletions(-) create mode 100644 price_parser.py create mode 100644 xml_parser.py diff --git a/conf/all_prices.json b/conf/all_prices.json index 9476c45..cec814f 100644 --- a/conf/all_prices.json +++ b/conf/all_prices.json @@ -1,13 +1,14 @@ { "ignore": [ - "itemstatus", - "itemid", "lastupdatetime", - "xmldocversion", + "itemtype", + "itemid", + "dllverno", "bikoretno", + "itemstatus", + "xmldocversion", "lastupdatedate", - "dllverno", - "itemtype" + "itemtype" ], "tags_dict": { "itemnm": "itemname", @@ -17,23 +18,22 @@ "blsweighted": "bisweighted" }, "tags": [ - "itemprice", - "unitmeasure", - "manufactureitemdescription", - "storeid", - "subchainid", - "unitofmeasureprice", "chainid", - "priceupdatedate", - "allowdiscount", - "itemname", - "quantity", - "manufacturecountry", - "bisweighted", + "subchainid", + "storeid", "itemcode", + "itemname", + "itemprice", "unitqty", - "manufacturename", + "unitofmeasureprice", "qtyinpackage", - "itemtype" + "manufacturename", + "manufacturecountry", + "manufactureitemdescription", + "bisweighted", + "allowdiscount", + "priceupdatedate", + "unitmeasure", + "quantity" ] } \ No newline at end of file diff --git a/mapper.py b/mapper.py index 7422978..6bdfd53 100644 --- a/mapper.py +++ b/mapper.py @@ -16,7 +16,8 @@ from il_supermarket_scarper.utils.file_types import FileTypesFilters from il_supermarket_scarper.main import ScarpingTask from tools import save_conf -from store_parser import get_root, generate_store_dictionary +from xml_parser import get_root +from store_parser import generate_store_dictionary from store_parser import generate_store_dictionary_lower_case, save_store_conf @@ -261,8 +262,6 @@ def download_all_prices(tags, ignore, tags_dict): if not data_files: print(f'Failed to find file for scrapper {scrapper.chain}') return - #chain_name = scrapper.chain - #conf_path = get_store_conf_path(chain_name) if scrapper.chain not in provider_encoding: provider_encoding[scrapper.chain] = 'utf-8-sig' print(scrapper.chain, provider_encoding[scrapper.chain]) diff --git a/price_parser.py b/price_parser.py new file mode 100644 index 0000000..34b5092 --- /dev/null +++ b/price_parser.py @@ -0,0 +1,101 @@ +from tools import load_conf, save_conf +from pprint import pprint +import glob +import os +import shutil +from il_supermarket_scarper.scrappers_factory import ScraperFactory +from il_supermarket_scarper.main import ScarpingTask +from il_supermarket_scarper.utils.file_types import FileTypesFilters +from xml_parser import get_root +import pandas as pd +from pathlib import Path + +def parse_price_xml(root, provider, tags, ignore, tags_dict, item_info_dict, price_rows): + """analyse "price item" by going through the xml""" + if root is None: + return + + have_item_id = False + + for child in root.getchildren(): + if len(child.getchildren()) > 0: + parse_price_xml(child, provider, tags, ignore, tags_dict, item_info_dict, price_rows) + else: + tag = child.tag.lower() + if tag in ignore: + continue + tag_name = tags_dict.get(tag, tag) + item_info_dict[tag_name] = child.text + #print(tag_name, child.tag, child.text) + if tag_name == 'itemcode': + have_item_id = True + #print("") + + + if have_item_id: + row = [provider] + #print(item_info_dict) + for tag in tags: + row.append(item_info_dict[tag]) + price_rows.append(row) + + return + +def download_all_prices(progress_bar=None, force=False): + """load or create dataframe based on prices from all providers (from all providers) """ + data_prices_path = 'data/prices.csv' + if os.path.isfile(data_prices_path) and not force: + if progress_bar: + progress_bar.value = progress_bar.max/2 + df = pd.read_csv(data_prices_path, low_memory=False) + if progress_bar: + progress_bar.value = progress_bar.max + return df + + #collect price_rows from all providers + price_rows = [] + all_prices = load_conf('conf/all_prices.json') + (tags, ignore, tags_dict) = (all_prices['tags'], all_prices['ignore'], all_prices['tags_dict']) + output_folder = "price_data" + encoding = 'utf-8-sig' + used_files = [] + failed_files = [] + for scrapper_class in ScraperFactory: + ScarpingTask(dump_folder_name=output_folder, only_latest=True, + files_types=FileTypesFilters.only_price(), + enabled_scrapers=[scrapper_class], + lookup_in_db=False).start() + pattern = f'{FileTypesFilters.PRICE_FILE.value["should_contain"]}*.xml' + scrapper = ScraperFactory.get(scrapper_class)(output_folder) + data_files = list(file for file in + glob.iglob(os.path.join(scrapper.get_storage_path(), pattern))) + if not data_files: + print(f'Failed to find file for scrapper {scrapper.chain}') + return + print(scrapper.chain, encoding) + for data_file in data_files: + used_files.append(data_file) + root = None + try: + root = get_root(data_file, encoding) + except: + print('failed to get root of '+data_file) + failed_files.append(data_file) + continue + item_info_dict = {} + parse_price_xml(root, scrapper.chain, tags, ignore, tags_dict, item_info_dict, price_rows) + + if progress_bar: + progress_bar.value += 1 + shutil.rmtree(output_folder) + if failed_files: + print('failed files:') + pprint(failed_files) + + #create dataframe based on price_rows + header = ['provider'] + all_prices['tags'] + df = pd.DataFrame(price_rows, columns=header) + filepath = Path(data_prices_path) + filepath.parent.mkdir(parents=True, exist_ok=True) + df.to_csv(filepath, index=False) + return df \ No newline at end of file diff --git a/store_parser.py b/store_parser.py index fb59789..6c1f9f0 100644 --- a/store_parser.py +++ b/store_parser.py @@ -3,24 +3,13 @@ Used to parse all store file formats @author: Avi """ -import codecs import os import shutil -from lxml import objectify from il_supermarket_scarper.main import ScarpingTask from il_supermarket_scarper.scrappers_factory import ScraperFactory from il_supermarket_scarper.utils.file_types import FileTypesFilters from tools import save_conf, load_conf - -def get_root(xml_file, encoding): - """get store xml root, in lxml format""" - with codecs.open(xml_file, encoding=encoding, errors="ignore") as store_file: - xml = store_file.read() - #print(xml[:90]) - xml = xml.replace('\r\n','') - xml = xml.encode("UTF-16") - - return objectify.fromstring(xml) +from xml_parser import get_root def save_store_conf(chain_name, encoding, name_dict, ignore_dict, ignore_file): """save store configuration""" diff --git a/testbed.ipynb b/testbed.ipynb index 50832a7..f021990 100644 --- a/testbed.ipynb +++ b/testbed.ipynb @@ -559,60 +559,6 @@ "unknown" ] }, - { - "cell_type": "code", - "execution_count": 100, - "id": "cbb8935e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count_online_stores:20\n", - "{'www.shufersal.co.il', 'www.rami-levy.co.il', 'www.carrefour.co.il', 'www.m2000.co.il', 'www.keshet-teamim.co.il', 'www.shukcity.co.il', 'www.edenteva.co.il', 'www.ybitan.co.il', 'www.mega.co.il', 'www.tivtaam.co.il'}\n" - ] - }, - { - "data": { - "text/plain": [ - "{'WWW.SHUFERSAL.CO.IL',\n", - " 'http://www.edenteva.co.il/',\n", - " 'https://www.carrefour.co.il/',\n", - " 'https://www.mega.co.il/',\n", - " 'https://www.rami-levy.co.il/he',\n", - " 'https://www.shukcity.co.il/ 0',\n", - " 'https://www.tivtaam.co.il/',\n", - " 'https://www.ybitan.co.il/',\n", - " 'www.m2000.co.il 0',\n", - " 'המפעלים 9-www.keshet-teamim.co.il'}" - ] - }, - "execution_count": 100, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stores_data = load_conf('conf/all_stores.json')\n", - "url_list = set()\n", - "url_clean_list = set()\n", - "count_online_stores = 0\n", - "city_that_online = []\n", - "for key in stores_data:\n", - " address = stores_data[key]['Address']\n", - " if address and 'co.il' in address.lower():\n", - " count_online_stores += 1\n", - " url_list.add(address)\n", - " #print(address)\n", - " #print(re.findall(\"www.*.il\",address.lower())[0])\n", - " url_clean_list.add(re.findall(\"www.*.il\",address.lower())[0])\n", - " city_that_online.append(stores_data[key]['City'])\n", - "print(f'count_online_stores:{count_online_stores}')\n", - "print(url_clean_list)\n", - "url_list" - ] - }, { "cell_type": "code", "execution_count": 24, @@ -875,9 +821,7 @@ "cell_type": "code", "execution_count": 48, "id": "3fd8b8d1", - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -996,494 +940,10 @@ "city_status" ] }, - { - "cell_type": "code", - "execution_count": 84, - "id": "0f7b8bfd", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Logger 2023-07-24 17:19:17,541 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:19:17,541 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:19:17,542 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:19:17,542 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:19:17,543 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:19:22,499 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:19:22,501 INFO logger.py:info Storage path: price_data\\bareket\n", - "bareket utf-8-sig\n", - "price_data\\bareket\\Price7290875100001-002-202307241449-001.xml utf-8-sig\n", - "Logger 2023-07-24 17:19:22,872 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:19:22,874 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:19:22,874 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:19:22,875 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:19:22,876 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:19:36,135 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:19:36,137 INFO logger.py:info Storage path: price_data\\ybitan\n", - "ybitan utf-8-sig\n", - "price_data\\ybitan\\Price7290725900003-0002-202307241700.xml utf-8-sig\n", - "Logger 2023-07-24 17:19:36,292 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:19:36,293 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:19:36,294 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:19:36,295 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:19:36,295 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:19:41,605 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:19:41,606 INFO logger.py:info Storage path: price_data\\cofix\n", - "cofix utf-8-sig\n", - "price_data\\cofix\\Price7291056200008-299-202307240722.xml utf-8-sig\n", - "Logger 2023-07-24 17:19:41,661 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:19:41,662 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:19:41,663 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:19:41,663 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:19:41,664 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:28,816 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:28,817 INFO logger.py:info Storage path: price_data\\Dor Alon\n", - "Dor Alon utf-8-sig\n", - "price_data\\Dor Alon\\Price7290492000005-507-202307241000.xml utf-8-sig\n", - "Logger 2023-07-24 17:20:29,098 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:20:29,099 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:20:29,099 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:20:29,100 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:20:29,101 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:46,259 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:46,260 INFO logger.py:info Storage path: price_data\\GoodPharm\n", - "GoodPharm utf-8-sig\n", - "price_data\\GoodPharm\\Price7290058197699-044-202307241601.xml utf-8-sig\n", - "Logger 2023-07-24 17:20:46,339 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:20:46,340 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:20:46,341 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:20:46,342 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:20:46,342 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:48,891 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:48,892 INFO logger.py:info Storage path: price_data\\Hazi Hinam\n", - "Hazi Hinam utf-8-sig\n", - "price_data\\Hazi Hinam\\Price7290700100008-001-202307240700.xml utf-8-sig\n", - "Logger 2023-07-24 17:20:48,945 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:20:48,946 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:20:48,947 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:20:48,948 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:20:48,948 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:56,217 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:20:56,219 INFO logger.py:info Storage path: price_data\\Keshet Taamim\n", - "Keshet Taamim utf-8-sig\n", - "price_data\\Keshet Taamim\\Price7290785400000-002-202307241000.xml utf-8-sig\n", - "Logger 2023-07-24 17:20:56,268 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:20:56,269 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:20:56,270 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:20:56,271 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:20:56,272 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:04,236 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:04,238 INFO logger.py:info Storage path: price_data\\King Store\n", - "King Store utf-8-sig\n", - "price_data\\King Store\\Price7290058108879-001-202307241601.xml utf-8-sig\n", - "Logger 2023-07-24 17:21:04,314 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:21:04,315 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:21:04,315 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:21:04,316 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:21:04,317 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:17,024 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:17,025 INFO logger.py:info Storage path: price_data\\Maayan2000\n", - "Maayan2000 utf-8-sig\n", - "price_data\\Maayan2000\\Price7290058159628-001-202307241601.xml utf-8-sig\n", - "Logger 2023-07-24 17:21:17,130 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:21:17,131 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:21:17,132 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:21:17,133 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:21:17,133 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:44,686 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:44,687 INFO logger.py:info Storage path: price_data\\mahsani a shuk\n", - "mahsani a shuk utf-8-sig\n", - "price_data\\mahsani a shuk\\Price7290633800006-202-202307241610-000.xml utf-8-sig\n", - "Logger 2023-07-24 17:21:44,785 INFO logger.py:info size_estimation_mode: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Logger 2023-07-24 17:21:44,786 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:21:44,787 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:21:44,787 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:21:44,788 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:46,528 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:21:46,530 INFO logger.py:info Storage path: price_data\\mega-market\n", - "mega-market utf-8-sig\n", - "price_data\\mega-market\\Price7290055700014-2150-202307241700.xml utf-8-sig\n", - "Logger 2023-07-24 17:21:46,547 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:21:46,548 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:21:46,549 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:21:46,550 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:21:46,550 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:14,354 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:14,356 INFO logger.py:info Storage path: price_data\\mega\n", - "mega utf-8-sig\n", - "price_data\\mega\\Price7290055700007-0009-202307241700.xml utf-8-sig\n", - "Logger 2023-07-24 17:22:14,525 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:22:14,528 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:22:14,529 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:22:14,529 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:22:14,530 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:34,837 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:34,839 INFO logger.py:info Storage path: price_data\\Netiv Hasef\n", - "Netiv Hasef utf-8-sig\n", - "price_data\\Netiv Hasef\\Price7290058160839-001-202307241204.xml utf-8-sig\n", - "Logger 2023-07-24 17:22:34,931 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:22:34,932 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:22:34,933 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:22:34,934 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:22:34,934 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:38,806 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:38,808 INFO logger.py:info Storage path: price_data\\Osher Ad\n", - "Osher Ad utf-8-sig\n", - "price_data\\Osher Ad\\Price7290103152017-001-202307240900.xml utf-8-sig\n", - "Logger 2023-07-24 17:22:38,846 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:22:38,847 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:22:38,848 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:22:38,848 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:22:38,849 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:41,137 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:41,139 INFO logger.py:info Storage path: price_data\\Polizer\n", - "Polizer utf-8-sig\n", - "price_data\\Polizer\\Price7291059100008-001-202307240700.xml utf-8-sig\n", - "Logger 2023-07-24 17:22:41,168 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:22:41,169 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:22:41,170 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:22:41,171 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:22:41,171 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:56,037 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:56,039 INFO logger.py:info Storage path: price_data\\Rami Levy\n", - "Rami Levy utf-8-sig\n", - "price_data\\Rami Levy\\Price7290058140886-001-202307240800.xml utf-8-sig\n", - "Logger 2023-07-24 17:22:56,148 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:22:56,149 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:22:56,149 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:22:56,150 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:22:56,151 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:58,267 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:22:58,268 INFO logger.py:info Storage path: price_data\\salachdabach\n", - "salachdabach utf-8-sig\n", - "price_data\\salachdabach\\Price7290526500006-004-202307240600.xml utf-8-sig\n", - "Logger 2023-07-24 17:22:58,311 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:22:58,312 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:22:58,313 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:22:58,314 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:22:58,314 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:23:06,640 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:23:06,641 INFO logger.py:info Storage path: price_data\\ShefaBarcartAshem\n", - "ShefaBarcartAshem utf-8-sig\n", - "price_data\\ShefaBarcartAshem\\Price7290058134977-001-202307240801.xml utf-8-sig\n", - "Logger 2023-07-24 17:23:06,679 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:23:06,680 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:23:06,681 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:23:06,681 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:23:06,682 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:07,893 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:07,895 INFO logger.py:info Storage path: price_data\\Shufersal\n", - "Shufersal utf-8-sig\n", - "price_data\\Shufersal\\Price7290027600007-001-202307241600.xml utf-8-sig\n", - "Logger 2023-07-24 17:27:08,497 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:27:08,498 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:27:08,499 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:27:08,500 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:27:08,501 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:16,435 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:16,437 INFO logger.py:info Storage path: price_data\\Shuk Ahir\n", - "Shuk Ahir utf-8-sig\n", - "price_data\\Shuk Ahir\\Price7290058148776-004-202307241601.xml utf-8-sig\n", - "Logger 2023-07-24 17:27:16,498 INFO logger.py:info size_estimation_mode: False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Logger 2023-07-24 17:27:16,499 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:27:16,500 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:27:16,501 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:27:16,501 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:20,189 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:20,190 INFO logger.py:info Storage path: price_data\\Stop Market\n", - "Stop Market utf-8-sig\n", - "price_data\\Stop Market\\Price7290639000004-001-202307240700.xml utf-8-sig\n", - "Logger 2023-07-24 17:27:20,216 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:27:20,217 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:27:20,217 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:27:20,218 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:27:20,219 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:51,063 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:27:51,064 INFO logger.py:info Storage path: price_data\\Super-Pharm\n", - "Super-Pharm utf-8-sig\n", - "price_data\\Super-Pharm\\Price7290172900007-001-202307241630.xml utf-8-sig\n", - "missing_tag ['itemtype']\n", - "Logger 2023-07-24 17:27:51,408 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:27:51,409 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:27:51,410 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:27:51,410 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:27:51,411 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:01,056 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:01,058 INFO logger.py:info Storage path: price_data\\SuperYuda\n", - "SuperYuda utf-8-sig\n", - "price_data\\SuperYuda\\Price7290058177776-003-202307241601.xml utf-8-sig\n", - "Logger 2023-07-24 17:28:01,190 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:28:01,191 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:28:01,192 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:28:01,193 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:28:01,193 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:09,213 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:09,215 INFO logger.py:info Storage path: price_data\\Super Dosh\n", - "Super Dosh utf-8-sig\n", - "price_data\\Super Dosh\\Price7290876100000-001-202307240700.xml utf-8-sig\n", - "Logger 2023-07-24 17:28:09,285 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:28:09,287 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:28:09,287 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:28:09,288 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:28:09,289 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:17,729 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:17,731 INFO logger.py:info Storage path: price_data\\Tiv Taam\n", - "Tiv Taam utf-8-sig\n", - "price_data\\Tiv Taam\\Price7290873255550-002-202307241610.xml utf-8-sig\n", - "Logger 2023-07-24 17:28:17,809 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:28:17,810 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:28:17,811 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:28:17,812 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:28:17,812 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:47,696 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:28:47,697 INFO logger.py:info Storage path: price_data\\Victory\n", - "Victory utf-8-sig\n", - "price_data\\Victory\\Price7290696200003-001-202307240600-001.xml utf-8-sig\n", - "Logger 2023-07-24 17:28:47,793 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:28:47,794 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:28:47,795 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:28:47,795 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:28:47,796 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:29:42,933 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:29:42,935 INFO logger.py:info Storage path: price_data\\Yellow\n", - "Yellow utf-8-sig\n", - "price_data\\Yellow\\Price7290644700005-100-202307240600.xml utf-8-sig\n", - "Logger 2023-07-24 17:29:43,213 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:29:43,214 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:29:43,215 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:29:43,215 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:29:43,216 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:29:51,519 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:29:51,521 INFO logger.py:info Storage path: price_data\\Yohananof\n", - "Yohananof utf-8-sig\n", - "price_data\\Yohananof\\Price7290803800003-001-202307240800.xml utf-8-sig\n", - "Logger 2023-07-24 17:29:51,585 INFO logger.py:info size_estimation_mode: False\n", - "Logger 2023-07-24 17:29:51,586 INFO logger.py:info Enabled scrapers: [>]\n", - "Logger 2023-07-24 17:29:51,587 INFO logger.py:info Limit is None\n", - "Logger 2023-07-24 17:29:51,588 INFO logger.py:info files_types is ['PRICE_FILE', 'PRICE_FULL_FILE']\n", - "Logger 2023-07-24 17:29:51,589 INFO logger.py:info Start scraping all supermarkets.\n", - "Logger 2023-07-24 17:30:03,453 INFO logger.py:info Done scraping all supermarkets.\n", - "Logger 2023-07-24 17:30:03,455 INFO logger.py:info Storage path: price_data\\ZolVeBegadol\n", - "ZolVeBegadol utf-8-sig\n", - "price_data\\ZolVeBegadol\\Price7290058173198-003-202307241701.xml utf-8-sig\n", - "['price_data\\\\bareket\\\\Price7290875100001-002-202307241449-001.xml',\n", - " 'price_data\\\\ybitan\\\\Price7290725900003-0002-202307241700.xml',\n", - " 'price_data\\\\cofix\\\\Price7291056200008-299-202307240722.xml',\n", - " 'price_data\\\\Dor Alon\\\\Price7290492000005-507-202307241000.xml',\n", - " 'price_data\\\\GoodPharm\\\\Price7290058197699-044-202307241601.xml',\n", - " 'price_data\\\\Hazi Hinam\\\\Price7290700100008-001-202307240700.xml',\n", - " 'price_data\\\\Keshet Taamim\\\\Price7290785400000-002-202307241000.xml',\n", - " 'price_data\\\\King Store\\\\Price7290058108879-001-202307241601.xml',\n", - " 'price_data\\\\Maayan2000\\\\Price7290058159628-001-202307241601.xml',\n", - " 'price_data\\\\mahsani a shuk\\\\Price7290633800006-202-202307241610-000.xml',\n", - " 'price_data\\\\mega-market\\\\Price7290055700014-2150-202307241700.xml',\n", - " 'price_data\\\\mega\\\\Price7290055700007-0009-202307241700.xml',\n", - " 'price_data\\\\Netiv Hasef\\\\Price7290058160839-001-202307241204.xml',\n", - " 'price_data\\\\Osher Ad\\\\Price7290103152017-001-202307240900.xml',\n", - " 'price_data\\\\Polizer\\\\Price7291059100008-001-202307240700.xml',\n", - " 'price_data\\\\Rami Levy\\\\Price7290058140886-001-202307240800.xml',\n", - " 'price_data\\\\salachdabach\\\\Price7290526500006-004-202307240600.xml',\n", - " 'price_data\\\\ShefaBarcartAshem\\\\Price7290058134977-001-202307240801.xml',\n", - " 'price_data\\\\Shufersal\\\\Price7290027600007-001-202307241600.xml',\n", - " 'price_data\\\\Shuk Ahir\\\\Price7290058148776-004-202307241601.xml',\n", - " 'price_data\\\\Stop Market\\\\Price7290639000004-001-202307240700.xml',\n", - " 'price_data\\\\Super-Pharm\\\\Price7290172900007-001-202307241630.xml',\n", - " 'price_data\\\\SuperYuda\\\\Price7290058177776-003-202307241601.xml',\n", - " 'price_data\\\\Super Dosh\\\\Price7290876100000-001-202307240700.xml',\n", - " 'price_data\\\\Tiv Taam\\\\Price7290873255550-002-202307241610.xml',\n", - " 'price_data\\\\Victory\\\\Price7290696200003-001-202307240600-001.xml',\n", - " 'price_data\\\\Yellow\\\\Price7290644700005-100-202307240600.xml',\n", - " 'price_data\\\\Yohananof\\\\Price7290803800003-001-202307240800.xml',\n", - " 'price_data\\\\ZolVeBegadol\\\\Price7290058173198-003-202307241701.xml']\n", - "{'Dor Alon': 'utf-8-sig',\n", - " 'GoodPharm': 'utf-8-sig',\n", - " 'Hazi Hinam': 'utf-8-sig',\n", - " 'Keshet Taamim': 'utf-8-sig',\n", - " 'King Store': 'utf-8-sig',\n", - " 'Maayan2000': 'utf-8-sig',\n", - " 'Netiv Hasef': 'utf-8-sig',\n", - " 'Osher Ad': 'utf-8-sig',\n", - " 'Polizer': 'utf-8-sig',\n", - " 'Rami Levy': 'utf-8-sig',\n", - " 'ShefaBarcartAshem': 'utf-8-sig',\n", - " 'Shufersal': 'utf-8-sig',\n", - " 'Shuk Ahir': 'utf-8-sig',\n", - " 'Stop Market': 'utf-8-sig',\n", - " 'Super Dosh': 'utf-8-sig',\n", - " 'Super-Pharm': 'utf-8-sig',\n", - " 'SuperYuda': 'utf-8-sig',\n", - " 'Tiv Taam': 'utf-8-sig',\n", - " 'Victory': 'utf-8-sig',\n", - " 'Yellow': 'utf-8-sig',\n", - " 'Yohananof': 'utf-8-sig',\n", - " 'ZolVeBegadol': 'utf-8-sig',\n", - " 'bareket': 'utf-8-sig',\n", - " 'cofix': 'utf-8-sig',\n", - " 'mahsani a shuk': 'utf-8-sig',\n", - " 'mega': 'utf-8-sig',\n", - " 'mega-market': 'utf-8-sig',\n", - " 'salachdabach': 'utf-8-sig',\n", - " 'ybitan': 'utf-8-sig'}\n", - "{'Dor Alon': {'missing_tag': [], 'new_tag': []},\n", - " 'GoodPharm': {'missing_tag': [], 'new_tag': []},\n", - " 'Hazi Hinam': {'missing_tag': [], 'new_tag': []},\n", - " 'Keshet Taamim': {'missing_tag': [], 'new_tag': []},\n", - " 'King Store': {'missing_tag': [], 'new_tag': []},\n", - " 'Maayan2000': {'missing_tag': [], 'new_tag': []},\n", - " 'Netiv Hasef': {'missing_tag': [], 'new_tag': []},\n", - " 'Osher Ad': {'missing_tag': [], 'new_tag': []},\n", - " 'Polizer': {'missing_tag': [], 'new_tag': []},\n", - " 'Rami Levy': {'missing_tag': [], 'new_tag': []},\n", - " 'ShefaBarcartAshem': {'missing_tag': [], 'new_tag': []},\n", - " 'Shufersal': {'missing_tag': [], 'new_tag': []},\n", - " 'Shuk Ahir': {'missing_tag': [], 'new_tag': []},\n", - " 'Stop Market': {'missing_tag': [], 'new_tag': []},\n", - " 'Super Dosh': {'missing_tag': [], 'new_tag': []},\n", - " 'Super-Pharm': {'missing_tag': ['itemtype'], 'new_tag': []},\n", - " 'SuperYuda': {'missing_tag': [], 'new_tag': []},\n", - " 'Tiv Taam': {'missing_tag': [], 'new_tag': []},\n", - " 'Victory': {'missing_tag': [], 'new_tag': []},\n", - " 'Yellow': {'missing_tag': [], 'new_tag': []},\n", - " 'Yohananof': {'missing_tag': [], 'new_tag': []},\n", - " 'ZolVeBegadol': {'missing_tag': [], 'new_tag': []},\n", - " 'bareket': {'missing_tag': [], 'new_tag': []},\n", - " 'cofix': {'missing_tag': [], 'new_tag': []},\n", - " 'mahsani a shuk': {'missing_tag': [], 'new_tag': []},\n", - " 'mega': {'missing_tag': [], 'new_tag': []},\n", - " 'mega-market': {'missing_tag': [], 'new_tag': []},\n", - " 'salachdabach': {'missing_tag': [], 'new_tag': []},\n", - " 'ybitan': {'missing_tag': [], 'new_tag': []}}\n" - ] - } - ], - "source": [ - "from il_supermarket_scarper.main import ScarpingTask\n", - "from il_supermarket_scarper.scrappers_factory import ScraperFactory\n", - "from il_supermarket_scarper.utils.file_types import FileTypesFilters\n", - "import glob\n", - "from store_parser import get_store_conf_path, get_root\n", - "from tools import save_conf, load_conf\n", - "import shutil\n", - "\n", - "\n", - "def all_value_tags(root, tag_set):\n", - " \"\"\"collect all tag names that doesn't have children (tag that contains only value)\"\"\"\n", - " if root is None:\n", - " return\n", - " if len(root.getchildren())==0:\n", - " tag_set.add(root.tag)\n", - " return\n", - " for child in root.getchildren():\n", - " all_value_tags(child, tag_set)\n", - "\n", - "def check_prices_tags(file, encoding, tags, ignore, tags_dict):\n", - " print(file, encoding)\n", - " root = get_root(file, encoding)\n", - " tag_set = set()\n", - " all_value_tags(root, tag_set)\n", - " lowcase_tag_set = [tags_dict.get(tag.lower(), tag.lower()) for tag in tag_set]\n", - " new_tag = [tag for tag in lowcase_tag_set if not tag in tags and not tag in ignore]\n", - " if new_tag:\n", - " print('new_tag', new_tag)\n", - " missing_tag = [tag for tag in tags if not tag in lowcase_tag_set]\n", - " if missing_tag:\n", - " print('missing_tag', missing_tag)\n", - " return {'new_tag':new_tag, 'missing_tag':missing_tag}\n", - "\n", - "def get_data_file(data_files, encoding):\n", - " for file in data_files:\n", - " with open(file, encoding=encoding) as f:\n", - " try:\n", - " if 'Items Count=\"0\"' not in f.read():#workaround for dor alon, empty prices file\n", - " return file\n", - " except: #not dor alon\n", - " return file\n", - "\n", - "def download_all_prices(tags, ignore, tags_dict, progress_bar=None, force=False):\n", - " output_folder = \"price_data\"\n", - " run_result = {}\n", - " provider_encoding = {}\n", - " used_files = []\n", - " for scrapper_class in ScraperFactory:\n", - " ScarpingTask(dump_folder_name=output_folder, only_latest=True,\n", - " files_types=FileTypesFilters.only_price(),\n", - " enabled_scrapers=[scrapper_class],\n", - " lookup_in_db=False).start()\n", - " pattern = f'{FileTypesFilters.PRICE_FILE.value[\"should_contain\"]}*.xml'\n", - " scrapper = ScraperFactory.get(scrapper_class)(output_folder)\n", - " data_files = list(file for file in\n", - " glob.iglob(os.path.join(scrapper.get_storage_path(), pattern)))\n", - " if not data_files:\n", - " print(f'Failed to find file for scrapper {scrapper.chain}')\n", - " return\n", - " chain_name = scrapper.chain\n", - " conf_path = get_store_conf_path(chain_name)\n", - " if scrapper.chain not in provider_encoding:\n", - " provider_encoding[scrapper.chain] = 'utf-8-sig'\n", - " print(scrapper.chain, provider_encoding[scrapper.chain])\n", - " data_file = get_data_file(data_files, provider_encoding[scrapper.chain])\n", - " used_files.append(data_file)\n", - " run_result[scrapper.chain] = check_prices_tags(data_file, provider_encoding[scrapper.chain], tags, ignore, tags_dict)\n", - " shutil.rmtree(output_folder)\n", - " from pprint import pprint\n", - " pprint(used_files)\n", - " pprint(provider_encoding)\n", - " pprint(run_result)\n", - "\n", - "ignore = {'lastupdatetime',\n", - " 'bikoretno',\n", - " 'itemstatus',\n", - " 'dllverno', \n", - " 'xmldocversion',\n", - " 'lastupdatedate',\n", - " 'itemid',\n", - " 'itemtype'}\n", - "tags_dict = {'itemnm':'itemname',\n", - " 'manufacturername':'manufacturename',\n", - " 'manufactureritemdescription':'manufactureitemdescription',\n", - " 'unitofmeasure':'unitmeasure',\n", - " 'blsweighted':'bisweighted'}\n", - "tags = {'chainid',\n", - " 'subchainid',\n", - " 'storeid',\n", - " 'priceupdatedate',\n", - " 'itemcode',\n", - " 'itemtype',\n", - " 'itemname',\n", - " 'manufacturename',\n", - " 'manufacturecountry',\n", - " 'manufactureitemdescription',\n", - " 'unitqty',\n", - " 'quantity',\n", - " 'unitmeasure',\n", - " 'bisweighted',\n", - " 'qtyinpackage',\n", - " 'itemprice',\n", - " 'unitofmeasureprice',\n", - " 'allowdiscount'}\n", - "download_all_prices(tags, ignore, tags_dict)" - ] - }, { "cell_type": "code", "execution_count": 43, - "id": "512c7d68", + "id": "daba32ea", "metadata": {}, "outputs": [ { @@ -1533,6 +993,980 @@ "df1['City_Count'] = df1['City'].map(count_freq)\n", "sns.histplot(df1[df1.City_Count>30], y=\"City\")" ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5fe3ebc1", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
providerchainidsubchainidstoreiditemcodeitemnameitempriceunitqtyunitofmeasurepriceqtyinpackagemanufacturenamemanufacturecountrymanufactureitemdescriptionbisweightedallowdiscountpriceupdatedateunitmeasurequantity
0bareket72908751000011268067809251סט כוסות לקפה12.9'יח12.901הום פרפקט ק.ד.מ בעמNaNNaN002023/07/26 15:35NaN1
1bareket729087510000112624269000009סט מכוניות19.9'יח19.901הום פרפקט ק.ד.מ בעמNaNNaN002023/07/26 15:35NaN1
2bareket7290875100001124011800522810קורני חטיף קרמל 6*25 גרם11.9'יח7.931משה סידס ובנו בעמNaNNaN002023/07/26 15:36ל-100 גרם1.50
3bareket7290875100001126927688302198חבל קפיצה7.5'יח7.501הום פרפקט ק.ד.מ בעמNaNNaN002023/07/26 15:35'יח1
4bareket7290875100001126941057402178מצופים אינקס15.0'יח151הום פרפקט ק.ד.מ בעמNaNNaN002023/07/26 15:35NaN1
.........................................................
7075233ZolVeBegadol72900581731981899310072028743'טיםטם אורגינל 200גר20.9000000.20924.0000לא ידועלא ידוע'טיםטם אורגינל 200גר002016-11-21 14:04:180000024
7075234ZolVeBegadol72900581731981899310072028750'טיםטם דאבל 200גר20.9000000.20924.0000לא ידועלא ידוע'טיםטם דאבל 200גר002016-11-21 14:03:300000024
7075235ZolVeBegadol72900581731981899771565025005מעריב סוף שבוע5.9000005.90.0000לא ידועלא ידועמעריב סוף שבוע002022-02-02 17:19:35000000
7075236ZolVeBegadol72900581731981899771565051011ידיעות שישי+ חג14.90000014.90.0000לא ידועלא ידועידיעות שישי+ חג002021-11-23 11:08:17000000
7075237ZolVeBegadol72900581731981899771565051066ידיעות שישי+ חג14.90000014.90.0000לא ידועלא ידועידיעות שישי+ חג002021-11-23 11:08:27000000
\n", + "

7075238 rows × 18 columns

\n", + "
" + ], + "text/plain": [ + " provider chainid subchainid storeid itemcode \\\n", + "0 bareket 7290875100001 1 2 68067809251 \n", + "1 bareket 7290875100001 1 2 624269000009 \n", + "2 bareket 7290875100001 1 2 4011800522810 \n", + "3 bareket 7290875100001 1 2 6927688302198 \n", + "4 bareket 7290875100001 1 2 6941057402178 \n", + "... ... ... ... ... ... \n", + "7075233 ZolVeBegadol 7290058173198 1 89 9310072028743 \n", + "7075234 ZolVeBegadol 7290058173198 1 89 9310072028750 \n", + "7075235 ZolVeBegadol 7290058173198 1 89 9771565025005 \n", + "7075236 ZolVeBegadol 7290058173198 1 89 9771565051011 \n", + "7075237 ZolVeBegadol 7290058173198 1 89 9771565051066 \n", + "\n", + " itemname itemprice unitqty unitofmeasureprice \\\n", + "0 סט כוסות לקפה 12.9 'יח 12.90 \n", + "1 סט מכוניות 19.9 'יח 19.90 \n", + "2 קורני חטיף קרמל 6*25 גרם 11.9 'יח 7.93 \n", + "3 חבל קפיצה 7.5 'יח 7.50 \n", + "4 מצופים אינקס 15.0 'יח 15 \n", + "... ... ... ... ... \n", + "7075233 'טיםטם אורגינל 200גר 20.9 00000 0.209 \n", + "7075234 'טיםטם דאבל 200גר 20.9 00000 0.209 \n", + "7075235 מעריב סוף שבוע 5.9 00000 5.9 \n", + "7075236 ידיעות שישי+ חג 14.9 00000 14.9 \n", + "7075237 ידיעות שישי+ חג 14.9 00000 14.9 \n", + "\n", + " qtyinpackage manufacturename manufacturecountry \\\n", + "0 1 הום פרפקט ק.ד.מ בעמ NaN \n", + "1 1 הום פרפקט ק.ד.מ בעמ NaN \n", + "2 1 משה סידס ובנו בעמ NaN \n", + "3 1 הום פרפקט ק.ד.מ בעמ NaN \n", + "4 1 הום פרפקט ק.ד.מ בעמ NaN \n", + "... ... ... ... \n", + "7075233 24.0000 לא ידוע לא ידוע \n", + "7075234 24.0000 לא ידוע לא ידוע \n", + "7075235 0.0000 לא ידוע לא ידוע \n", + "7075236 0.0000 לא ידוע לא ידוע \n", + "7075237 0.0000 לא ידוע לא ידוע \n", + "\n", + " manufactureitemdescription bisweighted allowdiscount \\\n", + "0 NaN 0 0 \n", + "1 NaN 0 0 \n", + "2 NaN 0 0 \n", + "3 NaN 0 0 \n", + "4 NaN 0 0 \n", + "... ... ... ... \n", + "7075233 'טיםטם אורגינל 200גר 0 0 \n", + "7075234 'טיםטם דאבל 200גר 0 0 \n", + "7075235 מעריב סוף שבוע 0 0 \n", + "7075236 ידיעות שישי+ חג 0 0 \n", + "7075237 ידיעות שישי+ חג 0 0 \n", + "\n", + " priceupdatedate unitmeasure quantity \n", + "0 2023/07/26 15:35 NaN 1 \n", + "1 2023/07/26 15:35 NaN 1 \n", + "2 2023/07/26 15:36 ל-100 גרם 1.50 \n", + "3 2023/07/26 15:35 'יח 1 \n", + "4 2023/07/26 15:35 NaN 1 \n", + "... ... ... ... \n", + "7075233 2016-11-21 14:04:18 00000 24 \n", + "7075234 2016-11-21 14:03:30 00000 24 \n", + "7075235 2022-02-02 17:19:35 00000 0 \n", + "7075236 2021-11-23 11:08:17 00000 0 \n", + "7075237 2021-11-23 11:08:27 00000 0 \n", + "\n", + "[7075238 rows x 18 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from price_parser import download_all_prices\n", + "from IPython.display import display, clear_output\n", + "from ipywidgets import IntProgress, Text\n", + "from il_supermarket_scarper.scrappers_factory import ScraperFactory\n", + "\n", + "max_count = len(ScraperFactory.all_scrapers())\n", + "t = Text(value='Downloading Price Info', disabled=True)\n", + "progress_bar = IntProgress(min=0, max=max_count) # instantiate the bar\n", + "display(t, progress_bar)\n", + "df = download_all_prices(progress_bar)\n", + "clear_output()\n", + "\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eafbdfcd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
providerchainidsubchainidstoreiditemcodeitemnameitempriceunitqtyunitofmeasurepriceqtyinpackagemanufacturenamemanufacturecountrymanufactureitemdescriptionbisweightedallowdiscountpriceupdatedateunitmeasurequantity
7075218ZolVeBegadol72900581731981898719200998049מזולה בטעם טבעי10.9000000.1090.0000לא ידועישראלמזולה בטעם טבעי002021-10-27 10:43:45000000
7075219ZolVeBegadol72900581731981898720608014958תה ליפטון 1.5 גר' 100 יחידות15.9000000.15912.0000לא ידועהודותה ליפטון 1.5 גר' 100 יחידות002023-07-13 11:22:530000012
7075220ZolVeBegadol72900581731981898801055707966קפה בריסטה קלוי וטחון36.9000000.36912.0000לא ידועלא ידועקפה בריסטה קלוי וטחון002023-05-28 09:59:110000012
7075221ZolVeBegadol72900581731981898801055709465נסקפה קפוצ'ינו וניל 10 יח' 185 גרם21.5000000.2150.0000לא ידועדרום קוריאהנסקפה קפוצ'ינו וניל 10 יח' 185 גרם002023-05-28 09:52:11000000
7075222ZolVeBegadol72900581731981898801055709489נסקפה קפוצ'ינו אגוזים 10 יח' 180 גרם21.5000000.2150.0000לא ידועדרום קוריאהנסקפה קפוצ'ינו אגוזים 10 יח' 180 גרם002023-05-28 09:52:12000000
7075223ZolVeBegadol72900581731981898850389105832סאפה תפוח ליטר12.70000012.70.0000לא ידועתאילנדסאפה תפוח ליטר002023-05-25 16:54:56000000
7075224ZolVeBegadol72900581731981898850632604259יוגטה מקלות תות 24יח'5.0000000.050.0000לא ידועתאילנדיוגטה מקלות תות 24יח'012019-11-29 01:59:50000000
7075225ZolVeBegadol72900581731981898850632604266יוגטה מקלות קולה 24יח'5.0000000.050.0000לא ידועתאילנדיוגטה מקלות קולה 24יח'012019-11-30 04:02:36000000
7075226ZolVeBegadol72900581731981898850632604372יוגטה שטיח תות 24יח'5.0000000.050.0000לא ידועתאילנדיוגטה שטיח תות 24יח'012019-11-29 10:54:17000000
7075227ZolVeBegadol72900581731981898850632604402יוגטה SOUR STICKS - ממתק 50גר5.0000000.050.0000לא ידועלא ידועיוגטה SOUR STICKS - ממתק 50גר012019-11-28 21:27:10000000
7075228ZolVeBegadol72900581731981898850632605201יוגטה חספוסים כתום 50 גרם5.0000000.050.0000לא ידועתאילנדיוגטה חספוסים כתום 50 גרם012019-11-30 04:16:30000000
7075229ZolVeBegadol72900581731981898850632606390יוגטה שטיח אבטיח24יח'4.5000000.0450.0000לא ידועתאילנדיוגטה שטיח אבטיח24יח'012019-11-30 08:21:02000000
7075230ZolVeBegadol72900581731981898888111030053רביעיית מיץ אננס 100% 240 מ\"ל13.7000000.1370.0000לא ידועלא ידוערביעיית מיץ אננס 100% 240 מ\"ל002022-05-19 15:44:24000000
7075231ZolVeBegadol729005817319818990029753938431X30X70G הריבו גולדברס7.9000000.07930.0000לא ידועלא ידוע1X30X70G הריבו גולדברס012023-02-14 14:38:330000030
7075232ZolVeBegadol729005817319818990029753938501X30X70G הריבו וורמס7.9000000.07930.0000לא ידועלא ידוע1X30X70G הריבו וורמס012023-02-14 14:38:340000030
7075233ZolVeBegadol72900581731981899310072028743'טיםטם אורגינל 200גר20.9000000.20924.0000לא ידועלא ידוע'טיםטם אורגינל 200גר002016-11-21 14:04:180000024
7075234ZolVeBegadol72900581731981899310072028750'טיםטם דאבל 200גר20.9000000.20924.0000לא ידועלא ידוע'טיםטם דאבל 200גר002016-11-21 14:03:300000024
7075235ZolVeBegadol72900581731981899771565025005מעריב סוף שבוע5.9000005.90.0000לא ידועלא ידועמעריב סוף שבוע002022-02-02 17:19:35000000
7075236ZolVeBegadol72900581731981899771565051011ידיעות שישי+ חג14.90000014.90.0000לא ידועלא ידועידיעות שישי+ חג002021-11-23 11:08:17000000
7075237ZolVeBegadol72900581731981899771565051066ידיעות שישי+ חג14.90000014.90.0000לא ידועלא ידועידיעות שישי+ חג002021-11-23 11:08:27000000
\n", + "
" + ], + "text/plain": [ + " provider chainid subchainid storeid itemcode \\\n", + "7075218 ZolVeBegadol 7290058173198 1 89 8719200998049 \n", + "7075219 ZolVeBegadol 7290058173198 1 89 8720608014958 \n", + "7075220 ZolVeBegadol 7290058173198 1 89 8801055707966 \n", + "7075221 ZolVeBegadol 7290058173198 1 89 8801055709465 \n", + "7075222 ZolVeBegadol 7290058173198 1 89 8801055709489 \n", + "7075223 ZolVeBegadol 7290058173198 1 89 8850389105832 \n", + "7075224 ZolVeBegadol 7290058173198 1 89 8850632604259 \n", + "7075225 ZolVeBegadol 7290058173198 1 89 8850632604266 \n", + "7075226 ZolVeBegadol 7290058173198 1 89 8850632604372 \n", + "7075227 ZolVeBegadol 7290058173198 1 89 8850632604402 \n", + "7075228 ZolVeBegadol 7290058173198 1 89 8850632605201 \n", + "7075229 ZolVeBegadol 7290058173198 1 89 8850632606390 \n", + "7075230 ZolVeBegadol 7290058173198 1 89 8888111030053 \n", + "7075231 ZolVeBegadol 7290058173198 1 89 9002975393843 \n", + "7075232 ZolVeBegadol 7290058173198 1 89 9002975393850 \n", + "7075233 ZolVeBegadol 7290058173198 1 89 9310072028743 \n", + "7075234 ZolVeBegadol 7290058173198 1 89 9310072028750 \n", + "7075235 ZolVeBegadol 7290058173198 1 89 9771565025005 \n", + "7075236 ZolVeBegadol 7290058173198 1 89 9771565051011 \n", + "7075237 ZolVeBegadol 7290058173198 1 89 9771565051066 \n", + "\n", + " itemname itemprice unitqty \\\n", + "7075218 מזולה בטעם טבעי 10.9 00000 \n", + "7075219 תה ליפטון 1.5 גר' 100 יחידות 15.9 00000 \n", + "7075220 קפה בריסטה קלוי וטחון 36.9 00000 \n", + "7075221 נסקפה קפוצ'ינו וניל 10 יח' 185 גרם 21.5 00000 \n", + "7075222 נסקפה קפוצ'ינו אגוזים 10 יח' 180 גרם 21.5 00000 \n", + "7075223 סאפה תפוח ליטר 12.7 00000 \n", + "7075224 יוגטה מקלות תות 24יח' 5.0 00000 \n", + "7075225 יוגטה מקלות קולה 24יח' 5.0 00000 \n", + "7075226 יוגטה שטיח תות 24יח' 5.0 00000 \n", + "7075227 יוגטה SOUR STICKS - ממתק 50גר 5.0 00000 \n", + "7075228 יוגטה חספוסים כתום 50 גרם 5.0 00000 \n", + "7075229 יוגטה שטיח אבטיח24יח' 4.5 00000 \n", + "7075230 רביעיית מיץ אננס 100% 240 מ\"ל 13.7 00000 \n", + "7075231 1X30X70G הריבו גולדברס 7.9 00000 \n", + "7075232 1X30X70G הריבו וורמס 7.9 00000 \n", + "7075233 'טיםטם אורגינל 200גר 20.9 00000 \n", + "7075234 'טיםטם דאבל 200גר 20.9 00000 \n", + "7075235 מעריב סוף שבוע 5.9 00000 \n", + "7075236 ידיעות שישי+ חג 14.9 00000 \n", + "7075237 ידיעות שישי+ חג 14.9 00000 \n", + "\n", + " unitofmeasureprice qtyinpackage manufacturename manufacturecountry \\\n", + "7075218 0.109 0.0000 לא ידוע ישראל \n", + "7075219 0.159 12.0000 לא ידוע הודו \n", + "7075220 0.369 12.0000 לא ידוע לא ידוע \n", + "7075221 0.215 0.0000 לא ידוע דרום קוריאה \n", + "7075222 0.215 0.0000 לא ידוע דרום קוריאה \n", + "7075223 12.7 0.0000 לא ידוע תאילנד \n", + "7075224 0.05 0.0000 לא ידוע תאילנד \n", + "7075225 0.05 0.0000 לא ידוע תאילנד \n", + "7075226 0.05 0.0000 לא ידוע תאילנד \n", + "7075227 0.05 0.0000 לא ידוע לא ידוע \n", + "7075228 0.05 0.0000 לא ידוע תאילנד \n", + "7075229 0.045 0.0000 לא ידוע תאילנד \n", + "7075230 0.137 0.0000 לא ידוע לא ידוע \n", + "7075231 0.079 30.0000 לא ידוע לא ידוע \n", + "7075232 0.079 30.0000 לא ידוע לא ידוע \n", + "7075233 0.209 24.0000 לא ידוע לא ידוע \n", + "7075234 0.209 24.0000 לא ידוע לא ידוע \n", + "7075235 5.9 0.0000 לא ידוע לא ידוע \n", + "7075236 14.9 0.0000 לא ידוע לא ידוע \n", + "7075237 14.9 0.0000 לא ידוע לא ידוע \n", + "\n", + " manufactureitemdescription bisweighted allowdiscount \\\n", + "7075218 מזולה בטעם טבעי 0 0 \n", + "7075219 תה ליפטון 1.5 גר' 100 יחידות 0 0 \n", + "7075220 קפה בריסטה קלוי וטחון 0 0 \n", + "7075221 נסקפה קפוצ'ינו וניל 10 יח' 185 גרם 0 0 \n", + "7075222 נסקפה קפוצ'ינו אגוזים 10 יח' 180 גרם 0 0 \n", + "7075223 סאפה תפוח ליטר 0 0 \n", + "7075224 יוגטה מקלות תות 24יח' 0 1 \n", + "7075225 יוגטה מקלות קולה 24יח' 0 1 \n", + "7075226 יוגטה שטיח תות 24יח' 0 1 \n", + "7075227 יוגטה SOUR STICKS - ממתק 50גר 0 1 \n", + "7075228 יוגטה חספוסים כתום 50 גרם 0 1 \n", + "7075229 יוגטה שטיח אבטיח24יח' 0 1 \n", + "7075230 רביעיית מיץ אננס 100% 240 מ\"ל 0 0 \n", + "7075231 1X30X70G הריבו גולדברס 0 1 \n", + "7075232 1X30X70G הריבו וורמס 0 1 \n", + "7075233 'טיםטם אורגינל 200גר 0 0 \n", + "7075234 'טיםטם דאבל 200גר 0 0 \n", + "7075235 מעריב סוף שבוע 0 0 \n", + "7075236 ידיעות שישי+ חג 0 0 \n", + "7075237 ידיעות שישי+ חג 0 0 \n", + "\n", + " priceupdatedate unitmeasure quantity \n", + "7075218 2021-10-27 10:43:45 00000 0 \n", + "7075219 2023-07-13 11:22:53 00000 12 \n", + "7075220 2023-05-28 09:59:11 00000 12 \n", + "7075221 2023-05-28 09:52:11 00000 0 \n", + "7075222 2023-05-28 09:52:12 00000 0 \n", + "7075223 2023-05-25 16:54:56 00000 0 \n", + "7075224 2019-11-29 01:59:50 00000 0 \n", + "7075225 2019-11-30 04:02:36 00000 0 \n", + "7075226 2019-11-29 10:54:17 00000 0 \n", + "7075227 2019-11-28 21:27:10 00000 0 \n", + "7075228 2019-11-30 04:16:30 00000 0 \n", + "7075229 2019-11-30 08:21:02 00000 0 \n", + "7075230 2022-05-19 15:44:24 00000 0 \n", + "7075231 2023-02-14 14:38:33 00000 30 \n", + "7075232 2023-02-14 14:38:34 00000 30 \n", + "7075233 2016-11-21 14:04:18 00000 24 \n", + "7075234 2016-11-21 14:03:30 00000 24 \n", + "7075235 2022-02-02 17:19:35 00000 0 \n", + "7075236 2021-11-23 11:08:17 00000 0 \n", + "7075237 2021-11-23 11:08:27 00000 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from price_parser import download_all_prices\n", + "\n", + "df = download_all_prices()\n", + "df1.tail(20)\n", + "#print(df1.tail(20).to_markdown())\n", + "#df.tail(20)" + ] } ], "metadata": { diff --git a/xml_parser.py b/xml_parser.py new file mode 100644 index 0000000..4f49263 --- /dev/null +++ b/xml_parser.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +""" +Used to parse all store file formats +@author: Avi +""" +from lxml import objectify +import codecs + +def get_root(xml_file, encoding): + """get store xml root, in lxml format""" + with codecs.open(xml_file, encoding=encoding, errors="ignore") as store_file: + xml = store_file.read() + #print(xml[:90]) + xml = xml.replace('\r\n','') + xml = xml.encode("UTF-16") + + return objectify.fromstring(xml) \ No newline at end of file