From ef7fc5f5a957432afcc85979d7358fb5fba5ca86 Mon Sep 17 00:00:00 2001
From: Gordon Tyler
Date: Thu, 14 Feb 2019 19:19:13 -0500
Subject: [PATCH] Fix recipe link discovery.

Now includes additional categories that were broken out of the normal
level range categories.
---
 main.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/main.py b/main.py
index 49785c9..db02e57 100644
--- a/main.py
+++ b/main.py
@@ -60,8 +60,12 @@
     70: [ 220, 230, 250, 280, 310 ], # 290, 300, 320, 350, 380
 }
 
-LEVEL_RANGE = ["{0}-{1}".format(start, start + 4) for start in range(1, 70, 5)]
-NUM_LEVEL_RANGES = len(LEVEL_RANGE)
+MAX_LEVEL = 70
+LEVEL_RANGES = ["{0}-{1}".format(start, start + 4) for start in range(1, MAX_LEVEL, 5)]
+NUM_LEVEL_RANGES = len(LEVEL_RANGES)
+NUM_ADDITIONAL_CATEGORIES = 6
+LINK_CATEGORIES = ['%d' % (level_range,) for level_range in range(0, NUM_LEVEL_RANGES)] + \
+    ['c%d' % (cat,) for cat in range(1, NUM_ADDITIONAL_CATEGORIES+1)]
 
 EMBED_CODE_RE = re.compile("\\[db:recipe=([0-9a-f]+)]")
 
@@ -86,6 +90,7 @@ async def wait_with_progress(coros: list, desc: str = None, unit: str = None):
 async def fetch(session: aiohttp.ClientSession, url: str, **kwargs):
     err_count = 0
     while err_count < 5:
+        # noinspection PyBroadException
         try:
             async with FETCH_SEMAPHORE:
                 async with session.get(url, **kwargs) as res:
@@ -115,20 +120,20 @@ def parse_recipe_links_page(text: str):
     return links, show_end, total
 
 
-async def fetch_recipe_links_page(session: aiohttp.ClientSession, cls: str, level_range: int, page: int):
+async def fetch_recipe_links_page(session: aiohttp.ClientSession, cls: str, category: str, page: int):
     params = {
         "category2": CLASSES.index(cls),
-        "category3": level_range,
+        "category3": category,
         "page": page,
     }
     return parse_recipe_links_page(await fetch(session, RECIPE_LIST_URL, params=params))
 
 
-async def fetch_recipe_links_range(session: aiohttp.ClientSession, cls: str, level_range: int):
+async def fetch_recipe_links_range(session: aiohttp.ClientSession, cls: str, category: str):
     links = []
     page = 1
     while True:
-        page_links, show_end, total = await fetch_recipe_links_page(session, cls, level_range, page)
+        page_links, show_end, total = await fetch_recipe_links_page(session, cls, category, page)
         links += page_links
         if show_end < total:
             page += 1
@@ -139,7 +144,7 @@ async def fetch_recipe_links_range(session: aiohttp.ClientSession, cls: str, lev
 
 async def fetch_recipe_links(session: aiohttp.ClientSession, cls: str):
     results = wait_with_progress(
-        [fetch_recipe_links_range(session, cls, level_range) for level_range in range(0, NUM_LEVEL_RANGES)],
+        [fetch_recipe_links_range(session, cls, category) for category in LINK_CATEGORIES],
         desc=f"Fetching {cls} links",
         unit=""
     )
@@ -259,7 +264,7 @@ async def fetch_class(session: aiohttp.ClientSession, additional_languages: dict
 
 async def scrape_to_file(session: aiohttp.ClientSession, additional_languages: dict, cls: str):
     recipes = await fetch_class(session, additional_languages, cls)
-    with open("out/" + cls + ".json", mode="wt", encoding="utf-8") as db_file:
+    with open(f"out/{cls}.json", mode="wt", encoding="utf-8") as db_file:
         json.dump(recipes, db_file, indent=2, sort_keys=True, ensure_ascii=False)