Improved ingredient search using NLTK wordnet, updated version from 0.3.3 to 0.3.4
TurconiAndrea · Jul 24, 2021 · 64372a0 · 64372a0
1 parent aa2eb8f
commit 64372a0
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 This package provides a classification and tagging system for ingredients and recipes. 
 The functioning of the package is based on a dataset containing more than 700 ingredients mapped with their own class. 
-If a provided ingredient is not mapped into the dataset, the library search for it on wikipedia pages and into the dictionary.
+If a provided ingredient is not mapped into the dataset, the library searches for it on Wikipedia pages, in the dictionary and in NLTK WordNet to find the best possible class. 
 
 An ingredient could be classified in one of the following class: 
 - Vegetable
@@ -15,6 +15,7 @@ An ingredient could be classified in one of the following class:
 - Condiment
 - Nut
 - Seafood
+- Dessert
 
 A recipe is tagged based on its ingredients class. 
 The library also provides a function to get the class percentage of recipe ingredients. 
@@ -54,5 +55,5 @@ recipe_tagger.get_recipe_class_percentage(['aubergine', 'chicken', 'beef'])
 
 ### Todo
 - [x] Handling of Wikipedia pages.
-- [ ] Better search over dictionary and Wikipedia pages of ingredient. 
+- [x] Better search over dictionary and Wikipedia pages of ingredient. 
 - [ ] Possibility to add ingredient after search if it is not present. 
diff --git a/recipe_tagger/foodcategory.py b/recipe_tagger/foodcategory.py
@@ -12,6 +12,7 @@ class FoodCategory(Enum):
     condiment = 7  
     nut = 8  
     seafood = 9
+    dessert = 10
 
 class CategorySynset():
     categories = [wordnet.synset(f'{FoodCategory.vegetable.name}.n.01'),

diff --git a/recipe_tagger/recipe_tagger.py b/recipe_tagger/recipe_tagger.py
@@ -15,8 +15,13 @@
 embedding_path = 'data/ingredient_embedding.npy'
 
 def __get_embedding():
-    embedding = io.BytesIO(pkgutil.get_data(__name__, embedding_path))
-    return np.load(embedding, allow_pickle=True).item()
+    """
+    Get the dataset of ingredients as a dictionary.
+
+    :return: a dictionary representing the embedding
+    """
+    embedding_io = io.BytesIO(pkgutil.get_data(__name__, embedding_path))
+    return np.load(embedding_io, allow_pickle=True).item()
 
 def lemmatize_word(word):
     """
@@ -36,6 +41,7 @@ def is_ingredient_vegan(ingredient):
     :param ingredient: the name of the ingredient.
     :return: a bool indicating whether the ingredient is vegan or not.
     """
+    ingredient = ingredient.strip()
     shelf = Shelf('Milan', month_id=0)
     results = shelf.process_ingredients([ingredient])
     return results['labels']['vegan']
@@ -53,12 +59,28 @@ def is_recipe_vegan(ingredients):
     return results['labels']['vegan']
 
 def add_ingredient(ingredient, tag):
+    """
+    Map the provided ingredient and the tag into the embedding dataset. 
+    Tag must be one of the following FoodCategory names: 
+    vegetable, fruit, meat, legume, dairy, egg, staple, condiment, nut, seafood, dessert
+
+    :param ingredient: the name of the ingredient.
+    :param tag: the class of the ingredient. Must be one of those listed above.
+    :return: a bool indicating whether the operation has succeeded or not. 
+    """
     embedding = __get_embedding()
+    ingredient = ingredient.strip()
+    tag = tag.strip()
+    if ingredient in embedding:
+        return False
+
+    embedding[ingredient] = FoodCategory[tag].value
+    return True
 
 def search_ingredient_hypernyms(ingredient):
     """
     Predict the class of the provided ingredient based on the Wu & Palmer’s similarity between
-    ingredient, his hypernyms and the 10 FoodCategory. 
+    ingredient, its hypernyms and the 11 FoodCategory values. 
     The FoodCategory is chosen based on the maximum similarity value between the ingredient, 
     its hypernym and the various categories. If the predicted category is different between ingredient
     and hypernym the category is chosen based on the average of both. 
@@ -116,6 +138,7 @@ def get_ingredient_class(ingredient):
     :return: the class of the ingredient.
     """
     embedding = __get_embedding()
+    ingredient = ingredient.strip()
     lemmatized_ing = lemmatize_word(ingredient)
     if lemmatized_ing in embedding:
         return FoodCategory(embedding[lemmatized_ing]).name
@@ -139,7 +162,7 @@ def get_recipe_class_percentage(ingredients):
 def get_recipe_tags(ingredients):
     """
     Classify a recipe in tags based on its ingredient. 
-    Tag could be: Vegetable, Fruit, Meat, Legume, Diary, Egg. 
+    Tag could be: Vegetable, Fruit, Meat, Legume, Dairy, Egg, Staple, Condiment, Nut, Seafood 
 
     :param ingredients: list of ingredients in the recipe.
     :return: set of tags for the recipe. 

diff --git a/setup.py b/setup.py
@@ -10,16 +10,16 @@
 setup(
     name='recipe-tagger',
     packages=find_packages(include=['recipe_tagger']),
-    version='0.3.3',
+    version='0.3.4',
     description='A library for tagging and classify recipes',
     author='Andrea Turconi',
     license='MIT',
     long_description=README,
     long_description_content_type="text/markdown",
     url='https://github.com/TurconiAndrea/recipe-tagger',
-    download_url='https://github.com/TurconiAndrea/recipe-tagger/archive/refs/tags/0.3.3.tar.gz',
+    download_url='https://github.com/TurconiAndrea/recipe-tagger/archive/refs/tags/0.3.4.tar.gz',
     keywords=['food', 'recipe', 'tag', 'tagging', 'ingredient'],
-    install_requires=['wikipedia-api', 'PyDictionary', 'textblob', 'pyfood', 'unidecode', 'numpy'],
+    install_requires=['wikipedia-api', 'PyDictionary', 'textblob', 'pyfood', 'unidecode', 'numpy', 'nltk'],
     test_suite='tests',
     package_data={'': ['data/*.npy']},
 )
diff --git a/tests/test_recipe_tagger.py b/tests/test_recipe_tagger.py
@@ -17,6 +17,10 @@ def test_is_recipe_vegan():
     assert recipe_tagger.is_recipe_vegan(['apple', 'chicken']) == False
     assert recipe_tagger.is_recipe_vegan(['apple', 'pear']) == True
 
+#@pytest.mark.skip()
+def test_add_ingredient():
+    assert recipe_tagger.add_ingredient('milk', 'dairy') == True
+
 #@pytest.mark.skip()
 def test_search_ingredient_hypernyms():
     assert recipe_tagger.search_ingredient_hypernyms('pear') == 'fruit'
@@ -39,11 +43,11 @@ def test_get_ingredient_class():
     assert recipe_tagger.get_ingredient_class('cattle') == 'meat'
     assert recipe_tagger.get_ingredient_class('milk') == 'dairy'
 
-@pytest.mark.skip()
+#@pytest.mark.skip()
 def test_get_recipe_class_percentage():
     assert recipe_tagger.get_recipe_class_percentage(['chicken', 'sausage', 'apple']) == [('meat', '66.67%'), ('fruit', '33.33%')]
 
-@pytest.mark.skip()
+#@pytest.mark.skip()
 def test_get_recipe_tags():
     assert recipe_tagger.get_recipe_tags(['aubergine']) == ['vegetable']
     assert 'fruit' in recipe_tagger.get_recipe_tags(['pear', 'apple', 'aubergine'])