Merge pull request #1 from TurconiAndrea/dev

Changes for 0.4.0
TurconiAndrea · Aug 23, 2021 · cb5e165 · cb5e165
2 parents 2b46d3c + 9140dd7
commit cb5e165
Show file tree

Hide file tree

Showing 13 changed files with 1,293 additions and 70 deletions.
diff --git a/recipe_tagger/data/ingredient_embedding.npy b/recipe_tagger/data/ingredient_embedding.npy
diff --git a/recipe_tagger/data/ingredient_embedding_en.npy b/recipe_tagger/data/ingredient_embedding_en.npy
diff --git a/recipe_tagger/data/ingredient_embedding_it.npy b/recipe_tagger/data/ingredient_embedding_it.npy
diff --git a/recipe_tagger/data/ingredient_waterfootprint_en.npy b/recipe_tagger/data/ingredient_waterfootprint_en.npy
diff --git a/recipe_tagger/data/ingredient_waterfootprint_it.npy b/recipe_tagger/data/ingredient_waterfootprint_it.npy
diff --git a/recipe_tagger/foodcategory.py b/recipe_tagger/foodcategory.py
@@ -26,6 +26,8 @@ class FoodCategory(Enum):
     seafood = 9
     snack = 10
     mushroom = 11
+    dessert = 12
+    beverage = 13
 
 
 class CategorySynset:
@@ -51,4 +53,28 @@ class CategorySynset:
         wordnet.synset(f"{FoodCategory.seafood.name}.n.01"),
         wordnet.synset(f"{FoodCategory.snack.name}.n.01"),
         wordnet.synset(f"{FoodCategory.mushroom.name}.n.01"),
+        wordnet.synset(f"{FoodCategory.dessert.name}.n.01"),
+        wordnet.synset(f"{FoodCategory.beverage.name}.n.01"),
     ]
+
+
+class FoodCategoryWaterFootprint(Enum):
+    """
+    Default water footprint of each food category of ingredients
+    (presumably in litres per kg -- unit to be confirmed).
+    """
+
+    vegetable = 354
+    fruit = 962
+    legume = 4055
+    meat = 8623
+    egg = 3265
+    dairy = 3178
+    staple = 1644
+    condiment = 2364
+    nut = 9063
+    seafood = 2590
+    snack = 2980
+    mushroom = 322
+    dessert = 3140
+    beverage = 732
diff --git a/recipe_tagger/recipe_tagger.py b/recipe_tagger/recipe_tagger.py
@@ -3,55 +3,24 @@
 """
 
 
-import io
-import pkgutil
 import re
 from collections import Counter
 
-import numpy as np
 import wikipediaapi
-from nltk import tag
 from nltk.corpus import wordnet
 from PyDictionary import PyDictionary
 from pyfood.utils import Shelf
 from textblob import Word
 
-from .foodcategory import CategorySynset, FoodCategory
-
-embedding_path = "data/ingredient_embedding.npy"
-
-
-def __get_embedding():
-    """
-    Get the dataset of ingredients as a dictionary.
-
-    :return: a dictionary representing the embedding
-    """
-    embedding_io = io.BytesIO(pkgutil.get_data(__name__, embedding_path))
-    return np.load(embedding_io, allow_pickle=True).item()
-
+from recipe_tagger import util
 
-def __remove_punctuation(word):
-    """
-    Format the provided word to mantain only the clean word.
-
-    :param word: the provided word to be cleaned.
-    :return: the word cleaned.
-    """
-    word = word.strip()
-    return re.sub(r"[^\w\s]", "", word)
-
-
-def lemmatize_word(word):
-    """
-    Lemmatize the provided word.
-    Lemmatization is the process of converting a word to its base form.
+from .foodcategory import CategorySynset, FoodCategory
+from .util import get_embedding, process_ingredients
 
-    :param word: the word to be lemmatized.
-    :return: the word lemmatized.
-    """
-    w = Word(word)
-    return w.lemmatize()
+food_embedding_paths = {
+    "en": "data/ingredient_embedding_en.npy",
+    "it": "data/ingredient_embedding_it.npy",
+}
 
 
 def is_ingredient_vegan(ingredient):
@@ -80,18 +49,19 @@ def is_recipe_vegan(ingredients):
     return results["labels"]["vegan"]
 
 
-def add_ingredient(ingredient, tag):
+def add_ingredient(ingredient, tag, language="en"):
     """
     Map the provided ingredient and the tag into the embedding dataset.
     Tag must be one the following FoodCategory:
     vegetable, fruit, meat, legume, diary, egg, staple,
-    condiment, nut, seafood, dessert
+    condiment, nut, seafood, dessert.
 
     :param ingredient: the name of the ingredient.
     :param tag: the class of the ingredient. Must be one of the listed above.
+    :param language: the language of the ingredient.
     :return: a bool indicating if the operation has succeded or not.
     """
-    embedding = __get_embedding()
+    embedding = get_embedding(food_embedding_paths[language])
     ingredient = ingredient.strip()
     tag = tag.strip()
     if ingredient in embedding:
@@ -141,7 +111,7 @@ def search_ingredient_hypernyms(ingredient):
         return FoodCategory(sum.index(max(sum))).name
 
 
-def search_ingredient_class(ingredient):
+def search_ingredient_class(ingredient, language="en"):
     """
     Search on wikipedia and english dictionary the class of
     the provided ingredient.
@@ -155,7 +125,7 @@ def search_ingredient_class(ingredient):
         ingredient = ingredient.split(" ")[-1]
 
     dictionary = PyDictionary()
-    wiki = wikipediaapi.Wikipedia("en")
+    wiki = wikipediaapi.Wikipedia(language)
 
     page = wiki.page(ingredient)
     meaning = (
@@ -174,40 +144,40 @@ def search_ingredient_class(ingredient):
     return max(categories, key=categories.count) if len(categories) else None
 
 
-def get_ingredient_class(ingredient):
+def get_ingredient_class(ingredient, language="en"):
     """
     Predict the class of the provided ingredient based on the embeddings.
     If the ingredient cannot be found in the dictionary it will be
     searched on wikipedia pages or hypernyms.
 
     :param ingredient: the name of the ingredient.
+    :param language: the language of the ingredient.
     :return: the class of the ingredient.
     """
-    embedding = __get_embedding()
-    ingredient = __remove_punctuation(ingredient)
-    lemmatized_ing = lemmatize_word(ingredient)
-    if lemmatized_ing in embedding:
-        return FoodCategory(embedding[lemmatized_ing]).name
+    embedding = get_embedding(food_embedding_paths[language])
+    cleaned_ing = process_ingredients(ingredient, language=language)
+    if cleaned_ing in embedding:
+        return FoodCategory(embedding[cleaned_ing]).name
     else:
-        web_class = search_ingredient_class(ingredient)
-        hyp_class = search_ingredient_hypernyms(lemmatized_ing)
+        web_class = search_ingredient_class(ingredient, language)
+        hyp_class = search_ingredient_hypernyms(cleaned_ing)
         return web_class if web_class else hyp_class
 
 
-def get_recipe_class_percentage(ingredients):
+def get_recipe_class_percentage(ingredients, language="en"):
     """
     Classify a recipe in tags based on its ingredient.
     Returns the percentages of ingredient class in the recipe provided.
 
     :param ingredients: list of ingredients in the recipe.
     :return: list of tuples containg classes and percentages.
     """
-    tags = [get_ingredient_class(ingredient) for ingredient in ingredients]
+    tags = [get_ingredient_class(ingredient, language) for ingredient in ingredients]
     c = Counter(tags)
     return [(i, str(round(c[i] / len(tags) * 100.0, 2)) + "%") for i in c]
 
 
-def get_recipe_tags(ingredients):
+def get_recipe_tags(ingredients, language="en"):
     """
     Classify a recipe in tags based on its ingredient.
     Tag could be: Vegetable, Fruit, Meat, Legume, Diary,
@@ -216,9 +186,10 @@ def get_recipe_tags(ingredients):
     :param ingredients: list of ingredients in the recipe.
     :return: set of tags for the recipe.
     """
-    tags = [get_ingredient_class(ingredient) for ingredient in ingredients]
+    tags = [get_ingredient_class(ingredient, language) for ingredient in ingredients]
     if None in tags:
         tags.remove(None)
     if len(tags) >= 2 and FoodCategory.condiment.name in tags:
         tags.remove(FoodCategory.condiment.name)
+    print(tags)
     return list(set(tags)) if len(tags) else tags
diff --git a/recipe_tagger/recipe_waterfootprint.py b/recipe_tagger/recipe_waterfootprint.py
@@ -0,0 +1,124 @@
+"""
+Module containing all the methods in order to compute the water footprint of an ingredient or recipe. 
+"""
+
+import re
+
+import numpy as np
+from nltk.corpus.reader import toolbox
+
+from .foodcategory import FoodCategoryWaterFootprint
+from .recipe_tagger import get_ingredient_class
+from .util import get_embedding, process_ingredients
+
+waterfootprint_embedding_paths = {
+    "en": "data/ingredient_waterfootprint_en.npy",
+    "it": "data/ingredient_waterfootprint_it.npy",  # file name as shipped in data/
+}
+
+
+def __calculate_waterfootprint(wf_ing, quantity):
+    """
+    Scale a per-kilogram water footprint (l/kg) to the given quantity (gr).
+
+    :param wf_ing: the water footprint of the ingredient, in l/kg.
+    :param quantity: the quantity of the ingredient, in gr.
+    :return: the water footprint of that quantity, rounded to 2 decimals.
+    """
+    litres = (wf_ing * quantity) / 1000  # gr -> kg
+    return round(litres, 2)
+
+
+def __get_default_waterfootprint(ingredient, language="en"):
+    """
+    Get the default water footprint of the food category of an ingredient.
+    The recipe tagger module is used to predict the class of the ingredient.
+
+    :param ingredient: the ingredient to be classified.
+    :param language: the language of the ingredient.
+    :return: the category default, or 50 when no category can be predicted.
+    """
+    ing_class = get_ingredient_class(ingredient, language)
+    return FoodCategoryWaterFootprint[ing_class].value if ing_class is not None else 50
+
+
+def __get_quantites_formatted(ingredients, quantities, language):
+    """
+    Get the list of quantities well formatted in the same unit (gr).
+    :param ingredients: the list containing the ingredients.
+    :param quantities: the list containing quantities of the ingredients,
+        as strings such as "10gr", "1.5kg", "2unit" (pieces) or a bare number.
+    :param language: the language of the ingredients.
+    :return: a list with the quantities well formatted in gr.
+    """
+    embedding = get_embedding(waterfootprint_embedding_paths[language])
+    # Grams per unit; volumes are treated as water (1 l == 1 kg, 1 ml == 1 gr).
+    units = {"ml": 1.0, "gr": 1.0, "kg": 1000.0, "L": 1000.0, "l": 1000.0}
+    formatted = []
+    for index, quantity in enumerate(quantities):
+        # Keep decimals together so "1.5kg" is read as 1.5, not as 1 and 5.
+        value_unit = re.findall(r"[A-Za-z]+|\d+(?:\.\d+)?", quantity)
+        if len(value_unit) != 2:
+            formatted.append(float(value_unit[0]))
+        elif value_unit[1] == "unit":
+            # embedding[...][1] is presumably the weight (gr) of one piece -- confirm.
+            formatted.append(float(value_unit[0]) * embedding[ingredients[index]][1])
+        elif value_unit[1] == "None":
+            formatted.append(0.0)
+        else:
+            formatted.append(float(value_unit[0]) * units[value_unit[1]])
+    return formatted
+
+
+def get_ingredient_waterfootprint(ingredient, quantity, process=False, language="en"):
+    """
+    Compute the water footprint of an ingredient for the given quantity.
+    When the ingredient has no entry in the water footprint embedding, the
+    recipe tagger module predicts its category and the default footprint of
+    that category is used instead.
+
+    :param ingredient: the name of the ingredient.
+    :param quantity: the quantity of the ingredient, in gr.
+    :param process: a bool telling whether the ingredient must be processed first.
+    :param language: the language of the ingredient.
+    :return: the water footprint of the provided ingredient.
+    """
+    footprints = get_embedding(waterfootprint_embedding_paths[language])
+    if process:
+        ingredient = process_ingredients(ingredient, language=language)
+    if ingredient in footprints:
+        # The first field of the entry is used as the footprint.
+        per_kg = int(footprints[ingredient][0])
+    else:
+        # Unknown ingredient: use the default of its predicted category.
+        per_kg = __get_default_waterfootprint(ingredient, language)
+    return __calculate_waterfootprint(per_kg, quantity)
+
+
+def get_recipe_waterfootprint(
+    ingredients, quantities, information=False, language="en"
+):
+    """
+    Get the water footprint of a recipe, providing the ingredients and the
+    quantities for each ingredient. Params ingredients and quantities must have
+    the same length. Quantities are strings containing the value and the unit
+    without spaces (10gr).
+    :param ingredients: a list containing all the ingredients of the recipe.
+    :param quantities: a list containing all the quantities of the recipe ingredients.
+    :param information: a bool; if true, per-ingredient footprints are returned too.
+    :param language: the language of the ingredients.
+    :return: the water footprint of the recipe; if the information param is set to
+    true, also a dictionary with all ingredients and their computed water footprints.
+    """
+    proc_ingredients = [process_ingredients(i, language=language) for i in ingredients]
+    quantities = __get_quantites_formatted(proc_ingredients, quantities, language)
+    total_wf = 0
+    information_wf = {}
+    for idx, ingredient in enumerate(ingredients):
+        # Already processed above; language goes by keyword (3rd positional is process).
+        ing_wf = get_ingredient_waterfootprint(
+            proc_ingredients[idx], quantities[idx], language=language
+        )
+        information_wf[ingredient] = ing_wf
+        total_wf = round(total_wf + ing_wf, 2)
+    return (total_wf, information_wf) if information else total_wf