Skip to content

Commit

Permalink
Merge pull request #1 from TurconiAndrea/dev
Browse files Browse the repository at this point in the history
Changes for 0.4.0
  • Loading branch information
TurconiAndrea authored Aug 23, 2021
2 parents 2b46d3c + 9140dd7 commit cb5e165
Show file tree
Hide file tree
Showing 13 changed files with 1,293 additions and 70 deletions.
Binary file removed recipe_tagger/data/ingredient_embedding.npy
Binary file not shown.
Binary file added recipe_tagger/data/ingredient_embedding_en.npy
Binary file not shown.
Binary file added recipe_tagger/data/ingredient_embedding_it.npy
Binary file not shown.
Binary file not shown.
Binary file not shown.
26 changes: 26 additions & 0 deletions recipe_tagger/foodcategory.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class FoodCategory(Enum):
seafood = 9
snack = 10
mushroom = 11
dessert = 12
beverage = 13


class CategorySynset:
Expand All @@ -51,4 +53,28 @@ class CategorySynset:
wordnet.synset(f"{FoodCategory.seafood.name}.n.01"),
wordnet.synset(f"{FoodCategory.snack.name}.n.01"),
wordnet.synset(f"{FoodCategory.mushroom.name}.n.01"),
wordnet.synset(f"{FoodCategory.dessert.name}.n.01"),
wordnet.synset(f"{FoodCategory.beverage.name}.n.01"),
]


class FoodCategoryWaterFootprint(Enum):
    """
    Fallback water footprint associated with each food category.

    Member names are the food category names; each value is the footprint
    used for an ingredient of that category.
    NOTE(review): values are presumably expressed in l/kg -- confirm.
    """

    vegetable = 354
    fruit = 962
    legume = 4055
    meat = 8623
    egg = 3265
    dairy = 3178
    staple = 1644
    condiment = 2364
    nut = 9063
    seafood = 2590
    snack = 2980
    mushroom = 322
    dessert = 3140
    beverage = 732
81 changes: 26 additions & 55 deletions recipe_tagger/recipe_tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,55 +3,24 @@
"""


import io
import pkgutil
import re
from collections import Counter

import numpy as np
import wikipediaapi
from nltk import tag
from nltk.corpus import wordnet
from PyDictionary import PyDictionary
from pyfood.utils import Shelf
from textblob import Word

from .foodcategory import CategorySynset, FoodCategory

embedding_path = "data/ingredient_embedding.npy"


def __get_embedding():
"""
Get the dataset of ingredients as a dictionary.
:return: a dictionary representing the embedding
"""
embedding_io = io.BytesIO(pkgutil.get_data(__name__, embedding_path))
return np.load(embedding_io, allow_pickle=True).item()

from recipe_tagger import util

def __remove_punctuation(word):
"""
Format the provided word to mantain only the clean word.
:param word: the provided word to be cleaned.
:return: the word cleaned.
"""
word = word.strip()
return re.sub(r"[^\w\s]", "", word)


def lemmatize_word(word):
"""
Lemmatize the provided word.
Lemmatization is the process of converting a word to its base form.
from .foodcategory import CategorySynset, FoodCategory
from .util import get_embedding, process_ingredients

:param word: the word to be lemmatized.
:return: the word lemmatized.
"""
w = Word(word)
return w.lemmatize()
food_embedding_paths = {
"en": "data/ingredient_embedding_en.npy",
"it": "data/ingredient_embedding_it.npy",
}


def is_ingredient_vegan(ingredient):
Expand Down Expand Up @@ -80,18 +49,19 @@ def is_recipe_vegan(ingredients):
return results["labels"]["vegan"]


def add_ingredient(ingredient, tag):
def add_ingredient(ingredient, tag, language="en"):
"""
Map the provided ingredient and the tag into the embedding dataset.
Tag must be one the following FoodCategory:
vegetable, fruit, meat, legume, diary, egg, staple,
condiment, nut, seafood, dessert
condiment, nut, seafood, dessert.
:param ingredient: the name of the ingredient.
:param tag: the class of the ingredient. Must be one of the listed above.
:param language: the language of the ingredient.
:return: a bool indicating if the operation has succeded or not.
"""
embedding = __get_embedding()
embedding = get_embedding(food_embedding_paths[language])
ingredient = ingredient.strip()
tag = tag.strip()
if ingredient in embedding:
Expand Down Expand Up @@ -141,7 +111,7 @@ def search_ingredient_hypernyms(ingredient):
return FoodCategory(sum.index(max(sum))).name


def search_ingredient_class(ingredient):
def search_ingredient_class(ingredient, language="en"):
"""
Search on wikipedia and english dictionary the class of
the provided ingredient.
Expand All @@ -155,7 +125,7 @@ def search_ingredient_class(ingredient):
ingredient = ingredient.split(" ")[-1]

dictionary = PyDictionary()
wiki = wikipediaapi.Wikipedia("en")
wiki = wikipediaapi.Wikipedia(language)

page = wiki.page(ingredient)
meaning = (
Expand All @@ -174,40 +144,40 @@ def search_ingredient_class(ingredient):
return max(categories, key=categories.count) if len(categories) else None


def get_ingredient_class(ingredient):
def get_ingredient_class(ingredient, language="en"):
"""
Predict the class of the provided ingredient based on the embeddings.
If the ingredient cannot be found in the dictionary it will be
searched on wikipedia pages or hypernyms.
:param ingredient: the name of the ingredient.
:param language: the language of the ingredient.
:return: the class of the ingredient.
"""
embedding = __get_embedding()
ingredient = __remove_punctuation(ingredient)
lemmatized_ing = lemmatize_word(ingredient)
if lemmatized_ing in embedding:
return FoodCategory(embedding[lemmatized_ing]).name
embedding = get_embedding(food_embedding_paths[language])
cleaned_ing = process_ingredients(ingredient, language=language)
if cleaned_ing in embedding:
return FoodCategory(embedding[cleaned_ing]).name
else:
web_class = search_ingredient_class(ingredient)
hyp_class = search_ingredient_hypernyms(lemmatized_ing)
web_class = search_ingredient_class(ingredient, language)
hyp_class = search_ingredient_hypernyms(cleaned_ing)
return web_class if web_class else hyp_class


def get_recipe_class_percentage(ingredients):
def get_recipe_class_percentage(ingredients, language="en"):
"""
Classify a recipe in tags based on its ingredient.
Returns the percentages of ingredient class in the recipe provided.
:param ingredients: list of ingredients in the recipe.
:return: list of tuples containg classes and percentages.
"""
tags = [get_ingredient_class(ingredient) for ingredient in ingredients]
tags = [get_ingredient_class(ingredient, language) for ingredient in ingredients]
c = Counter(tags)
return [(i, str(round(c[i] / len(tags) * 100.0, 2)) + "%") for i in c]


def get_recipe_tags(ingredients):
def get_recipe_tags(ingredients, language="en"):
"""
Classify a recipe in tags based on its ingredient.
Tag could be: Vegetable, Fruit, Meat, Legume, Diary,
Expand All @@ -216,9 +186,10 @@ def get_recipe_tags(ingredients):
:param ingredients: list of ingredients in the recipe.
:return: set of tags for the recipe.
"""
tags = [get_ingredient_class(ingredient) for ingredient in ingredients]
tags = [get_ingredient_class(ingredient, language) for ingredient in ingredients]
if None in tags:
tags.remove(None)
if len(tags) >= 2 and FoodCategory.condiment.name in tags:
tags.remove(FoodCategory.condiment.name)
print(tags)
return list(set(tags)) if len(tags) else tags
124 changes: 124 additions & 0 deletions recipe_tagger/recipe_waterfootprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""
Module containing all the methods in order to compute the water footprint of an ingredient or recipe.
"""

import re

import numpy as np
from nltk.corpus.reader import toolbox

from .foodcategory import FoodCategoryWaterFootprint
from .recipe_tagger import get_ingredient_class
from .util import get_embedding, process_ingredients

# Paths of the water footprint embeddings, keyed by language code; the path
# selected for a language is handed to get_embedding.
# NOTE(review): the Italian file name ends in "_ita" while the English one ends
# in "_en" -- confirm it matches the data file actually shipped.
waterfootprint_embedding_paths = {
"en": "data/ingredient_waterfootprint_en.npy",
"it": "data/ingredient_waterfootprint_ita.npy",
}


def __calculate_waterfootprint(wf_ing, quantity):
    """
    Scale a per-kilogram water footprint down to the provided quantity.
    :param wf_ing: the water footprint of the ingredient (l/kg).
    :param quantity: the quantity of the ingredient (in gr).
    :return: the water footprint for that quantity, rounded to two decimals.
    """
    grams_per_kg = 1000
    scaled = (wf_ing * quantity) / grams_per_kg
    return round(scaled, 2)


def __get_default_waterfootprint(ingredient, language="en"):
    """
    Get the default water footprint of the food category of an ingredient.
    The recipe tagger module is used to predict the class of the ingredient.
    :param ingredient: the ingredient to be classified.
    :param language: the language of the ingredient.
    :return: the default water footprint of the predicted category, or 50
        when no category could be predicted.
    """
    ing_class = get_ingredient_class(ingredient, language)
    if ing_class is None:
        # No category could be predicted: fall back to a fixed footprint.
        return 50
    return FoodCategoryWaterFootprint[ing_class].value


def __get_quantites_formatted(ingredients, quantities, language):
    """
    Get the list of quantities converted to the same unit (gr).

    Each quantity is a string holding a number optionally followed by a unit,
    for example "10gr", "1.5kg", "500ml", "2unit" or "250". A quantity that
    does not split into exactly a value and a unit is read as a plain number.
    Volumes are converted assuming 1 ml weighs 1 gr. A unit of "None" yields
    0.0. A unit of "unit" multiplies the value by the second entry stored for
    the ingredient in the water footprint embedding (presumably the weight in
    gr of a single piece -- confirm against the embedding data).
    :param ingredients: the list containing the ingredients.
    :param quantities: the list containing quantities of the ingredients.
    :param language: the language of the ingredients, used to select the
        embedding when a quantity is expressed in "unit".
    :return: a list with the quantities expressed in gr.
    :raises KeyError: if a unit is not supported, or a "unit" quantity refers
        to an ingredient missing from the embedding.
    """
    # Grams per unit of measure; litres and millilitres share one scale
    # (1 l = 1000 ml = 1000 gr).
    grams_per_unit = {"ml": 1.0, "gr": 1.0, "kg": 1000.0, "L": 1000.0, "l": 1000.0}
    # The embedding is only needed for "unit" quantities: load it on demand.
    embedding = None
    formatted = []
    for position, quantity in enumerate(quantities):
        # The numeric alternative accepts decimals so "1.5kg" is not split
        # into "1", "5" and "kg".
        tokens = re.findall(r"\d*\.?\d+|[A-Za-z]+", quantity)
        if len(tokens) != 2:
            formatted.append(float(tokens[0]))
            continue
        amount, unit = tokens
        if unit == "None":
            formatted.append(0.0)
        elif unit == "unit":
            if embedding is None:
                embedding = get_embedding(waterfootprint_embedding_paths[language])
            formatted.append(float(amount) * embedding[ingredients[position]][1])
        else:
            formatted.append(float(amount) * grams_per_unit[unit])
    return formatted


def get_ingredient_waterfootprint(ingredient, quantity, process=False, language="en"):
    """
    Compute the water footprint of an ingredient for the given quantity.
    When the ingredient is missing from the water footprint embedding, the
    recipe tagger module is used to predict its category and the default
    footprint of that category is used instead.
    :param ingredient: the name of the ingredient.
    :param quantity: the quantity of ingredient to calculate water footprint. (in gr)
    :param process: a bool indicating if the provided ingredient must be processed.
    :param language: the language of the ingredient.
    :return: the water footprint of the provided ingredient.
    """
    footprints = get_embedding(waterfootprint_embedding_paths[language])
    if process:
        ingredient = process_ingredients(ingredient, language=language)
    if ingredient in footprints:
        # The first entry stored for an ingredient is its footprint.
        footprint = int(footprints[ingredient][0])
    else:
        footprint = __get_default_waterfootprint(ingredient, language)
    return __calculate_waterfootprint(footprint, quantity)


def get_recipe_waterfootprint(
    ingredients, quantities, information=False, language="en"
):
    """
    Get the water footprint of a recipe, providing the ingredients and the
    quantities for each ingredient. Params ingredients and quantities must have
    the same length. Quantities are strings containing the value and the unit
    without spaces (10gr).
    :param ingredients: a list containing all the ingredients of the recipe.
    :param quantities: a list containing all the quantities of the recipe ingredients.
    :param information: a bool indicating if the water footprint of every
        ingredient must be returned together with the total.
    :param language: the language of the ingredients.
    :return: the water footprint of the recipe, rounded to two decimals; if
        information is set to true, a tuple with the total and a dictionary
        mapping each provided ingredient to its computed water footprint.
    """
    proc_ingredients = [
        process_ingredients(ing, language=language) for ing in ingredients
    ]
    quantities = __get_quantites_formatted(proc_ingredients, quantities, language)
    total_wf = 0
    information_wf = {}
    for i, ingredient in enumerate(ingredients):
        # Ingredients are already processed above, so `process` keeps its
        # default. `language` must be passed by keyword: positionally it would
        # land in the `process` parameter and the language would be ignored.
        ing_wf = get_ingredient_waterfootprint(
            proc_ingredients[i], quantities[i], language=language
        )
        # Results are reported under the names supplied by the caller.
        information_wf[ingredient] = ing_wf
        total_wf = round(total_wf + ing_wf, 2)
    return (total_wf, information_wf) if information else total_wf
Loading

0 comments on commit cb5e165

Please sign in to comment.