TastyPiano / src /cocktails /utilities /ingredients_utilities.py
Cédric Colas
initial commit
e775f6d
raw
history blame
11.4 kB
# This script loads the list and profiles of our ingredients selection.
# It defines rules to recognize ingredients from the list in recipes and the function to extract that information from ingredient strings.
import pandas as pd
from src.cocktails.config import INGREDIENTS_LIST_PATH, COCKTAILS_CSV_DATA
import numpy as np
# Load the ingredient profiles table once at import time; every lookup table below is derived from it.
ingredient_profiles = pd.read_csv(INGREDIENTS_LIST_PATH)
# Lower-cased ingredient names, in the order of the 'ingredient' column.
ingredient_list = [ing.lower() for ing in ingredient_profiles['ingredient']]
n_ingredients = len(ingredient_list)
# Map each ingredient name to its position in ingredient_list.
ingredient2ingredient_id = dict(zip(ingredient_list, range(n_ingredients)))
# Distinct values of the 'type' column, sorted.
ingredients_types = sorted(set(ingredient_profiles['type']))
# for each type, get all ingredients
# Names and types are walked in lockstep (row order) instead of looking each name up again with
# ingredient_list.index(), which was quadratic and shadowed the builtin `type`.
ing_per_type = [
    [ing for ing, ing_type in zip(ingredient_list, ingredient_profiles['type']) if ing_type == ingredient_type]
    for ingredient_type in ingredients_types
]
ingredients_per_type = dict(zip(ingredients_types, ing_per_type))
# Ingredients singled out as "bubble" ingredients (presumably the carbonated ones; not used in the visible part of this module).
bubble_ingredients = ['soda', 'ginger beer', 'tonic', 'sparkling wine']
# Rules used to recognize the ingredients of our list in free-text recipe ingredient strings.
# Each key is an ingredient name; its value is a list of alternative rules with an OR relation: only one needs to be satisfied.
# An alternative is either a list of terms with an AND relation (all terms need to be satisfied),
# or a bare string, which is a single term on its own.
# A term starting with ~ indicates that the following expression must NOT appear.
# Any other term indicates that the expression MUST appear.
# Terms are tested as substrings of the lower-cased ingredient string (see find_ingredient_from_str).
ingredient_search = {#'salt': ['salt'],
'lime juice': [['lime', '~soda', '~lemonade', '~cordial']],
'lemon juice': [['lemon', '~soda', '~lemonade']],
'angostura': [['angostura', '~orange'],
['bitter', '~campari', '~orange', '~red', '~italian', '~fernet']],
'orange bitters': [['orange', 'bitter', '~bittersweet']],
'orange juice': [['orange', '~bitter', '~jam', '~marmalade', '~liqueur', '~water'],
['orange', 'squeeze']],
'pineapple juice': [['pineapple']],
# 'apple juice': [['apple', 'juice', '~pine']],
'cranberry juice': [['cranberry', 'juice']],
'cointreau': ['cointreau', 'triple sec', 'grand marnier', 'curaçao', 'curacao'],
'luxardo maraschino': ['luxardo', 'maraschino', 'kirsch'],
'amaretto': ['amaretto'],
'benedictine': ['benedictine', 'bénédictine', 'bénedictine', 'benédictine'],
'campari': ['campari', ['italian', 'red', 'bitter'], 'aperol', 'bittersweet', 'aperitivo', 'orange-red'],
# 'campari': ['campari', ['italian', 'red', 'bitter']],
# 'crème de violette': [['violette', 'crème'], ['crême', 'violette'], ['liqueur', 'violette']],
# 'aperol': ['aperol', 'bittersweet', 'aperitivo', 'orange-red'],
'green chartreuse': ['chartreuse'],
'black raspberry liqueur': [['cassis', 'liqueur'],
['black raspberry', 'liqueur'],
['raspberry', 'liqueur'],
['strawberry', 'liqueur'],
['blackberry', 'liqueur'],
['violette', 'crème'], ['crême', 'violette'], ['liqueur', 'violette']],
# 'simple syrup': [],
# 'drambuie': ['drambuie'],
# 'fernet branca': ['fernet', 'branca'],
'gin': [['gin', '~sloe', '~ginger']],
'vodka': ['vodka'],
'cuban rum': [['rum', 'puerto rican'], ['light', 'rum'], ['white', 'rum'], ['rum', 'havana', '~7'], ['rum', 'bacardi']],
'cognac': [['cognac', '~grand marnier', '~cointreau', '~orange']],
# 'bourbon': [['bourbon', '~liqueur']],
# 'tequila': ['tequila', 'pisco'],
# 'tequila': ['tequila'],
'scotch': ['scotch'],
'dark rum': [['rum', 'age', '~bacardi', '~havana'],
['rum', 'dark', '~bacardi', '~havana'],
['rum', 'old', '~bacardi', '~havana'],
['rum', 'old', '7'],
['rum', 'havana', '7'],
['havana', 'rum', 'especial']],
'absinthe': ['absinthe'],
'rye whiskey': ['rye', ['bourbon', '~liqueur']],
# 'rye whiskey': ['rye'],
'apricot brandy': [['apricot', 'brandy']],
# 'pisco': ['pisco'],
# 'cachaça': ['cachaça', 'cachaca'],
'egg': [['egg', 'white', '~yolk', '~whole']],
'soda': [['soda', 'water', '~lemon', '~lime']],
'mint': ['mint'],
'sparkling wine': ['sparkling wine', 'prosecco', 'champagne'],
'ginger beer': [['ginger', 'beer'], ['ginger', 'ale']],
'tonic': [['tonic'], ['7up'], ['sprite']],
# 'espresso': ['espresso', 'expresso', ['café', '~liqueur', '~cream'],
# ['cafe', '~liqueur', '~cream'],
# ['coffee', '~liqueur', '~cream']],
# 'southern comfort': ['southern comfort'],
# 'cola': ['cola', 'coke', 'pepsi'],
'double syrup': [['sugar','~raspberry'], ['simple', 'syrup'], ['double', 'syrup']],
# 'grenadine': ['grenadine', ['pomegranate', 'syrup']],
'grenadine': ['grenadine', ['pomegranate', 'syrup'], ['raspberry', 'syrup', '~black']],
'honey syrup': ['honey', ['maple', 'syrup']],
# 'raspberry syrup': [['raspberry', 'syrup', '~black']],
'dry vermouth': [['vermouth', 'dry'], ['vermouth', 'white'], ['vermouth', 'french'], 'lillet'],
'sweet vermouth': [['vermouth', 'sweet'], ['vermouth', 'red'], ['vermouth', 'italian']],
# 'lillet blanc': ['lillet'],
'water': [['water', '~sugar', '~coconut', '~soda', '~tonic', '~honey', '~orange', '~melon']]
}
# Check that there is a rule for every ingredient in the list, and no rule for an ingredient outside it
# (the sorted key lists must be equal). Runs at import time.
assert sorted(ingredient_list) == sorted(ingredient_search.keys()), 'ing search dict keys do not match ingredient list'
def get_ingredients_info():
    """Summarize the ingredients used in the cocktail dataset.

    Reads the CSV at COCKTAILS_CSV_DATA and scans it with get_max_n_ingredients.

    Returns:
        A tuple ``(max_ingredients, ingredient_list, ind_alcohol)`` where
        ``max_ingredients`` is the largest number of ingredients in a single recipe,
        ``ingredient_list`` is the sorted list of ingredients found in the dataset, and
        ``ind_alcohol`` lists the positions in that sorted list of the ingredients that
        are a liquor, a liqueur, a vermouth or 'sparkling wine'.
    """
    cocktails = pd.read_csv(COCKTAILS_CSV_DATA)
    max_ingredients, found, liquors, liqueurs, vermouths = get_max_n_ingredients(cocktails)
    sorted_names = sorted(found)
    # 'sparkling wine' is always treated as alcoholic, on top of the three typed groups.
    alcohol = sorted(liquors | liqueurs | vermouths | {'sparkling wine'})
    ind_alcohol = [position for position, name in enumerate(sorted_names) if name in alcohol]
    return max_ingredients, sorted_names, ind_alcohol
def get_max_n_ingredients(data):
    """Scan every recipe of the dataset and collect ingredient statistics.

    Args:
        data: table with a 'names' column (used for the number of rows) and an
            'ingredients_str' column holding formatted ingredient strings.

    Returns:
        A tuple ``(max_count, ingredient_set, liquor_set, liqueur_set, vermouth_set)``:
        the largest number of ingredients in one recipe, the set of all ingredients
        seen, and the subsets of those whose type is 'liquor', 'liqueur' and 'vermouth'.
    """
    recipes = np.array(data['ingredients_str'])
    longest = 0
    seen, liquors, liqueurs, vermouths = set(), set(), set(), set()
    # Each ingredient type is paired with the set that collects its members.
    buckets = (('liquor', liquors), ('liqueur', liqueurs), ('vermouth', vermouths))
    for row in range(len(data['names'])):
        names, _ = extract_ingredients(recipes[row])
        longest = max(longest, len(names))
        for name in names:
            seen.add(name)
            for type_name, bucket in buckets:
                if name in ingredients_per_type[type_name]:
                    bucket.add(name)
    return longest, seen, liquors, liqueurs, vermouths
def find_ingredient_from_str(ing_str):
    """Match a raw ingredient description against the rules of ingredient_search.

    The description is lower-cased, then every ingredient of ingredient_list is tested:
    an ingredient matches when at least one of its alternative rules is satisfied.
    A bare-string rule is a single term; a list rule requires all of its terms.
    A term starting with '~' is satisfied when the rest of the term is NOT a substring
    of the description; any other term is satisfied when it IS a substring.

    Returns:
        A tuple ``(flag, ingredient)``:
        * exactly one ingredient matches: ``(False, that ingredient)``;
        * no ingredient matches: ``(True, None)``;
        * several ingredients match: the description and every matching ingredient are
          printed, and ``(True, first matching ingredient in ingredient_list order)`` is returned.

    Raises:
        ValueError: if a rule is neither a string nor a list.

    NOTE(review): the previous comment said a false flag means "not found / rejected",
    but the code returns True for the not-found and ambiguous cases and False for a
    unique match -- confirm the intended meaning against the callers.
    """
    text = ing_str.lower()
    matched = []
    for name in ingredient_list:
        alternatives = []  # one boolean per alternative (OR-ed) rule
        for rule in ingredient_search[name]:
            if isinstance(rule, str):
                if rule[0] == '~':
                    satisfied = rule[1:] not in text
                else:
                    satisfied = rule in text
            elif isinstance(rule, list):
                satisfied = True
                for term in rule:
                    negated = term[0] == '~'
                    satisfied = satisfied and ((term[1:] not in text) if negated else (term in text))
            else:
                raise ValueError
            alternatives.append(satisfied)
        matched.append(any(alternatives))

    n_matches = sum(matched)
    if n_matches == 0:
        return True, None
    first_match = ingredient_list[matched.index(True)]
    if n_matches == 1:
        return False, first_match
    # Ambiguous description: report it together with every candidate ingredient.
    print(text)
    for candidate, is_match in zip(ingredient_list, matched):
        if is_match:
            print(candidate)
    return True, first_match
def get_cocktails_per_ingredient(ing_strs):
    """Index cocktails by the ingredients they contain.

    Args:
        ing_strs: iterable of formatted ingredient strings, one per cocktail.

    Returns:
        A dict mapping every ingredient of ingredient_list to the list of positions
        (in ``ing_strs``) of the cocktails that contain it; ingredients that are
        never used map to an empty list.
    """
    cocktails_per_ing = {name: [] for name in ingredient_list}
    for cocktail_id, recipe in enumerate(ing_strs):
        for name in extract_ingredients(recipe)[0]:
            cocktails_per_ing[name].append(cocktail_id)
    return cocktails_per_ing
def extract_ingredients(ingredient_str):
    """Parse a formatted ingredient string (reverse of format_ingredients).

    The expected format is ``[(name,quantity),(name,quantity),...]`` without spaces
    around the separators.

    Args:
        ingredient_str: the formatted string.

    Returns:
        A tuple ``(ingredients, quantities)``: the list of ingredient names and the
        list of their quantities converted to float.
    """
    # Drop the enclosing brackets; the comma-separated tokens then alternate between
    # '(name' and 'quantity)'.
    tokens = ingredient_str[1:-1].split(',')
    ingredients, quantities = [], []
    for name_token, quantity_token in zip(tokens[0::2], tokens[1::2]):
        ingredients.append(name_token[1:])  # strip the leading '('
        quantities.append(float(quantity_token[:-1]))  # strip the trailing ')'
    return ingredients, quantities
def format_ingredients(ingredients, quantities):
    """Format ingredients and quantities as a string (reverse of extract_ingredients).

    Args:
        ingredients: sequence of ingredient names.
        quantities: sequence of quantities, paired position by position with the names.

    Returns:
        A string of the form ``[(name,quantity),(name,quantity),...]``. Empty inputs
        give ``'[]'``.
    """
    pairs = []
    for ing, q in zip(ingredients, quantities):
        # Drop a single trailing space from the name; endswith also tolerates an empty name.
        name = ing[:-1] if ing.endswith(' ') else ing
        pairs.append(f'({name},{q})')
    # Joining avoids slicing off a trailing comma, which used to eat the '[' on empty input.
    return '[' + ','.join(pairs) + ']'
def get_ingredient_count(data):
    """Count in how many kept recipes each ingredient of ingredient_list appears.

    Args:
        data: table with a 'to_keep' column (truthy for the recipes to count) and an
            'ingredients_str' column holding formatted ingredient strings.

    Returns:
        A dict mapping every ingredient of ingredient_list to its number of
        occurrences over the kept recipes (0 when never used).

    Raises:
        KeyError: if a recipe contains an ingredient that is not in ingredient_list.
    """
    ingredient_counts = dict.fromkeys(ingredient_list, 0)
    # Walk the two columns in lockstep rather than indexing them by row number: this
    # does not depend on the table's index labels and avoids reusing the row index
    # name for the inner loop.
    for keep, recipe in zip(data['to_keep'], data['ingredients_str']):
        if not keep:
            continue
        ingredients, _ = extract_ingredients(recipe)
        for ing in ingredients:
            ingredient_counts[ing] += 1
    return ingredient_counts
def add_counts_to_ingredient_list(data):
    """Store each ingredient's occurrence count in the ingredient profiles file.

    Computes the counts with get_ingredient_count, writes them to the 'counts' column
    of the module-level ingredient_profiles table (in ingredient_list order) and saves
    the table back to INGREDIENTS_LIST_PATH without the index column.

    Args:
        data: dataset passed through to get_ingredient_count.
    """
    per_ingredient = get_ingredient_count(data)
    ingredient_profiles['counts'] = [per_ingredient[name] for name in ingredient_list]
    ingredient_profiles.to_csv(INGREDIENTS_LIST_PATH, index=False)