# MasterarbeitCode/clean_dataset/dataset_instructions_helpers.py
# 2021-04-11 23:28:41 +02:00 - 584 lines - 22 KiB - Python

import json
import re
import gc
import spacy
from clean_dataset.lists import regex_list, cheese, milk, meat, sausage, fish, pasta, bread, rolls
from colorama import Fore
# spaCy pipeline 'de_core_news_lg', loaded once at import time; sep_sentences()
# uses it to split instruction text into sentences.
nlp = spacy.load('de_core_news_lg')
# Directory prefix that the functions in this file prepend to dataset file names.
data_path = "data/"
# NOTE(review): not referenced anywhere in the visible part of this file.
home_path = "prep_dataset/"
def make_list_str(given_list, reg):
    """Serialise *given_list* as a compact JSON array string.

    Every element is converted with ``str()``, each match of ``reg`` inside it
    is replaced by a single space, and the result is emitted as a JSON string
    literal (quotes and backslashes are escaped, non-ASCII characters are kept).

    Args:
        given_list: Iterable of elements to serialise, or ``None``.
        reg: Compiled regular expression (any object with a ``sub`` method).

    Returns:
        ``"null"`` when ``given_list`` is ``None``; otherwise the elements as
        comma-separated JSON strings inside ``[...]`` (``"[]"`` when empty).
    """
    # The guard used to be ``if given_list:``, which returned "null" for every
    # non-empty list and let ``None`` fall through to the loop and crash.
    if given_list is None:
        return "null"
    # json.dumps produces the surrounding quotes and escapes '"' and '\\', so
    # the output stays valid JSON whatever the elements contain.
    return '[' + ','.join(
        json.dumps(reg.sub(r' ', str(element)), ensure_ascii=False)
        for element in given_list
    ) + ']'
def sep_sentences_file(input_file='dataset_sep_sentences.json', output_file='sep_sentences.json'):
    """Write a JSON file that maps every recipe key to its instruction steps only.

    Reads ``data_path + input_file`` (a JSON object whose values carry an
    ``'instructions'`` entry) and writes ``data_path + output_file``. The
    output is streamed recipe by recipe. A running recipe counter is printed
    for every recipe and the total number of steps is printed at the end.

    Args:
        input_file: Name of the source JSON file inside ``data_path``.
        output_file: Name of the JSON file to create inside ``data_path``.
    """
    print("Making steps file")
    with open(data_path + input_file, "r") as whole_sep_json_file:
        whole_dataset = json.load(whole_sep_json_file)

    sentence_count = 0
    with open(data_path + output_file, "w") as sep_file:
        sep_file.write('{')
        for recipe_count, (recipe, entry) in enumerate(whole_dataset.items(), start=1):
            if recipe_count != 1:
                sep_file.write(',')
            print(recipe_count)
            instructions = entry['instructions']
            sentence_count += len(instructions)
            # The key goes through json.dumps as well, so a quote or backslash
            # inside a recipe key can no longer produce invalid JSON.
            sep_file.write(
                json.dumps(recipe, ensure_ascii=False) + ':'
                + json.dumps(instructions, ensure_ascii=False, indent=4) + '\n'
            )
        sep_file.write('}')
    print("sentences: " + str(sentence_count))
def sep_sentences(whole_dataset):
    """Split every recipe's instruction text into sentences and dump the dataset.

    For each recipe the ``'instructions'`` text is run through the module-level
    spaCy pipeline ``nlp``. Every sentence it yields is converted to ``str``
    with whitespace runs collapsed to one space. The resulting list replaces
    ``whole_dataset[recipe]['instructions']`` in place, and the recipe is
    streamed to ``data_path + "dataset_sep_sentences.json"``. A running recipe
    counter is printed for every recipe and the total number of sentences is
    printed at the end.

    Args:
        whole_dataset: Mapping of recipe key to a recipe dict holding an
            ``'instructions'`` entry. The mapping's values are modified.
    """
    print("Separate sentences in instructions")
    whitespace = re.compile(r'[\s]+')
    sentence_count = 0
    with open(data_path + "dataset_sep_sentences.json", "w") as dataset_sep_file:
        dataset_sep_file.write('{')
        for recipe_count, (recipe, entry) in enumerate(whole_dataset.items(), start=1):
            if recipe_count != 1:
                dataset_sep_file.write(',')
            print(recipe_count)
            # json.dumps escapes quotes/backslashes in the key; writing the key
            # between raw '"' characters could emit invalid JSON.
            dataset_sep_file.write(json.dumps(recipe, ensure_ascii=False) + ':')
            sentences = [
                whitespace.sub(r' ', str(sentence))
                for sentence in nlp(entry['instructions']).sents
            ]
            sentence_count += len(sentences)
            entry['instructions'] = sentences
            json.dump(entry, dataset_sep_file, ensure_ascii=False, indent=4)
            # Periodically force a collection to keep memory in check on large datasets.
            if recipe_count % 10000 == 0:
                gc.collect()
        dataset_sep_file.write('}')
    print("sentences: " + str(sentence_count))
def check_occurrances(step_file='sep_sentences.json', out_file='ingredient_occurrance.json'):
    """Count how often every known ingredient name occurs in all recipe steps.

    Loads the ingredient names (keys of ``mult_ingredients_nice.json``) and the
    steps file, joins all steps into one string without a separator and counts
    the non-overlapping substring occurrences of each ingredient name with
    ``str.count``. The counts are written to ``data_path + out_file``.
    Ingredients with fewer than 10 occurrences are printed as a list of
    ``(name, count)`` tuples, followed by the length of that list.

    Args:
        step_file: JSON file in ``data_path`` mapping recipe key to a list of steps.
        out_file: JSON file in ``data_path`` that receives ``{ingredient: count}``.
    """
    print("Checking for occurrances of ingredients in steps")
    with open(data_path + "mult_ingredients_nice.json", "r") as ingredients_json_file:
        all_ingredients = json.load(ingredients_json_file)
    with open(data_path + step_file, "r") as steps_json_file:
        all_steps = json.load(steps_json_file)

    # One join instead of repeated ``+=`` in a nested loop: same text, built
    # without re-copying the growing string for every step.
    all_steps_str = "".join(step for steps in all_steps.values() for step in steps)

    occurrances = {}
    small_amounts = []
    for count_ingr, ingr in enumerate(all_ingredients, start=1):
        print(count_ingr)
        counts = all_steps_str.count(ingr)
        if counts < 10:
            small_amounts.append((ingr, counts))
        occurrances[ingr] = counts
    print(small_amounts)
    print(len(small_amounts))

    with open(data_path + out_file, "w") as ingredient_json_file:
        json.dump(occurrances, ingredient_json_file, ensure_ascii=False, indent=4)
def replace_alt_words(step, ingredients):
    """Normalise alternative ingredient wordings inside one instruction step.

    Depending on which ingredients the recipe lists, matches of the
    pre-compiled patterns in ``regex_list`` are replaced by the canonical
    ingredient word. The notes next to each rule repeat what the pattern is
    meant to match according to the original inline comments.

    Args:
        step: The instruction sentence to rewrite.
        ingredients: Collection of the recipe's ingredient names.

    Returns:
        The step with all applicable replacements applied.
    """
    def lists_ice_cream():
        # Recipes that contain ice cream keep their "Eis" wording untouched.
        return any(
            flavour in ingredients
            for flavour in ("Eis_Schokolade", "Eis_Vanillegeschmack", "Eis_Vanille", "Eis")
        )

    # Replace ice cubes and crushed ice wording.
    if "Crushed_Ice" in ingredients and not lists_ice_cream():
        # replace ice, Eis with Crushed_Ice
        for index in (28, 29):
            step = regex_list[index].sub('Crushed_Ice', step)
    if "Eiswürfel" in ingredients and not lists_ice_cream():
        # NOTE(review): the original comment says "replace ice, Eis", but only
        # pattern 29 is applied here - confirm that this is intended.
        step = regex_list[29].sub('Eiswürfel', step)

    # Improve citrus zest wording: "...schale" becomes "...abrieb".
    zest_rules = (
        ("Zitronenabrieb", 30),    # Zitronenschale
        ("Limettenabrieb", 31),    # Limettenschale
        ("Orangenabrieb", 32),     # Orangenschale
        ("Mandarinenabrieb", 33),  # Mandarinenschale
    )
    for zest, index in zest_rules:
        if zest in ingredients:
            step = regex_list[index].sub(zest, step)

    # (substring looked for in an ingredient name, regex_list indices to apply);
    # the substring is also the replacement text.
    naming_rules = (
        ("Paprika", (1,)),                # Paprikaschote(n)?
        ("Lauch", (2,)),                  # Porree
        ("Zucchini", (3,)),               # Zucchino
        ("Sahne", (6, 7, 8)),             # Schlagsahne, Schlagobers, süße Sahne
        ("Zimt", (9,)),                   # Zimtpulver
        ("Wasser", (10, 11)),             # (Mineral)wasser ohne Kohlensäure
        ("Sprudelwasser", (12, 13, 14)),  # Wasser, Mineralwasser, Sodawasser
        ("Ingwer", (15,)),                # Ingwerwurzel
        ("Quark", (16,)),                 # Topfen
        ("Karotte", (17,)),               # Möhre
        ("Puderzucker", (18,)),           # Staubzucker
        ("Pfeffer", (19, 20)),            # Pfefferkörner, Pfefferbeeren
        ("Kartoffel", (21, 22)),          # Erdäpfel, Erdapfel
        ("Eigelb", (23,)),                # (Eidotter|Dotter)
        ("Eier", (24, 25)),               # Freilandeier, Vollei(er)?
        ("Eiweiß", (26, 27)),             # Eiklar, Eischnee
    )
    # change certain ingredient names
    for ingred in ingredients:
        for canonical, indices in naming_rules:
            if canonical not in ingred:
                continue
            if canonical == "Sahne" and "saure" in ingred:
                # sour cream must not be merged into plain cream
                continue
            if canonical == "Sprudelwasser" and "Wasser" in ingredients:
                # plain water is listed too, so "Wasser" stays ambiguous
                continue
            for index in indices:
                step = regex_list[index].sub(canonical, step)
    return step
def try_replace(repl, ingred, step, caps=False, regex_short=True):
    """Replace words containing *repl* in *step* by the full ingredient name.

    Nothing is replaced unless ``repl`` matches ``regex_list[0]`` (per the
    original comment: a leading capital letter, ``'^[A-ZÄÜÖ]'``). Otherwise
    every word that is preceded by whitespace, contains ``repl`` (optionally
    with leading letters) and is followed by whitespace, ``.`` or ``,`` is
    replaced by a space, ``ingred`` and the original trailing character.

    Args:
        repl: Word (part of an ingredient name) to look for; taken literally.
        ingred: Full ingredient name that is inserted; taken literally.
        step: Instruction sentence to rewrite.
        caps: If true, match case-insensitively.
        regex_short: If true, allow at most three letters after ``repl``
            (e.g. plural endings); if false, allow any number of letters.

    Returns:
        Tuple ``(step, found)``. ``found`` is true when ``repl`` passed the
        capital-letter check and ``ingred`` occurs in the resulting step
        (also when it was already present before the call).
    """
    # match all capitals '^[A-ZÄÜÖ]'
    if not re.match(regex_list[0], repl):
        return step, False

    letter = '[a-zA-ZäÄöÖüÜß]'
    tail = letter + ('{0,3}' if regex_short else '*')
    # Raw strings avoid the invalid "\s" escapes; re.escape keeps characters
    # such as "(" or "." in an ingredient part from being read as regex syntax.
    pattern = r'\s' + letter + '*' + re.escape(repl) + tail + r'(?P<end>[\s.,])'
    flags = re.IGNORECASE if caps else 0
    # A function replacement inserts ``ingred`` verbatim; as a template string,
    # backslashes in the ingredient name would be interpreted by re.sub.
    step = re.sub(pattern, lambda match: " " + ingred + match.group("end"), step, flags=flags)
    return step, ingred in step
def replace_categories(step, ingred):
    """Replace a generic category word in *step* by the exact ingredient.

    Example: with ``ingred`` in the ``bread`` list, "Das Brot ..." becomes
    "Das <ingred> ...". The categories are checked in a fixed order (cheese,
    milk, meat, sausage, fish, pasta, bread, rolls), each first followed by a
    space and then by a full stop. The first category whose word occurs in
    ``step`` and whose word list contains ``ingred`` is used.

    Args:
        step: Instruction sentence to rewrite.
        ingred: Full ingredient name of the recipe.

    Returns:
        Tuple ``(step, replaced)``; ``replaced`` is true when something was
        substituted. Ingredients containing "Joghurt"/"joghurt"/"Schoko"/
        "schoko" are never processed and yield ``(step, False)``.
    """
    # avoid Vollmilch_Joghurt problem
    if any(marker in ingred for marker in ("Joghurt", "joghurt", "Schoko", "schoko")):
        return step, False

    # (words that trigger the category, list of member ingredients, word regex)
    categories = (
        # The "scheibe(n)" suffix must be optional: the category is triggered
        # by a plain "Käse" as well, which the former pattern never matched.
        (("Käse", "Käsescheibe", "Käsescheiben"), cheese, r"Käse(?:scheiben?)?"),
        (("Milch",), milk, r"Milch"),
        (("Fleisch",), meat, r"Fleisch"),
        (("Wurst", "Würste"), sausage, r"(?:Wurst|Würste)"),
        (("Fisch",), fish, r"Fisch"),
        (("Nudeln",), pasta, r"Nudeln"),
        (("Brot",), bread, r"Brot"),
        (("Semmel", "Semmeln", "Brötchen"), rolls, r"(?:Semmeln?|Brötchen)"),
    )
    # (character following the word in the text, the same character as regex)
    endings = ((" ", " "), (".", r"\."))

    reg = None
    end = None
    for triggers, word_list, word_reg in categories:
        for end_char, end_reg in endings:
            if any(trigger + end_char in step for trigger in triggers) and ingred in word_list:
                reg = word_reg + end_reg
                end = end_char
                break
        if reg is not None:
            break
    if reg is None:
        return step, False

    orig_step = step
    # First handle "<prefix> <category>" / "<prefix>-<category>" where the
    # prefix (at least four letters) is itself part of the ingredient name.
    long_reg = r"(?P<pre>[a-zA-ZäöüßÄÖÜ]{4,})(?:-| )" + reg
    prefixed = [match for match in re.finditer(long_reg, step) if match.group("pre") in ingred]
    res = bool(prefixed)
    # Splice from right to left so the offsets of the remaining matches stay
    # valid; left-to-right splicing corrupts the text once lengths differ.
    for match in reversed(prefixed):
        step = step[:match.start()] + ingred + end + step[match.end():]

    # Then replace every remaining stand-alone category word. A function
    # replacement inserts ``ingred`` verbatim (no backslash interpretation).
    step = re.sub("( |^)" + reg, lambda match: " " + ingred + end, step)
    if orig_step != step:
        res = True
    return step, res
def clean_steps(input_file='dataset_sep_sentences.json', output='dataset_clean_steps.json'):
    """Rewrite all instruction steps so that they use the full ingredient names.

    Loads ``data_path + input_file``, processes every step of every recipe
    (alternative wordings first, then every ``_``-separated part of every
    ingredient, longest ingredient names first) and writes the whole dataset
    to ``data_path + output``. Progress is printed every 1000 recipes.

    Args:
        input_file: JSON file in ``data_path`` with ``'instructions'`` (list of
            steps) and ``'ingredients'`` per recipe.
        output: Name of the JSON file to write inside ``data_path``.
    """
    print("Cleaning recipe steps")
    with open(data_path + input_file, "r") as source:
        whole_dataset = json.load(source)

    for position, recipe in enumerate(whole_dataset, start=1):
        if position % 1000 == 0:
            print(position)
        entry = whole_dataset[recipe]
        raw_steps = entry['instructions']
        longest_first = sorted(entry['ingredients'], key=len, reverse=True)
        entry['instructions'] = [_clean_single_step(raw, longest_first) for raw in raw_steps]

    with open(data_path + output, "w") as target:
        json.dump(whole_dataset, target, ensure_ascii=False, indent=4)
    gc.collect()


# (substring of an ingredient part, shorter word to search for instead)
_SHORTENED_PARTS = (
    ("brühe", "brühe"),
    ("mehl", "Mehl"),
    ("öl", "Öl"),
    ("Maiskörner", "Mais"),
    ("honig", "Honig"),
    ("tomate", "Tomate"),
)

# (caps, regex_short) arguments of the successive try_replace() attempts:
# exact, ignoring case, longer words ("Käsescheibe" not "Käse"), both.
_RETRY_MODES = ((False, True), (True, True), (False, False), (True, False))


def _clean_single_step(step, ingredients):
    """Apply all replacements for one step and return the rewritten step."""
    # replace certain words (Schlagober, Eis gestoßen)
    step = replace_alt_words(step, ingredients)
    for ingred in ingredients:
        for ingr_part in ingred.split("_"):
            step = _link_ingredient_part(step, ingred, ingr_part)
    return step


def _link_ingredient_part(step, ingred, ingr_part):
    """Try to replace one part of an ingredient name by the whole ingredient."""
    # Try replacing ingr_part with whole ingredient
    step, done = try_replace(ingr_part, ingred, step)
    if done:
        return step

    # make certain changes and try replacing again
    search_word = ingr_part
    shortened = next((short for needle, short in _SHORTENED_PARTS if needle in ingr_part), None)
    if shortened is not None:
        search_word = shortened
    elif "Knoblauchzehe" in ingr_part:
        # patterns 4 and 5: Knoblauchzehe(n)? and Knoblauch(\s)?(-)?(\s)?(z|Z)ehe(n)?
        for index in (4, 5):
            step = regex_list[index].sub('Knoblauchzehe', step)
    elif "Knoblauch" in ingr_part:
        for index in (4, 5):
            step = regex_list[index].sub('Knoblauch', step)
    else:
        # replace category words with exact ingredient
        # (e.g. "Das Brot_Roggen ..." instead of "Das Brot ...")
        step, done = replace_categories(step, ingred)

    for caps, regex_short in _RETRY_MODES:
        if done:
            break
        step, done = try_replace(search_word, ingred, step, caps=caps, regex_short=regex_short)
    return step
def count_occurrances(amount, file_name='cleaned_steps_occurrance.json'):
    """Print every ingredient that occurs at most *amount* times, then the total.

    Args:
        amount: Upper bound (inclusive) for the occurrence count.
        file_name: JSON file in ``data_path`` mapping ingredient to its count.
    """
    with open(data_path + file_name, "r") as handle:
        counts_by_ingredient = json.load(handle)

    rare_total = 0
    for name, hits in counts_by_ingredient.items():
        if hits > amount:
            continue
        rare_total += 1
        print(name)
    print(f"There are {rare_total} ingredients with up to {amount} occurrances")
def check_word(word, ingr_list_file, input_file='dataset_sep_sentences.json', output='ingredients.txt'):
    """Report recipes that use a category word without a matching ingredient.

    Checks the instructions for a category word (e.g. "Käse "). If a recipe
    mentions it but lists neither a member of that category's word list nor
    the bare category word as ingredient, the recipe's name, key, the first
    matching step and its known ingredients are appended to ``output``. The
    user can then decide which words to add to the replacement lists.

    Args:
        word: Category word including its trailing character, e.g. "Brot " or
            "Brot."; unknown words print an error and return.
        ingr_list_file: Path of a JSON object whose keys are the ingredients
            that should be listed in the report.
        input_file: Dataset JSON file inside ``data_path``.
        output: Path of the text file the report is appended to.
    """
    # get list of all cleaned ingredients > 20
    with open(ingr_list_file, "r") as list_file:
        whole_word_list = json.load(list_file).keys()

    # (accepted spellings of the word, category member list, bare category word)
    category_table = (
        (("Käse ", "Käse.", "Käsescheibe"), cheese, "Käse"),
        (("Milch ", "Milch."), milk, "Milch"),
        (("Fleisch ", "Fleisch."), meat, "Fleisch"),
        (("Wurst ", "Wurst."), sausage, "Wurst"),
        (("Würste ", "Würste."), sausage, "Würste"),
        (("Fisch ", "Fisch."), fish, "Fisch"),
        (("Nudeln ", "Nudeln."), pasta, "Nudeln"),
        (("Brot ", "Brot."), bread, "Brot"),
        (("Semmel ", "Semmel.", "Semmeln ", "Semmeln."), rolls, "Semmel"),
        (("Brötchen ", "Brötchen."), rolls, "Brötchen"),
    )
    category = next(
        ((members, bare) for spellings, members, bare in category_table if word in spellings),
        None,
    )
    if category is None:
        print(Fore.LIGHTRED_EX + "This category can't be processed correctly!")
        return
    word_list, word_pure = category

    with open(data_path + input_file, "r") as steps_file:
        whole_dataset = json.load(steps_file)

    for recipe, entry in whole_dataset.items():
        # Only the first step mentioning the word is ever reported per recipe.
        first_hit = next((text for text in entry['instructions'] if word in text), None)
        if first_hit is None:
            continue
        ingredients = entry['ingredients']
        if word_pure in ingredients or any(member in ingredients for member in word_list):
            continue
        with open(output, "a") as report:
            known = [ingr for ingr in ingredients if ingr in whole_word_list]
            if known:
                report.write("\n" + entry['name'] + "\n")
                report.write(recipe + "\n")
                report.write(first_hit + "\n")
                for ingr in known:
                    report.write(ingr + "\n")
def main():
    """Run the instruction-cleaning pipeline.

    Loads ``dataset_cleaned_nice.json`` from ``data_path``, splits the
    instructions into sentences, rewrites the steps to use full ingredient
    names and finally writes the steps-only file. The remaining diagnostic
    calls are kept commented out for manual use.
    """
    print("started")
    # separate sentences
    with open(data_path + "dataset_cleaned_nice.json", "r") as whole_cleaned_json_file:
        whole_dataset_cleaned = json.load(whole_cleaned_json_file)
    sep_sentences(whole_dataset_cleaned)
    clean_steps(input_file='dataset_sep_sentences.json', output='dataset_cleaned_steps.json')
    sep_sentences_file(input_file='dataset_cleaned_steps.json', output_file='cleaned_sep_sentences.json')

    # Optional diagnostics, enable as needed.
    # NOTE: check_word() requires the ingredient list file as second argument;
    # the first four calls below still have to be given one.
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')
    # count_occurrances(300)
    # check_word("Käse ")
    # check_word("Käse.")
    # check_word("Käsescheibe")
    # check_word("Milch ")
    # check_word("Fleisch ", "data/meats.json")
    # check_word("Fleisch.", "data/meats.json")
    # check_word("Wurst ", "data/sausage.json")
    # check_word("Wurst.", "data/sausage.json")
    # check_word("Würste ", "data/sausage.json")
    # check_word("Würste.", "data/sausage.json")
    # check_word("Fisch ", "data/fish.json")
    # check_word("Fisch.", "data/fish.json")
    # check_word("Nudeln ", "data/pasta.json")
    # check_word("Nudeln.", "data/pasta.json")
    # check_word("Brot ", "data/bread.json")
    # check_word("Brot.", "data/bread.json")
    # check_word("Semmel ", "data/rolls.json")
    # check_word("Semmel.", "data/rolls.json")
    # check_word("Semmeln ", "data/rolls.json")
    # check_word("Semmeln.", "data/rolls.json")
    # check_word("Brötchen ", "data/rolls.json")
    # check_word("Brötchen.", "data/rolls.json")
    # check_word("test", "data/rolls.json")
    # output = replace_categories("Den Käse auf das Burger Brot legen", "Burgerbrötchen")
    # print(output)

    # Test run on a small dataset (clean_steps takes ``input_file``, not ``input``).
    # clean_steps(input_file='dataset_test.json', output='dataset_cleaned_steps_test.json')
    # sep_sentences_file(input_file='dataset_cleaned_steps_test.json', output_file='cleaned_sep_sentences_test.json')
    # check_occurrances(step_file='cleaned_sep_sentences_test.json', out_file='cleaned_steps_occurrance_test.json')
    # clean_steps()
    # sep_sentences_file(input_file='dataset_clean_steps.json', output_file='cleaned_sep_sentences.json')
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')


# Run the pipeline only when executed as a script, so that importing the
# helper functions from this module does not start a full dataset run.
if __name__ == "__main__":
    main()