584 lines
22 KiB
Python
584 lines
22 KiB
Python
import json
|
|
import re
|
|
import gc
|
|
import spacy
|
|
from clean_dataset.lists import regex_list, cheese, milk, meat, sausage, fish, pasta, bread, rolls
|
|
from colorama import Fore
|
|
|
|
# German spaCy pipeline, loaded once at import time; sep_sentences() calls it
# to split instruction text into sentences.
nlp = spacy.load('de_core_news_lg')
# Prefix prepended to the JSON file names that the functions below read and write.
data_path = "data/"
# NOTE(review): not referenced anywhere in the visible part of this file.
home_path = "prep_dataset/"
|
|
|
|
|
|
def make_list_str(given_list, reg):
    """Render an iterable as a compact JSON array of strings.

    Each element is converted with ``str()``, every match of ``reg`` in it is
    replaced by a single space, and the result is written as a JSON string
    literal. Elements are separated by "," without any whitespace.

    Args:
        given_list: Iterable of elements to render, or ``None``.
        reg: Compiled regular expression; its matches are replaced by " ".

    Returns:
        ``"null"`` if ``given_list`` is ``None``, otherwise the array text,
        e.g. ``'["a b","c"]'`` (``"[]"`` for an empty iterable).
    """
    # The original guard was inverted (`if given_list:`), so every non-empty
    # list was rendered as "null" and the loop below never saw any data.
    if given_list is None:
        return "null"

    # json.dumps escapes quotes and backslashes, which keeps the output valid
    # JSON; for plain text it yields exactly '"' + text + '"'.
    rendered = (
        json.dumps(reg.sub(' ', str(element)), ensure_ascii=False)
        for element in given_list
    )
    return '[' + ','.join(rendered) + ']'
|
|
|
|
|
|
# Creates a dataset that contains only the recipe keys (URLs) and their steps.
def sep_sentences_file(input_file='dataset_sep_sentences.json', output_file='sep_sentences.json'):
    """Write a JSON object mapping each recipe key to its instruction list.

    Reads ``data_path + input_file`` and writes ``data_path + output_file``.
    The output is assembled by hand: one ``"key":[...]`` member per recipe,
    with the list pretty-printed at indent 4 and followed by a newline.
    A running recipe counter is printed per recipe and the total number of
    instruction entries is printed at the end.

    Args:
        input_file: Name of the JSON file holding the full recipe records.
        output_file: Name of the JSON file to create.
    """
    print("Making steps file")
    with open(data_path + input_file, "r") as source:
        recipes = json.load(source)

    total_sentences = 0
    with open(data_path + output_file, "w") as target:
        target.write('{')
        for position, (recipe, entry) in enumerate(recipes.items(), start=1):
            # members after the first one are preceded by a comma
            if position > 1:
                target.write(',')
            print(position)
            instructions = entry['instructions']
            total_sentences += len(instructions)
            serialized = json.dumps(instructions, ensure_ascii=False, indent=4)
            target.write('"' + recipe + '":' + serialized + '\n')
        target.write('}')
    print("sentences: " + str(total_sentences))
|
|
|
|
|
|
def sep_sentences(whole_dataset):
    """Split every recipe's instruction text into sentences and dump the dataset.

    For each recipe the ``'instructions'`` value is run through the
    module-level spaCy pipeline ``nlp``; each resulting sentence is converted
    to ``str`` with whitespace runs collapsed to one space. The recipe record
    is updated in place (``'instructions'`` becomes a list of strings) and
    written to ``data_path + "dataset_sep_sentences.json"`` as one member of a
    hand-assembled JSON object.

    Args:
        whole_dataset: Mapping of recipe key to recipe record; it is mutated.
    """
    print("Separate sentences in instructions")
    whitespace = re.compile(r'[\s]+')
    total_sentences = 0

    with open(data_path + "dataset_sep_sentences.json", "w") as target:
        target.write('{')
        for position, recipe in enumerate(whole_dataset.keys(), start=1):
            if position > 1:
                target.write(',')
            print(position)

            target.write('"' + recipe + '":')
            record = whole_dataset[recipe]
            sentences = list(nlp(record['instructions']).sents)
            total_sentences += len(sentences)
            record['instructions'] = [
                whitespace.sub(r' ', str(sentence)) for sentence in sentences
            ]
            json.dump(record, target, ensure_ascii=False, indent=4)

            # periodic garbage collection on very large datasets
            if position % 10000 == 0:
                gc.collect()

        target.write('}')
    print("sentences: " + str(total_sentences))
|
|
|
|
|
|
def check_occurrances(step_file='sep_sentences.json', out_file='ingredient_occurrance.json'):
    """Count how often each known ingredient name occurs in all recipe steps.

    Loads the ingredient names (keys of ``mult_ingredients_nice.json``) and
    the per-recipe step lists from ``step_file``, counts the non-overlapping
    substring occurrences of every ingredient name in the combined step text,
    and writes the ``{ingredient: count}`` mapping to ``out_file``. All files
    are resolved relative to ``data_path``.

    A running counter is printed per ingredient. Ingredients with fewer than
    10 occurrences are additionally printed as ``(name, count)`` pairs,
    followed by how many of them there are.

    Args:
        step_file: JSON file mapping recipe key to a list of step strings.
        out_file: JSON file to write the occurrence counts to.
    """
    print("Checking for occurrances of ingredients in steps")
    with open(data_path + "mult_ingredients_nice.json", "r") as ingredients_json_file:
        all_ingredients = json.load(ingredients_json_file)
    with open(data_path + step_file, "r") as steps_json_file:
        all_steps = json.load(steps_json_file)

    # Build the corpus with a single join instead of repeated `+=`.
    # The newline separator keeps an ingredient name from matching across the
    # boundary of two steps that were previously glued together directly.
    all_steps_str = "\n".join(
        step for steps in all_steps.values() for step in steps
    )

    occurrances = {}
    small_amounts = []
    for count_ingr, ingr in enumerate(all_ingredients, start=1):
        print(count_ingr)
        counts = all_steps_str.count(ingr)
        if counts < 10:
            small_amounts.append((ingr, counts))
        occurrances[ingr] = counts

    print(small_amounts)
    print(len(small_amounts))
    with open(data_path + out_file, "w") as ingredient_json_file:
        json.dump(occurrances, ingredient_json_file, ensure_ascii=False, indent=4)
|
|
|
|
|
|
def replace_alt_words(step, ingredients):
    """Rewrite alternative wordings in a step to the recipe's ingredient names.

    The patterns come from ``regex_list``; which wording each index matches is
    taken from the notes next to the rules below. A rule is only applied when
    the corresponding ingredient is part of the recipe.

    Args:
        step: One instruction sentence.
        ingredients: Ingredient names of the recipe the step belongs to.

    Returns:
        The step with all applicable substitutions performed.
    """
    # Crushed ice / ice cubes: "Eis" may only be rewritten when the recipe
    # does not also list some kind of ice cream.
    ice_cream_listed = any(
        flavour in ingredients
        for flavour in ("Eis_Schokolade", "Eis_Vanillegeschmack", "Eis_Vanille", "Eis")
    )
    if "Crushed_Ice" in ingredients and not ice_cream_listed:
        # ice, Eis -> Crushed_Ice
        for pattern_id in (28, 29):
            step = regex_list[pattern_id].sub('Crushed_Ice', step)
    if "Eiswürfel" in ingredients and not ice_cream_listed:
        # ice, Eis -> Eiswürfel
        step = regex_list[29].sub('Eiswürfel', step)

    # Citrus zest: "<fruit>schale" -> "<fruit>abrieb"
    zest_rules = (
        ("Zitronenabrieb", 30),
        ("Limettenabrieb", 31),
        ("Orangenabrieb", 32),
        ("Mandarinenabrieb", 33),
    )
    for zest, pattern_id in zest_rules:
        if zest in ingredients:
            step = regex_list[pattern_id].sub(zest, step)

    # (canonical word, regex_list indices); the word doubles as the text that
    # must occur in the ingredient name and as the replacement.
    naming_rules = (
        ("Paprika", (1,)),               # Paprikaschote(n)?
        ("Lauch", (2,)),                 # Porree
        ("Zucchini", (3,)),              # Zucchino
        ("Sahne", (6, 7, 8)),            # Schlagsahne, Schlagobers, süße Sahne
        ("Zimt", (9,)),                  # Zimtpulver
        ("Wasser", (10, 11)),            # (Mineral)wasser ohne Kohlensäure
        ("Sprudelwasser", (12, 13, 14)), # Wasser, Mineralwasser, Sodawasser
        ("Ingwer", (15,)),               # Ingwerwurzel
        ("Quark", (16,)),                # Topfen
        ("Karotte", (17,)),              # Möhre
        ("Puderzucker", (18,)),          # Staubzucker
        ("Pfeffer", (19, 20)),           # Pfefferkörner, Pfefferbeeren
        ("Kartoffel", (21, 22)),         # Erdäpfel, Erdapfel
        ("Eigelb", (23,)),               # (Eidotter|Dotter)
        ("Eier", (24, 25)),              # Freilandeier, Vollei(er)?
        ("Eiweiß", (26, 27)),            # Eiklar, Eischnee
    )
    for ingred in ingredients:
        for word, pattern_ids in naming_rules:
            if word not in ingred:
                continue
            # sour cream must not be turned into plain cream
            if word == "Sahne" and "saure" in ingred:
                continue
            # plain water stays untouched when the recipe lists it separately
            if word == "Sprudelwasser" and "Wasser" in ingredients:
                continue
            for pattern_id in pattern_ids:
                step = regex_list[pattern_id].sub(word, step)

    return step
|
|
|
|
|
|
def try_replace(repl, ingred, step, caps=False, regex_short=True):
    """Replace words containing ``repl`` in ``step`` by the full name ``ingred``.

    Nothing is attempted unless ``repl`` matches ``regex_list[0]`` (per the
    original note: it starts with a capital letter, ``'^[A-ZÄÜÖ]'``).
    A word is replaced when it is preceded by whitespace, contains ``repl``
    after any number of letters, and is followed by whitespace, "." or ",".
    Words at the very start or end of ``step`` therefore never match.

    Args:
        repl: Word (part of an ingredient name) to look for; taken literally.
        ingred: Full ingredient name to insert instead.
        step: Instruction sentence to rewrite.
        caps: If true, match case-insensitively.
        regex_short: If true, allow at most 3 letters after ``repl``
            (plural/case endings); otherwise any number of letters.

    Returns:
        Tuple ``(step, found)``. ``found`` is true when ``repl`` qualified and
        ``ingred`` occurs in the resulting step, whether or not this call
        inserted it.
    """
    if not re.match(regex_list[0], repl):
        return step, False

    letters = '[a-zA-ZäÄöÖüÜß]'
    tail = letters + ('{0,3}' if regex_short else '*')
    # re.escape: `repl` is data, not a pattern. Characters such as "(", "+"
    # or "." in an ingredient name used to raise re.error or match too much.
    pattern = r'\s' + letters + '*' + re.escape(repl) + tail + r'(?P<end>[\s.,])'

    # A callable replacement inserts `ingred` verbatim; as a template string,
    # backslashes inside the ingredient name would be interpreted by re.sub.
    step = re.sub(
        pattern,
        lambda match: " " + ingred + match.group("end"),
        step,
        flags=re.IGNORECASE if caps else 0,
    )
    return step, ingred in step
|
|
|
|
|
|
def _find_category_pattern(step, ingred):
    """Return ``(regex, terminator)`` of the first category that applies, else None.

    A category applies when one of its trigger words, directly followed by
    the terminator (" " or "."), occurs in ``step`` and ``ingred`` is listed
    in the category's word list.
    """
    # (word list, trigger words, regex for the category word), in priority order
    categories = (
        # The group is optional so that bare "Käse" is matched as well; the
        # former pattern required "scheibe(n)" and never replaced plain "Käse".
        (cheese, ("Käse", "Käsescheibe", "Käsescheiben"), "Käse(scheibe|scheiben)?"),
        (milk, ("Milch",), "Milch"),
        (meat, ("Fleisch",), "Fleisch"),
        (sausage, ("Wurst", "Würste"), "(Wurst|Würste)"),
        (fish, ("Fisch",), "Fisch"),
        (pasta, ("Nudeln",), "Nudeln"),
        (bread, ("Brot",), "Brot"),
        (rolls, ("Semmel", "Semmeln", "Brötchen"), "(Semmel(n)?|Brötchen)"),
    )
    terminators = ((" ", " "), (".", r"\."))
    for word_list, triggers, word_reg in categories:
        for end, end_reg in terminators:
            triggered = any(trigger + end in step for trigger in triggers)
            if triggered and ingred in word_list:
                return word_reg + end_reg, end
    return None


def replace_categories(step, ingred):
    """Replace a generic category word in ``step`` by the specific ingredient.

    Example of the intent: "Das Brot ..." becomes "Das <ingred> ..." when
    ``ingred`` is one of the known breads. Ingredients whose name contains
    "Joghurt"/"joghurt"/"Schoko"/"schoko" are skipped entirely.

    Two passes are made with the selected category pattern:
    1. "<prefix> <category>" or "<prefix>-<category>" is replaced as a whole
       when the prefix (4+ letters) is contained in ``ingred``.
    2. Every remaining category word preceded by a space (or at the start of
       the step) is replaced by " " + ``ingred``.

    Args:
        step: Instruction sentence to rewrite.
        ingred: Ingredient name of the recipe.

    Returns:
        Tuple ``(step, changed)``; ``changed`` is true when a prefixed match
        was replaced or the text differs from the input.
    """
    # avoid Vollmilch_Joghurt problem
    if any(marker in ingred for marker in ("Joghurt", "joghurt", "Schoko", "schoko")):
        return step, False

    selected = _find_category_pattern(step, ingred)
    if selected is None:
        return step, False
    reg, end = selected

    orig_step = step
    long_reg = r"(?P<pre>[a-zA-ZäöüßÄÖÜ]{4,})(\-| )" + reg
    prefixed = [
        match for match in re.finditer(long_reg, step)
        if match.group("pre") in ingred
    ]
    # Splice from right to left: the match offsets refer to the unmodified
    # step, and replacing left-to-right shifted every later offset.
    for match in reversed(prefixed):
        step = step[:match.start()] + ingred + end + step[match.end():]

    # Callable replacement so the ingredient name is inserted verbatim.
    step = re.sub("( |^)" + reg, lambda _match: " " + ingred + end, step)

    return step, bool(prefixed) or orig_step != step
|
|
|
|
|
|
def clean_steps(input_file='dataset_sep_sentences.json', output='dataset_clean_steps.json'):
    """Rewrite ingredient mentions in all recipe steps to the exact ingredient names.

    Loads ``data_path + input_file``, processes every step of every recipe and
    writes the whole dataset to ``data_path + output``. Ingredients are tried
    longest name first. For each "_"-separated part of an ingredient name the
    following is attempted until one attempt reports success:

    1. ``try_replace`` with the part itself;
    2. a part-specific fallback (shortened search word, garlic patterns
       ``regex_list[4]``/``regex_list[5]``, or ``replace_categories``);
    3. ``try_replace`` with the (possibly shortened) word: exact case, then
       ignoring case, then both again while allowing longer words.

    Args:
        input_file: JSON file with recipes whose instructions are sentence lists.
        output: JSON file that receives the cleaned dataset.
    """
    print("Cleaning recipe steps")
    with open(data_path + input_file, "r") as source:
        whole_dataset = json.load(source)

    # (text contained in the name part, search word used instead)
    # NOTE(review): "brühe" is lower case; per the note in try_replace the
    # search word must start with a capital, so this entry looks ineffective.
    shortened_words = (
        ("brühe", "brühe"),
        ("mehl", "Mehl"),
        ("öl", "Öl"),
        ("Maiskörner", "Mais"),
        ("honig", "Honig"),
        ("tomate", "Tomate"),
    )
    # (caps, regex_short) combinations, from strictest to most lenient
    retry_modes = ((False, True), (True, True), (False, False), (True, False))

    for rec_count, recipe in enumerate(whole_dataset, start=1):
        if rec_count % 1000 == 0:
            print(rec_count)
        entry = whole_dataset[recipe]
        steps = entry['instructions']
        ingredients = sorted(entry['ingredients'], key=len, reverse=True)

        new_steps = []
        for step in steps:
            # replace certain words (Schlagobers, crushed ice, ...)
            step = replace_alt_words(step, ingredients)

            for ingred in ingredients:
                for ingr_part in ingred.split("_"):
                    # first attempt: the name part exactly as written
                    step, done = try_replace(ingr_part, ingred, step)
                    if done:
                        continue

                    repl = ingr_part
                    shorter = next(
                        (word for marker, word in shortened_words if marker in ingr_part),
                        None,
                    )
                    if shorter is not None:
                        repl = shorter
                    elif "Knoblauchzehe" in ingr_part:
                        # Knoblauchzehe(n)? and Knoblauch(\s)?(-)?(\s)?(z|Z)ehe(n)?
                        for pattern_id in (4, 5):
                            step = regex_list[pattern_id].sub('Knoblauchzehe', step)
                    elif "Knoblauch" in ingr_part:
                        # same patterns, normalised to the bare ingredient
                        for pattern_id in (4, 5):
                            step = regex_list[pattern_id].sub('Knoblauch', step)
                    else:
                        # replace category words with the exact ingredient
                        # (e.g. "Das Brot_Roggen ..." instead of "Das Brot ...")
                        step, done = replace_categories(step, ingred)

                    for caps, regex_short in retry_modes:
                        if done:
                            break
                        step, done = try_replace(
                            repl, ingred, step, caps=caps, regex_short=regex_short
                        )

            new_steps.append(step)

        # TODO (from the original): remove double occurrences of an ingredient in a step
        entry['instructions'] = new_steps

    with open(data_path + output, "w") as target:
        json.dump(whole_dataset, target, ensure_ascii=False, indent=4)
    gc.collect()
|
|
|
|
|
|
def count_occurrances(amount, file_name='cleaned_steps_occurrance.json'):
    """Print all ingredients that occur at most ``amount`` times.

    Args:
        amount: Inclusive upper bound for the occurrence count.
        file_name: JSON file (below ``data_path``) mapping ingredient to count.
    """
    with open(data_path + file_name, "r") as source:
        counts = json.load(source)

    rare = [name for name, count in counts.items() if count <= amount]
    for name in rare:
        print(name)
    print("There are " + str(len(rare)) + " ingredients with up to " + str(amount) + " occurrances")
|
|
|
|
|
|
# Checks the instructions for a category word whose specific ingredient is not
# yet covered by the category lists. When one is found, the recipe's data and
# the step are appended to a text file, so that a user can decide which words
# should be added to the lists of replacement words.
def check_word(word, ingr_list_file, input_file='dataset_sep_sentences.json', output='ingredients.txt'):
    """Report recipes that use a category word without a listed matching ingredient.

    For every recipe, the first step containing ``word`` is examined. If the
    recipe's ingredients contain neither an entry of the category list nor the
    bare category word, the recipe name, key, the step and those ingredients
    that are keys of ``ingr_list_file`` are appended to ``output`` (only when
    at least one such ingredient exists). Each recipe is reported at most once.

    Args:
        word: Category word including its terminator, e.g. "Brot " or "Brot.".
        ingr_list_file: Path of a JSON object whose keys are the candidate
            ingredient names.
        input_file: Dataset file below ``data_path``.
        output: Path of the report file; opened in append mode and, unlike
            the input, not prefixed with ``data_path``.
    """
    # get list of all cleaned ingredients > 20
    with open(ingr_list_file, "r") as json_file:
        whole_word_list = json.load(json_file).keys()

    # (accepted spellings, category list, bare category word)
    category_table = (
        (("Käse ", "Käse.", "Käsescheibe"), cheese, "Käse"),
        (("Milch ", "Milch."), milk, "Milch"),
        (("Fleisch ", "Fleisch."), meat, "Fleisch"),
        (("Wurst ", "Wurst."), sausage, "Wurst"),
        (("Würste ", "Würste."), sausage, "Würste"),
        (("Fisch ", "Fisch."), fish, "Fisch"),
        (("Nudeln ", "Nudeln."), pasta, "Nudeln"),
        (("Brot ", "Brot."), bread, "Brot"),
        (("Semmel ", "Semmel.", "Semmeln ", "Semmeln."), rolls, "Semmel"),
        (("Brötchen ", "Brötchen."), rolls, "Brötchen"),
    )
    for spellings, word_list, word_pure in category_table:
        if word in spellings:
            break
    else:
        print(Fore.LIGHTRED_EX + "This category can't be processed correctly!")
        return

    with open(data_path + input_file, "r") as steps_json_file:
        whole_dataset = json.load(steps_json_file)

    for recipe, entry in whole_dataset.items():
        # becomes true once the recipe is covered by the lists or was reported
        handled = False
        for step in entry['instructions']:
            if word not in step:
                continue
            recipe_ingredients = entry['ingredients']
            covered = any(candidate in recipe_ingredients for candidate in word_list)
            if covered or word_pure in recipe_ingredients:
                handled = True
            if handled:
                continue

            with open(output, "a") as report:
                handled = True
                out_list = [ingr for ingr in recipe_ingredients if ingr in whole_word_list]
                if out_list:
                    report.write("\n" + entry['name'] + "\n")
                    report.write(recipe + "\n")
                    report.write(step + "\n")
                    for ingr2 in out_list:
                        report.write(ingr2 + "\n")
|
|
|
|
def main():
    """Run the step-cleaning pipeline on ``dataset_cleaned_nice.json``.

    1. ``sep_sentences`` splits the instructions into sentences and writes
       ``dataset_sep_sentences.json``.
    2. ``clean_steps`` rewrites ingredient mentions and writes
       ``dataset_cleaned_steps.json``.
    3. ``sep_sentences_file`` extracts the cleaned steps into
       ``cleaned_sep_sentences.json``.

    All files live below ``data_path``. The commented calls are kept as a
    record of the manual analysis runs.
    """
    print("started")
    # separate sentences
    with open(data_path + "dataset_cleaned_nice.json", "r") as whole_cleaned_json_file:
        whole_dataset_cleaned = json.load(whole_cleaned_json_file)
    sep_sentences(whole_dataset_cleaned)

    clean_steps(input_file='dataset_sep_sentences.json', output='dataset_cleaned_steps.json')

    sep_sentences_file(input_file='dataset_cleaned_steps.json', output_file='cleaned_sep_sentences.json')
    #
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')

    # count_occurrances(300)
    # NOTE: check_word requires the ingredient list file as second argument;
    # the first four calls below predate that parameter.
    # check_word("Käse ")
    # check_word("Käse.")
    # check_word("Käsescheibe")
    # check_word("Milch ")
    # check_word("Fleisch ", "data/meats.json")
    # check_word("Fleisch.", "data/meats.json")
    # check_word("Wurst ", "data/sausage.json")
    # check_word("Wurst.", "data/sausage.json")
    # check_word("Würste ", "data/sausage.json")
    # check_word("Würste.", "data/sausage.json")
    # check_word("Fisch ", "data/fish.json")
    # check_word("Fisch.", "data/fish.json")
    # check_word("Nudeln ", "data/pasta.json")
    # check_word("Nudeln.", "data/pasta.json")
    # check_word("Brot ", "data/bread.json")
    # check_word("Brot.", "data/bread.json")
    # check_word("Semmel ", "data/rolls.json")
    # check_word("Semmel.", "data/rolls.json")
    # check_word("Semmeln ", "data/rolls.json")
    # check_word("Semmeln.", "data/rolls.json")
    # check_word("Brötchen ", "data/rolls.json")
    # check_word("Brötchen.", "data/rolls.json")
    # check_word("test", "data/rolls.json")

    # output = replace_categories("Den Käse auf das Burger Brot legen", "Burgerbrötchen")
    # print(output)

    # test
    # clean_steps(input_file='dataset_test.json', output='dataset_cleaned_steps_test.json')
    # sep_sentences_file(input_file='dataset_cleaned_steps_test.json', output_file='cleaned_sep_sentences_test.json')
    # check_occurrances(step_file='cleaned_sep_sentences_test.json', out_file='cleaned_steps_occurrance_test.json')

    # clean_steps()
    # sep_sentences_file(input_file='dataset_clean_steps.json', output_file='cleaned_sep_sentences.json')
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')


# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse replace_categories) does not start a full run.
if __name__ == "__main__":
    main()