# adapted by Franziska Paulus from:
# Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes
"""Parse locally saved chefkoch.de search-result pages and collect recipe URLs."""
import json
import os
import re

from bs4 import BeautifulSoup as bs
import pandas as pd  # noqa: F401  (unused here; kept for project compatibility)
# import py2neo as pn


def parse_info(file_path, counter=1, verbose=True):
    """Parse one saved search-result page for per-recipe metadata.

    Parameters
    ----------
    file_path : str
        Path to a saved HTML file named like ``g<page><category>_<n>...txt``
        (the category and list-page info are reconstructed from this name).
    counter : int
        Running counter used only in progress messages; the updated value is
        returned so callers can thread it through successive calls.
    verbose : bool
        If True, print one line per parsed (or failed) search hit.

    Returns
    -------
    tuple[dict, int]
        Mapping of recipe id -> attribute dict, and the updated counter.
    """
    this_counter = counter

    # Open file and parse with BS4.
    with open(file_path, "r", encoding="utf-8") as f:
        soup = bs(f.read(), "lxml")

    # Category is encoded in the file name, e.g. "g12Pasta_0....txt" -> "Pasta".
    category = re.match(r"g\d+(\w+)_.*", os.path.basename(file_path)).group(1)

    recipes = {}
    search_hits = soup.find_all("script", type="application/ld+json")
    for hit in search_hits:
        # Parse each search hit for basic information. Scraping is
        # best-effort: a hit missing any expected tag/attribute is skipped
        # (reported when verbose) rather than aborting the whole page.
        try:
            # Recipe ID (renamed from `id` to avoid shadowing the builtin).
            recipe_id = hit["id"].replace("recipe-", "")
            info = recipes[recipe_id] = {}

            # Info on category and category list page.
            info["category"] = category
            p_num, r_text, s_num = re.match(
                r".*(g\d+)(.*)_(\d*).*\.(txt)", file_path
            ).group(1, 2, 3)
            info["category_list_page"] = (
                "www.chefkoch.de/rs/s" + s_num + p_num + "/" + r_text + ".html"
            )

            # URL, title, subtitle.
            info["url"] = "".join(["http://www.chefkoch.de", hit.a["href"]])
            info["title"] = (
                hit.a.find("div", class_="search-list-item-title")
                .get_text().strip().replace("\n", "")
            )
            info["subtitle"] = (
                hit.a.find("p", class_="search-list-item-subtitle")
                .get_text().strip().replace("\n", "")
            )

            # Votes (number and average), e.g. "123 ... 4.5".
            votes_raw = hit.a.find(
                "span", class_="search-list-item-uservotes-stars"
            )["title"]
            info["votes_n"] = re.match(r"^(\d*)\s.*", votes_raw).group(1)
            info["votes_avg"] = re.match(r".*\s(.*?)$", votes_raw).group(1)

            # Other info.
            info["difficulty"] = hit.a.find(
                "span", class_="search-list-item-difficulty").get_text()
            info["preptime"] = hit.a.find(
                "span", class_="search-list-item-preptime").get_text()
            info["activationdate"] = hit.a.find(
                "span", class_="search-list-item-activationdate").get_text()

            if verbose:
                print(file_path + " #" + str(this_counter) + " successfully parsed")
        except Exception:
            if verbose:
                print(file_path + " #" + str(this_counter) + " problem with parsing")
        # NOTE(review): counter increments once per hit; the original file's
        # indentation was mangled, so confirm this placement against history.
        this_counter += 1

    return (recipes, this_counter)


def parse_subpage(file_path, counter=1, verbose=True):
    """Extract recipe URLs from one saved search-result page.

    The second ``application/ld+json`` <script> block on the page holds an
    ItemList whose elements carry the recipe URLs.

    Parameters
    ----------
    file_path : str
        Path to a saved HTML file.
    counter, verbose :
        Accepted for interface symmetry with parse_info(); unused here.

    Returns
    -------
    list[str]
        Recipe URLs in page order (may contain duplicates across files).
    """
    # Open file and parse with BS4.
    with open(file_path, "r", encoding="utf-8") as f:
        soup = bs(f.read(), "lxml")

    # Second ld+json block is the ItemList; tabs break json.loads, strip them.
    item_list = soup.find_all("script", type="application/ld+json")[1]
    hits_json = json.loads(item_list.string.replace("\t", " "))
    return [hit["url"] for hit in hits_json["itemListElement"]]


# def neo_dict(dictionary):
#     """ Write content of dictionary to graph database as attributes (node = dict name). """
#     pass


def main():
    """Collect recipe URLs from all crawled files and write them to a text file."""
    folder_path = "crawl_recipes/textFiles/"
    out_path = "recipe_urls.txt"

    # Get file names from folder.
    file_paths = [os.path.join(folder_path, fn) for fn in os.listdir(folder_path)]
    # file_paths.pop(0)  # Remove log file from list

    # Collect URLs, de-duplicated while preserving first-seen order.
    all_recipe_urls = []
    seen = set()  # O(1) membership test instead of scanning the list per URL
    curr_url_nr = 0
    for curr_file_nr, fp in enumerate(file_paths):
        print("file nr: " + str(curr_file_nr))
        for url in parse_subpage(fp, verbose=False):
            print("URL Nr.: " + str(curr_url_nr))
            curr_url_nr += 1
            if url not in seen:
                seen.add(url)
                all_recipe_urls.append(url)

    # Save to file.
    with open(out_path, "w") as f:
        for item in all_recipe_urls:
            print(item)
            f.write("%s\n" % item)

    print("All done!")


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for parse_info) does not
    # trigger the full crawl; running the script behaves as before.
    main()