initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

View File

@@ -0,0 +1,121 @@
# adapted by Franziska Paulus from:
# Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes
import json
import os
import re
from bs4 import BeautifulSoup as bs
import pandas as pd
# import py2neo as pn
def parse_info(file_path, counter=1, verbose=True):
    """Parse one saved chefkoch.de category-list HTML page for recipe metadata.

    Args:
        file_path: Path to a saved HTML file. The file name is expected to look
            like "g<digits><text>_<digits>...<.txt>" — both regexes below rely
            on that naming scheme (TODO confirm against the crawler output).
        counter: Running hit counter, used only for progress output.
        verbose: When True, print one line per search hit (parsed or failed).

    Returns:
        Tuple ``(recipes, counter)`` where ``recipes`` maps recipe id ->
        dict of metadata and ``counter`` is the incremented running count.
    """
    this_counter = counter
    # Open file and parse with BS4
    with open(file_path, "r") as f:
        soup = bs(f.read(), "lxml")
    # Get category from the file name, e.g. "g42salate_0....txt" -> "salate"
    category = re.match(r"g\d+(\w+)_.*", os.path.basename(file_path)).group(1)
    recipes = {}
    search_hits = soup.find_all("script", type="application/ld+json")
    for hit in search_hits:
        # Parse results in search hit for basic information.
        # Best-effort: any malformed hit is reported (when verbose) and skipped.
        try:
            # Recipe ID (renamed from `id` to avoid shadowing the builtin)
            recipe_id = hit['id'].replace('recipe-', '')
            recipes[recipe_id] = {}
            # Info on category and category list page
            pNum, rText, sNum = re.match(r".*(g\d+)(.*)_(\d*).*\.(txt)", file_path).group(1, 2, 3)
            recipes[recipe_id]["category"] = category
            recipes[recipe_id]["category_list_page"] = "www.chefkoch.de/rs/s" + sNum + pNum + "/" + rText + ".html"
            # URL, title, subtitle
            recipes[recipe_id]["url"] = "".join(["http://www.chefkoch.de", hit.a['href']])
            recipes[recipe_id]["title"] = hit.a.find("div", class_="search-list-item-title").get_text().strip().replace("\n", "")
            recipes[recipe_id]["subtitle"] = hit.a.find("p", class_="search-list-item-subtitle").get_text().strip().replace("\n", "")
            # Votes (number and average), both parsed out of the stars tooltip
            votes_raw = hit.a.find("span", class_="search-list-item-uservotes-stars")["title"]
            recipes[recipe_id]["votes_n"] = re.match(r"^(\d*)\s.*", votes_raw).group(1)
            recipes[recipe_id]["votes_avg"] = re.match(r".*\s(.*?)$", votes_raw).group(1)
            # Other info
            recipes[recipe_id]["difficulty"] = hit.a.find("span", class_="search-list-item-difficulty").get_text()
            recipes[recipe_id]["preptime"] = hit.a.find("span", class_="search-list-item-preptime").get_text()
            recipes[recipe_id]["activationdate"] = hit.a.find("span", class_="search-list-item-activationdate").get_text()
            if verbose:
                print(file_path + " #" + str(this_counter) + " successfully parsed")
        except Exception as err:
            # Report WHAT failed instead of silently discarding the reason.
            if verbose:
                print(file_path + " #" + str(this_counter) + " problem with parsing: " + repr(err))
        this_counter += 1
    return (recipes, this_counter)
def parse_subpage(file_path, counter=1, verbose=True):
    """Parse one saved search-result page and return the recipe URLs as a list.

    (The previous docstring claimed a "dict of dicts" was returned — it is a
    list of URL strings.)

    Args:
        file_path: Path to a saved HTML file.
        counter: Unused; kept for interface symmetry with ``parse_info``.
        verbose: Unused; kept for interface symmetry with ``parse_info``.

    Returns:
        List of recipe URL strings extracted from the page's JSON-LD item list.
    """
    # Open file and parse with BS4
    with open(file_path, "r") as f:
        soup = bs(f.read(), "lxml")
    # The second application/ld+json script block holds the item list with
    # the URLs; raw tabs are stripped so json.loads accepts the payload.
    script_tag = soup.find_all("script", type="application/ld+json")[1]
    hits_json = json.loads(script_tag.string.replace("\t", " "))
    return [hit["url"] for hit in hits_json["itemListElement"]]
# def neo_dict(dictionary):
# """ Write content of dictionary to graph database as attributes (node = dict name). """
# pass
def main():
    """Collect the unique recipe URLs from every crawled page and save them.

    Reads all files in ``crawl_recipes/textFiles/``, extracts the recipe URLs
    via ``parse_subpage``, deduplicates them preserving first-seen order, and
    writes one URL per line to ``recipe_urls.txt``.
    """
    folder_path = "crawl_recipes/textFiles/"
    out_path = "recipe_urls.txt"
    curr_file_nr = 0
    curr_url_nr = 0
    # Get file names from folder (plain comprehension; the original abused a
    # list comprehension purely for its append side effect)
    file_paths = [os.path.join(folder_path, fn) for fn in os.listdir(folder_path)]
    # file_paths.pop(0) # Remove log file from list
    # Get URLs; `seen` gives O(1) duplicate checks while the list keeps order
    counter = 1
    all_recipe_urls = []
    seen = set()
    for fp in file_paths:
        print("file nr: " + str(curr_file_nr))
        curr_file_nr = curr_file_nr + 1
        urls = parse_subpage(fp, counter=counter, verbose=False)
        for url in urls:
            print("URL Nr.: " + str(curr_url_nr))
            curr_url_nr = curr_url_nr + 1
            if url not in seen:
                seen.add(url)
                all_recipe_urls.append(url)
    # Save to file
    with open(out_path, 'w') as f:
        for item in all_recipe_urls:
            print(item)
            f.write("%s\n" % item)
    print("All done!")


if __name__ == "__main__":
    main()