# adapted by Franziska Paulus from:
# Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes
|
|
import json
|
|
import os
|
|
import re
|
|
from bs4 import BeautifulSoup as bs
|
|
import pandas as pd
|
|
# import py2neo as pn
|
|
|
|
def parse_info(file_path, counter=1, verbose=True):
    """Parse a saved category list page into a dict of per-recipe dicts.

    Args:
        file_path: Path of the saved HTML file. The base name must match
            ``g<digits><category>_...`` (used for the category); the full
            path is additionally matched against
            ``...g<digits><text>_<digits>....txt`` to rebuild the URL of the
            category list page.
        counter: Number given to the first search hit in progress messages.
        verbose: When true, print one status line per search hit.

    Returns:
        A ``(recipes, next_counter)`` tuple. ``recipes`` maps each recipe id
        to a dict with the keys ``category``, ``category_list_page``, ``url``,
        ``title``, ``subtitle``, ``votes_n``, ``votes_avg``, ``difficulty``,
        ``preptime`` and ``activationdate`` (all values are strings). A hit
        that fails part-way leaves an entry holding only the keys extracted
        before the failure. ``next_counter`` is ``counter`` plus the number of
        hits processed.

    Raises:
        AttributeError: If the base name of ``file_path`` does not match the
            category pattern.
    """
    this_counter = counter

    # Open file and parse with BS4
    with open(file_path, "r") as f:
        soup = bs(f.read(), "lxml")

    # Get category
    category = re.match(r"g\d+(\w+)_.*", os.path.basename(file_path)).group(1)

    # The list-page pattern depends only on the path, so it is matched once
    # here instead of once per hit. The result is deliberately not checked
    # yet: a failed match must surface inside the per-hit ``try`` below so it
    # is reported as a parsing problem for every hit rather than raised.
    page_match = re.match(r".*(g\d+)(.*)_(\d*).*\.(txt)", file_path)

    recipes = {}

    # NOTE(review): this selects <script type="application/ld+json"> tags, but
    # the extraction below reads an ``id`` attribute and a nested <a> with
    # "search-list-item-*" children, which looks like markup of a search
    # result list item. Confirm the selector; as written every hit presumably
    # ends up in the ``except`` branch.
    search_hits = soup.find_all("script", type="application/ld+json")
    for hit in search_hits:
        # Parse results in search hit for basic information
        try:
            # Recipe ID
            recipe_id = hit['id'].replace('recipe-', '')
            entry = recipes[recipe_id] = {}

            # Info on category and category list page
            entry["category"] = category
            p_num, r_text, s_num = page_match.group(1, 2, 3)
            entry["category_list_page"] = (
                "www.chefkoch.de/rs/s" + s_num + p_num + "/" + r_text + ".html"
            )

            # URL, title, subtitle
            link = hit.a
            entry["url"] = "".join(["http://www.chefkoch.de", link['href']])
            entry["title"] = (
                link.find("div", class_="search-list-item-title")
                .get_text().strip().replace("\n", "")
            )
            entry["subtitle"] = (
                link.find("p", class_="search-list-item-subtitle")
                .get_text().strip().replace("\n", "")
            )

            # Votes (number and average): the number is the run of digits
            # before the first whitespace of the title attribute, the average
            # is whatever follows its last whitespace.
            votes_raw = link.find(
                "span", class_="search-list-item-uservotes-stars")["title"]
            entry["votes_n"] = re.match(r"^(\d*)\s.*", votes_raw).group(1)
            entry["votes_avg"] = re.match(r".*\s(.*?)$", votes_raw).group(1)

            # Other info
            entry["difficulty"] = link.find(
                "span", class_="search-list-item-difficulty").get_text()
            entry["preptime"] = link.find(
                "span", class_="search-list-item-preptime").get_text()
            entry["activationdate"] = link.find(
                "span", class_="search-list-item-activationdate").get_text()

            if verbose:
                print(file_path + " #" + str(this_counter) + " successfully parsed")

        except Exception:
            # Best effort: one malformed hit must not abort the whole page.
            if verbose:
                print(file_path + " #" + str(this_counter) + " problem with parsing")

        this_counter += 1

    return (recipes, this_counter)
|
|
|
|
def parse_subpage(file_path, counter=1, verbose=True):
    """Extract the recipe URLs from the JSON-LD data of a saved list page.

    The URLs are read from the ``itemListElement`` entries of the second
    ``<script type="application/ld+json">`` block found in the file.

    Args:
        file_path: Path of the saved HTML file.
        counter: Number used to label this file in the skip message.
        verbose: When true, print a message if the file has to be skipped.

    Returns:
        List of the ``url`` values in document order. The list is empty when
        the file has no second JSON-LD block or that block has no text, e.g.
        for a log file stored next to the crawled pages.

    Raises:
        ValueError: If the JSON-LD text is not valid JSON.
        KeyError: If the decoded JSON has no ``itemListElement`` key or an
            item has no ``url`` key.
    """
    # Open file and parse with BS4
    with open(file_path, "r") as f:
        soup = bs(f.read(), "lxml")

    json_blocks = soup.find_all("script", type="application/ld+json")

    # Only the second block is used. Files without it are skipped instead of
    # raising IndexError/AttributeError and aborting the caller's whole run.
    raw_json = json_blocks[1].string if len(json_blocks) > 1 else None
    if not raw_json:
        if verbose:
            print(file_path + " #" + str(counter) + " problem with parsing")
        return []

    # Tabs are replaced by spaces before decoding, presumably because raw tab
    # characters inside JSON strings are rejected by json.loads.
    hits_json = json.loads(raw_json.replace("\t", " "))

    # Get URLs and save to list
    return [hit["url"] for hit in hits_json["itemListElement"]]
|
|
|
|
|
|
# def neo_dict(dictionary):
#     """ Write content of dictionary to graph database as attributes (node = dict name). """
#     pass
|
|
|
|
def main():
    """Collect the unique recipe URLs of all crawled list pages and save them.

    Every regular file in ``crawl_recipes/textFiles/`` is passed to
    ``parse_subpage``. Each distinct URL is written on its own line to
    ``recipe_urls.txt`` in the order it was first seen. Progress (file
    number, running URL number, every saved URL) is printed to stdout.
    """
    folder_path = "crawl_recipes/textFiles/"
    out_path = "recipe_urls.txt"

    # Get file names from folder. Sub-directories are left out because they
    # cannot be opened as a page.
    candidates = (os.path.join(folder_path, fn) for fn in os.listdir(folder_path))
    file_paths = [fp for fp in candidates if os.path.isfile(fp)]

    # Get URLs. The list keeps first-seen order for the output file; the set
    # makes the duplicate check constant-time instead of a scan of the list.
    all_recipe_urls = []
    seen_urls = set()
    curr_url_nr = 0
    for curr_file_nr, fp in enumerate(file_paths):
        print("file nr: " + str(curr_file_nr))
        for url in parse_subpage(fp, counter=1, verbose=False):
            # The running number counts every URL found, duplicates included.
            print("URL Nr.: " + str(curr_url_nr))
            curr_url_nr += 1

            if url not in seen_urls:
                seen_urls.add(url)
                all_recipe_urls.append(url)

    # Save to file
    with open(out_path, 'w') as f:
        for item in all_recipe_urls:
            print(item)
            f.write("%s\n" % item)

    print("All done!")
|
|
|
|
# Run the URL collection only when this file is executed as a script, so that
# importing the module (e.g. to reuse the parsers) does not start it.
if __name__ == "__main__":
    main()
|