initial commit of project
This commit is contained in:
121
crawl_recipes/parse_category_subpages.py
Normal file
121
crawl_recipes/parse_category_subpages.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# adapted by Franziska Paulus from:
|
||||
# Copyright 2016 Leon Zucchini
|
||||
#
|
||||
# This file is part of the "recipes" project
|
||||
# Repository: https://github.com/leonzucchini/recipes
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup as bs
|
||||
import pandas as pd
|
||||
# import py2neo as pn
|
||||
|
||||
def parse_info(file_path, counter=1, verbose=True):
    """Parse a saved category-list HTML file for recipe metadata.

    Reads the file at *file_path*, finds the ``application/ld+json`` search-hit
    tags and extracts per-recipe information (category, category list page,
    URL, title, subtitle, votes, difficulty, prep time, activation date).

    Parameters
    ----------
    file_path : str
        Path to a saved HTML page. The basename must match ``g<digits><text>_...``
        (the category name is extracted from it).
    counter : int, optional
        Running counter used only for progress messages; incremented once per
        search hit and returned so callers can keep a global count.
    verbose : bool, optional
        When True, print one success/failure line per search hit.

    Returns
    -------
    tuple
        ``(recipes, this_counter)`` where ``recipes`` maps recipe id to a dict
        of the extracted fields and ``this_counter`` is the updated counter.
    """
    this_counter = counter

    # Open file and parse with BS4 (explicit encoding so scraped pages decode
    # the same way on every platform).
    with open(file_path, "r", encoding="utf-8") as f:
        soup = bs(f.read(), "lxml")

    # Category is encoded in the file name, e.g. "g12Salat_0.txt" -> "Salat"
    category = re.match(r"g\d+(\w+)_.*", os.path.basename(file_path)).group(1)
    recipes = {}

    search_hits = soup.find_all("script", type="application/ld+json")
    for hit in search_hits:
        # Parse results in search hit for basic information
        try:
            # Recipe ID (renamed from `id` to avoid shadowing the builtin)
            recipe_id = hit['id'].replace('recipe-', '')
            recipes[recipe_id] = {}

            # Info on category and category list page, reconstructed from the
            # file name: page number, category text, and sub-page number.
            recipes[recipe_id]["category"] = category
            pNum, rText, sNum = re.match(r".*(g\d+)(.*)_(\d*).*\.(txt)", file_path).group(1, 2, 3)
            recipes[recipe_id]["category_list_page"] = "www.chefkoch.de/rs/s" + sNum + pNum + "/" + rText + ".html"

            # URL, title, subtitle
            recipes[recipe_id]["url"] = "".join(["http://www.chefkoch.de", hit.a['href']])
            recipes[recipe_id]["title"] = hit.a.find("div", class_="search-list-item-title").get_text().strip().replace("\n", "")
            recipes[recipe_id]["subtitle"] = hit.a.find("p", class_="search-list-item-subtitle").get_text().strip().replace("\n", "")

            # Votes (number and average) come from the stars-span title text:
            # leading digits are the count, the last whitespace-separated token
            # is the average.
            votes_raw = hit.a.find("span", class_="search-list-item-uservotes-stars")["title"]
            recipes[recipe_id]["votes_n"] = re.match(r"^(\d*)\s.*", votes_raw).group(1)
            recipes[recipe_id]["votes_avg"] = re.match(r".*\s(.*?)$", votes_raw).group(1)

            # Other info
            recipes[recipe_id]["difficulty"] = hit.a.find("span", class_="search-list-item-difficulty").get_text()
            recipes[recipe_id]["preptime"] = hit.a.find("span", class_="search-list-item-preptime").get_text()
            recipes[recipe_id]["activationdate"] = hit.a.find("span", class_="search-list-item-activationdate").get_text()

            if verbose:
                print(file_path + " #" + str(this_counter) + " successfully parsed")

        except Exception:
            # Best-effort scraping: a hit that lacks the expected structure is
            # skipped (and reported when verbose) rather than aborting the file.
            if verbose:
                print(file_path + " #" + str(this_counter) + " problem with parsing")

        this_counter += 1

    return (recipes, this_counter)
|
||||
|
||||
def parse_subpage(file_path, counter=1, verbose=True):
    """Parse a saved category sub-page and return its recipe URLs as a list.

    The second ``application/ld+json`` script tag on the page carries an
    item list; each element's ``url`` field is collected in page order.
    ``counter`` and ``verbose`` are accepted for interface symmetry with
    the other parser but are not used here.
    """
    counter_unused = counter  # kept only so the signature mirrors parse_info

    # Read the saved HTML and hand it to BeautifulSoup.
    with open(file_path, "r") as fh:
        page = bs(fh.read(), "lxml")

    # The item list lives in the *second* ld+json tag; tab characters inside
    # the payload are replaced with spaces so json.loads accepts it.
    item_list_tag = page.find_all("script", type="application/ld+json")[1]
    item_list = json.loads(item_list_tag.string.replace("\t", " "))

    # Collect every recipe URL in document order.
    return [entry["url"] for entry in item_list["itemListElement"]]
|
||||
|
||||
|
||||
# def neo_dict(dictionary):
|
||||
# """ Write content of dictionary to graph database as attributes (node = dict name). """
|
||||
# pass
|
||||
|
||||
def main():
    """Collect recipe URLs from every saved sub-page file and write them out.

    Walks all files under ``folder_path``, extracts recipe URLs via
    ``parse_subpage``, de-duplicates them (first occurrence wins) and writes
    one URL per line to ``out_path``. Progress is printed along the way.
    """
    folder_path = "crawl_recipes/textFiles/"
    out_path = "recipe_urls.txt"
    curr_file_nr = 0
    curr_url_nr = 0

    # Get file names from folder. (A plain comprehension replaces the
    # original side-effect-only list comprehension around .append.)
    file_paths = [os.path.join(folder_path, fn) for fn in os.listdir(folder_path)]
    # file_paths.pop(0)  # Remove log file from list

    # Get URLs. `seen` gives O(1) duplicate checks (the original tested
    # membership in a list, which is O(n) per URL) while `all_recipe_urls`
    # preserves first-seen order for the output file.
    counter = 1
    all_recipe_urls = []
    seen = set()
    for fp in file_paths:
        print("file nr: " + str(curr_file_nr))
        curr_file_nr = curr_file_nr + 1
        urls = parse_subpage(fp, counter=counter, verbose=False)
        for url in urls:
            print("URL Nr.: " + str(curr_url_nr))
            curr_url_nr = curr_url_nr + 1

            if url not in seen:
                seen.add(url)
                all_recipe_urls.append(url)

    # Save to file, one URL per line.
    with open(out_path, 'w') as f:
        for item in all_recipe_urls:
            print(item)
            f.write("%s\n" % item)

    print("All done!")
|
||||
|
||||
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user