initial commit of project
This commit is contained in:
119
crawl_recipes/crawl_category_subpages.py
Normal file
119
crawl_recipes/crawl_category_subpages.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# adapted by Franziska Paulus from:
|
||||
# Copyright 2016 Leon Zucchini
|
||||
#
|
||||
# This file is part of the "recipes" project
|
||||
# Repository: https://github.com/leonzucchini/recipes
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime as dt
|
||||
|
||||
from crawl_recipes.tools import (
|
||||
get_input,
|
||||
get_response,
|
||||
make_folders,
|
||||
remove_pyc,
|
||||
write_text
|
||||
)
|
||||
|
||||
|
||||
|
||||
def crawl_categories(category_dict, folder_path,
                     log_name, log_option="Exit", files_option="Exit",
                     verbose=False, short_cycle=False):
    """
    Crawl the sub-pages of every recipe category and store their HTML.

    Each URL in ``category_dict`` is split around the numeric offset that
    follows ``chefkoch.de/rs/s``. The offset is then advanced in steps of 30,
    every resulting URL is requested, and each successful response is written
    to ``folder_path``. Crawling of one category stops once more than five
    requests have failed or returned the "no recipes found" page.

    Args:
        category_dict: Mapping whose values are category URLs (keys unused).
        folder_path: Directory that receives the HTML files and the log file.
        log_name: File name of the log inside ``folder_path``.
        log_option: Passed through to ``write_text.write_text`` for the log.
        files_option: Passed through to ``write_text.write_text`` for pages.
        verbose: If true, print every log message as it is produced.
        short_cycle: If true, additionally stop a category once the offset
            exceeds 100 (shorter cycle for testing).
    """
    url_pattern = re.compile(r"(.*chefkoch\.de/rs/s)\d+(.*)")
    not_found_pattern = re.compile(
        "Zu deiner Suchanfrage konnten keine Rezepte gefunden werden.")

    log_list = []
    log_path = os.path.join(folder_path, log_name)

    # Get syntax framework: (text before the offset, text after the offset)
    url_tuples = []
    for category_url in category_dict.values():
        parts = url_pattern.match(category_url)
        if parts is None:
            # Skip URLs that do not follow the expected layout instead of
            # aborting the whole crawl with an AttributeError.
            skip_message = " ".join(
                ["Skipped category URL with unexpected format:", str(category_url)])
            log_list.append(skip_message)
            if verbose:
                print(skip_message)
            continue
        url_tuples.append(parts.groups())

    for url_prefix, url_suffix in url_tuples:

        # Cycle through URL sub-pages
        subpage_no = 0
        url_error_count = 0

        # Give up on a category after more than 5 bad responses
        while url_error_count <= 5:
            if short_cycle and subpage_no > 100:
                break  # Use shorter cycle for testing

            # Get response
            cat_url = "".join([url_prefix, str(subpage_no), url_suffix])
            html_response = get_response.HTMLresponse(cat_url)

            # Check the error flag first so the text of a failed request is
            # never scanned (assumes it may be unusable then -- TODO confirm).
            if html_response.error:
                failure_message = str(html_response.error_message)
            else:
                page_text = html_response.text
                if isinstance(page_text, bytes):
                    # Decoded copy is only used for the check below; invalid
                    # bytes must not abort the crawl.
                    page_text = page_text.decode("utf-8", errors="replace")
                # search(), not match(): the notice sits inside the page,
                # not at its very first character.
                if not_found_pattern.search(page_text):
                    failure_message = " ".join(
                        ["No recipes found on category sub-page:", cat_url])
                else:
                    failure_message = None

            if failure_message is not None:
                # Add to error count and log
                url_error_count += 1
                log_list.append(failure_message)
                if verbose:
                    print(failure_message)
            else:
                # If no error write text to file
                get_message = " ".join(["Got category sub-page:", cat_url])
                log_list.append(get_message)
                # Drop the last five characters of the URL tail (presumably
                # ".html" -- verify against the category links) and slashes.
                file_stem = url_suffix[:-5].replace("/", "")
                file_name = "{}_{}.txt".format(file_stem, subpage_no)
                file_path = os.path.join(folder_path, file_name)
                write_text.write_text(html_response.text, file_path, files_option)

                if verbose:
                    print(get_message)

            # Increment of 30 due to specific syntax of chefkoch.de category sub-pages
            subpage_no += 30

    # Write log to file
    write_text.write_text("\n".join(log_list), log_path, log_option)
|
||||
|
||||
def main():
    """Crawl the chefkoch.de recipe category sub-pages and store their HTML.

    Note:
        Recipes on chefkoch.de are sorted into categories (e.g. baking).
        Each category has many sub-pages containing links to 30 recipes.
        This script crawls through those sub-pages and stores them so the
        recipe links can be parsed later.
    """
    # Load the configuration
    settings = get_input.get_input(
        "crawl_recipes/config/config.json", print_config=False)
    home_dir = settings['_home']

    # Make sure the folder for the downloaded pages exists
    link_files_dir = os.path.join(home_dir, settings['_linkFiles'])
    make_folders.make_output_folder(link_files_dir, debug=False)

    # Read the category URLs and store their sub-pages as local files
    category_links = get_input.get_input(
        os.path.join(home_dir, settings['category_links']),
        print_config=False,
    )
    crawl_options = {
        "log_name": "_category_log.txt",
        "log_option": "Append",
        "files_option": "Exit",
        "verbose": True,
        "short_cycle": False,
    }
    crawl_categories(category_links, link_files_dir, **crawl_options)

    # Clear up pyc files
    remove_pyc.remove_pyc(home_dir)
    print("All done!")
|
||||
|
||||
# Start the crawl only when this file is executed as a script, so that
# importing the module (e.g. to reuse crawl_categories) has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user