initial commit of project
This commit is contained in:
79
crawl_recipes/crawl_recipe_pages.py
Normal file
79
crawl_recipes/crawl_recipe_pages.py
Normal file
@@ -0,0 +1,79 @@
|
||||
# adapted by Franziska Paulus from:
|
||||
# Copyright 2016 Leon Zucchini
|
||||
#
|
||||
# This file is part of the "recipes" project
|
||||
# Repository: https://github.com/leonzucchini/recipes
|
||||
"""
|
||||
Crawl through recipe pages and parse data to database
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
# import grequests
|
||||
import pandas as pd
|
||||
import crawl_recipes.parse_recipe_pages
|
||||
from crawl_recipes.tools import (
|
||||
get_user_agent,
|
||||
get_response
|
||||
)
|
||||
|
||||
def main():
    """Crawl recipe pages and accumulate parsed data into a JSON dataset.

    Reads one URL per line from ``recipe_urls.txt``, fetches each page with a
    fixed desktop user-agent, parses it via
    ``crawl_recipes.parse_recipe_pages.parse_recipe_info`` and merges the
    result into a single dict.  The dataset is checkpointed to
    ``crawl_recipes/dataset.json`` roughly every 1000 pages and written once
    more after the loop completes.
    """
    # Single fixed user-agent; an earlier revision rotated agents from a
    # downloaded list via get_user_agent (see crawl_recipes.tools).
    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR) '
                      'AppleWebKit/528.16 (KHTML, like Gecko) '
                      'Version/4.0 Safari/528.16'
    }
    url_list_path = "recipe_urls.txt"
    dataset_path = "crawl_recipes/dataset.json"

    # Accumulator for all parsed recipes (was json.loads('{}') — a plain
    # dict literal is equivalent and clearer).
    recipe_json = {}

    # Context manager guarantees the URL file is closed even if a fetch or
    # parse raises; the original open()/close() pair leaked on error.
    with open(url_list_path, 'r') as url_file:
        for count, line in enumerate(url_file, start=1):
            # Lines read from a file keep their trailing newline; the original
            # passed it straight into the request URL — strip it here.
            url = line.strip()
            if not url:
                continue  # tolerate blank lines in the URL list

            print("parsing recipe " + str(count) + " from URL: " + url)

            # Fetch the page with the selected user-agent.
            response = get_response.HTMLresponse(url, user_agent=user_agent)
            text = response.text

            # Parse info on page; parse_recipe_info is expected to return
            # None on failure (guarded below) — TODO confirm against module.
            result = crawl_recipes.parse_recipe_pages.parse_recipe_info(url, text)
            if result is not None:
                recipe_json.update(result)

            # Periodic checkpoint so a crash does not lose all progress.
            if count % 1000 == 0:
                with open(dataset_path, 'w') as f:
                    json.dump(recipe_json, f, indent=4)

    # Final save of the complete dataset.
    with open(dataset_path, 'w') as f:
        json.dump(recipe_json, f, indent=4)
    print("Done!")


if __name__ == "__main__":
    # Guarded entry point: the original called main() unconditionally on
    # import, which ran the whole crawl as a side effect.
    main()
|
||||
Reference in New Issue
Block a user