# adapted by Franziska Paulus from: # Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes
"""Crawl through recipe pages and parse data to database."""

import json
import os

import pandas as pd
import requests

import crawl_recipes.parse_recipe_pages
from crawl_recipes.tools import (
    get_user_agent,
    get_response,
)


def main():
    """Crawl every recipe URL listed in ``recipe_urls.txt``.

    For each URL, fetch the page with a fixed user-agent, parse the recipe
    data, and accumulate it into one dict. The dataset is checkpointed to
    ``crawl_recipes/dataset.json`` every 1000 recipes so a crash does not
    lose all progress, and written out once more at the end.
    """
    # Fixed user-agent for every request (the user-agent rotation code
    # from the original project was removed upstream).
    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR) '
                      'AppleWebKit/528.16 (KHTML, like Gecko) '
                      'Version/4.0 Safari/528.16'
    }

    url_list_path = "recipe_urls.txt"
    dataset_path = "crawl_recipes/dataset.json"

    recipe_json = {}

    # 'with' guarantees the URL file is closed even if a request or parse
    # step raises mid-crawl (the original left the file open on error).
    with open(url_list_path, 'r') as url_file:
        for count, url in enumerate(url_file, start=1):
            # Strip the trailing newline so the request URL is valid
            # (the original passed the raw line, newline included).
            url = url.strip()
            if not url:
                continue  # skip blank lines in the URL list

            print("parsing recipe " + str(count) + " from URL: " + url)

            # Open page using the selected user-agent.
            response = get_response.HTMLresponse(url, user_agent=user_agent)
            text = response.text

            # Parse info on page; None means the page yielded no recipe.
            result = crawl_recipes.parse_recipe_pages.parse_recipe_info(url, text)
            if result is not None:
                recipe_json.update(result)

            # Checkpoint every 1000 recipes. Using the enumerate counter
            # directly fixes the original off-by-one (it incremented before
            # the check, so the first checkpoint fired after 999 recipes).
            if count % 1000 == 0:
                with open(dataset_path, 'w') as f:
                    json.dump(recipe_json, f, indent=4)

    # Final save of the complete dataset.
    with open(dataset_path, 'w') as f:
        json.dump(recipe_json, f, indent=4)

    print("Done!")


if __name__ == "__main__":
    main()