Skip to content

Commit 423bb01

Browse files
author
Joey Erskine
committed
added finecooking spider
1 parent 9b7d456 commit 423bb01

File tree

1 file changed

+76
-0
lines changed

1 file changed

+76
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from scrapy.contrib.spiders import CrawlSpider, Rule
2+
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
3+
from scrapy.selector import HtmlXPathSelector
4+
from openrecipes.items import RecipeItem, RecipeItemLoader
5+
import re
6+
7+
8+
class FinecookingMixin(object):
    """Recipe-page parsing shared by the finecooking.com spiders.

    Provides ``parse_item``, which turns a recipe page response into a list
    of ``RecipeItem`` objects via ``RecipeItemLoader``.
    """

    source = 'finecooking'

    # Base against which image ``src`` attributes are resolved.
    _base_url = 'http://www.finecooking.com'

    # Section labels such as "For the sauce" appear among the ingredient
    # text nodes; they are headings, not ingredients.
    _label_regex = re.compile(r'^For ')

    @classmethod
    def _clean_ingredients(cls, raw_ingredients):
        """Return the stripped ingredient strings, without section labels.

        Entries that are blank after stripping, start with ``"For "`` or end
        with ``":"`` are dropped.
        """
        cleaned = []
        for raw in raw_ingredients:
            ingredient = raw.strip()
            if not ingredient:
                continue
            if cls._label_regex.match(ingredient) or ingredient.endswith(':'):
                continue
            cleaned.append(ingredient)
        return cleaned

    @classmethod
    def _image_urls(cls, srcs):
        """Return a list holding the absolute URL of the first image, if any.

        An empty *srcs* yields an empty list, so a page without a photo does
        not get the bare site root recorded as its image. Only the first
        entry is used.
        """
        try:  # Python 3
            from urllib.parse import urljoin
        except ImportError:  # Python 2
            from urlparse import urljoin
        return [urljoin(cls._base_url, src.strip())
                for src in srcs[:1] if src.strip()]

    def parse_item(self, response):
        """Extract the recipes found on a recipe page.

        Args:
            response: the downloaded page, as passed by the crawler.

        Returns:
            A list with one loaded item per recipe container on the page
            (empty when the page has no recipe container).
        """
        hxs = HtmlXPathSelector(response)

        base_path = './/*[@id="ctl00_phMainContent_recipe_content"]'
        recipes_scopes = hxs.select(base_path)

        # NOTE: these expressions start with '//', so they are evaluated
        # against the whole document rather than relative to each scope.
        name_path = '//*[@class="main-title"]/text()'
        description_path = '//*[@property="v:summary"]/text()'
        image_path = '//*[@rel="v:photo"]/@src'
        # TODO: prepTime, cookTime and datePublished have no selectors yet.
        recipeYield_path = '//*[@class="yield"]/text()'
        ingredients_path = '//*[@property="v:amount"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value(
                'image',
                self._image_urls(r_scope.select(image_path).extract()))
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())

            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())

            il.add_value(
                'ingredients',
                self._clean_ingredients(
                    r_scope.select(ingredients_path).extract()))

            recipes.append(il.load_item())

        return recipes
59+
60+
61+
class FinecookingcrawlSpider(CrawlSpider, FinecookingMixin):
    """Crawl finecooking.com starting from its browse-all-ingredients page.

    Links matching the second rule are handed to ``parse_item`` (inherited
    from ``FinecookingMixin``); links matching the first rule have no
    callback and are only followed to discover further links.
    """

    name = "finecooking.com"

    allowed_domains = ["finecooking.com"]

    start_urls = [
        "http://www.finecooking.com/browseallingredients.aspx",
    ]

    # Patterns are raw strings with the '.' escaped so that only a literal
    # ".aspx" suffix matches.
    rules = (
        # /recipes/<letters>/<digits>.aspx -- presumably paginated listing
        # pages (TODO confirm); followed, not parsed.
        Rule(SgmlLinkExtractor(allow=(r'/recipes/[A-Za-z]+/\d+\.aspx',))),

        # /recipes/<lowercase letters>.aspx -- parsed as a recipe page.
        Rule(SgmlLinkExtractor(allow=(r'/recipes/[a-z]+\.aspx',)),
             callback='parse_item'),
    )

0 commit comments

Comments
 (0)