Commit: Dev to master (#6)
* Feature/readme cleanup (#1)

* cleaning up readme

* removing coverage

* adding support for python 2.7 (#2)

* adding run_tester to ignore

* rm

* adding back test file

* Feature/budget bytes (#3)

* adding budgetbytes class

* getting the basics working

* updating ignore

* some updates

* better support for recipe miner

* cleaning up setup

* updating git ignore

* updating git ignore

* updating version

* Feature/doc clean up (#4)

* adding utf-8

* adding data fetcher

* updating abstract

* Feature/doc clean up (#5)

* adding utf-8

* adding data fetcher

* updating abstract

* removing some sites I don't use
RyanNoelk authored Apr 6, 2017
1 parent 3a34459 commit 231e2bf
Showing 57 changed files with 201 additions and 689 deletions.
17 changes: 0 additions & 17 deletions .coveragerc

This file was deleted.

1 change: 0 additions & 1 deletion .coveralls.yml

This file was deleted.

3 changes: 3 additions & 0 deletions .gitignore
@@ -1,5 +1,8 @@
*.pyc
*.egg-info
.coverage
/.idea

recipe_scrapers/tests/test_data/*.html
build/*
dist/*
12 changes: 3 additions & 9 deletions .travis.yml
@@ -1,20 +1,14 @@
language: python

python:
- "2.7"
- "3.2"
- "3.3"
- "3.4"
- "3.5"

install:
- pip install beautifulsoup4==4.4.0 coverage==3.7.1 coveralls

before_script:
- curl -L -o test_data.zip https://www.dropbox.com/sh/wkxm933pae6q0e6/AAAXSinoeSn1-fz5Fz_LHhM6a?dl=1
- unzip test_data.zip -d recipe_scrapers/tests/test_data || true
- pip install beautifulsoup4==4.4.0

script:
- coverage run tests.py

after_success:
- coveralls
- python tests.py
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -2,4 +2,5 @@ include LICENSE
 include README.md

 exclude tests.py
+exclude run_tester.py
 recursive-exclude recipe_scrapers/tests *
16 changes: 6 additions & 10 deletions README.md
@@ -1,12 +1,11 @@
 ## Recipe scrapers

 [![Build Status](https://travis-ci.org/hhursev/recipe-scraper.svg?branch=master)](https://travis-ci.org/hhursev/recipe-scraper)
-[![Coverage Status](https://coveralls.io/repos/hhursev/recipe-scraper/badge.svg?branch=master&service=github)](https://coveralls.io/github/hhursev/recipe-scraper?branch=master)

 A simple web scraping tool for recipe sites I use in a project of mine that makes sense to live as
-a separate package. **No Python 2 support.**
+a separate package.

-    pip install git+git://github.com/hhursev/recipe-scraper.git
+    pip install git+git://github.com/RyanNoelk/recipe-scraper.git

 then:

@@ -19,6 +18,10 @@ then:
     scrap_me.total_time()
     scrap_me.ingredients()
     scrap_me.instructions()
+
+or
+
+    scrap_me.data()


### Contribute
@@ -35,24 +38,17 @@ If you are programmer PRs with fixes are warmly welcomed and acknowledged with a

### Scrapers available for:

- [http://101cookbooks.com/](http://101cookbooks.com/)
- [http://allrecipes.com/](http://allrecipes.com/)
- [http://bbc.co.uk/](http://bbc.co.uk/food/recipes/)
- [http://bbcgoodfood.com/](http://bbcgoodfood.com/)
- [http://bonappetit.com/](http://bonappetit.com/)
- [http://closetcooking.com/](http://closetcooking.com/)
- [http://cookstr.com/](http://cookstr.com/)
- [http://epicurious.com/](http://epicurious.com/)
- [http://finedininglovers.com/](https://www.finedininglovers.com/)
- [http://foodrepublic.com/](http://foodrepublic.com)
- [http://jamieoliver.com/](http://www.jamieoliver.com/)
- [http://mybakingaddiction.com/](http://mybakingaddiction.com/)
- [http://paninihappy.com/](http://paninihappy.com/)
- [http://realsimple.com/](http://www.realsimple.com/)
- [http://simplyrecipes.com/](http://www.simplyrecipes.com)
- [http://steamykitchen.com/](http://steamykitchen.com/)
- [http://tastykitchen.com/](http://tastykitchen.com/)
- [http://thepioneerwoman.com/](http://thepioneerwoman.com/)
- [http://thevintagemixer.com/](http://www.thevintagemixer.com/)
- [http://twopeasandtheirpod.com/](http://twopeasandtheirpod.com/)
- [http://whatsgabycooking.com/](http://whatsgabycooking.com/)
18 changes: 4 additions & 14 deletions recipe_scrapers/__init__.py
@@ -1,47 +1,37 @@
#!/usr/bin/env python
# encoding: utf-8
import re

from .allrecipes import AllRecipes
from .bbcfood import BBCFood
from .bbcgoodfood import BBCGoodFood
from .bonappetit import BonAppetit
from .closetcooking import ClosetCooking
from .budgetbytes import BudgetBytes
from .cookstr import Cookstr
from .epicurious import Epicurious
from .finedininglovers import FineDiningLovers
from .foodrepublic import FoodRepublic
from .hundredandonecookbooks import HundredAndOneCookbooks
from .jamieoliver import JamieOliver
from .mybakingaddiction import MyBakingAddiction
from .paninihappy import PaniniHappy
from .realsimple import RealSimple
from .simplyrecipes import SimplyRecipes
from .steamykitchen import SteamyKitchen
from .tastykitchen import TastyKitchen
from .thepioneerwoman import ThePioneerWoman
from .thevintagemixer import TheVintageMixer
from .twopeasandtheirpod import TwoPeasAndTheirPod
from .whatsgabycooking import WhatsGabyCooking


SCRAPERS = {
AllRecipes.host(): AllRecipes,
BBCFood.host(): BBCFood,
BBCGoodFood.host(): BBCGoodFood,
BonAppetit.host(): BonAppetit,
ClosetCooking.host(): ClosetCooking,
BudgetBytes.host(): BudgetBytes,
Cookstr.host(): Cookstr,
Epicurious.host(): Epicurious,
FineDiningLovers.host(): FineDiningLovers,
FoodRepublic.host(): FoodRepublic,
HundredAndOneCookbooks.host(): HundredAndOneCookbooks,
JamieOliver.host(): JamieOliver,
MyBakingAddiction.host(): MyBakingAddiction,
PaniniHappy.host(): PaniniHappy,
RealSimple.host(): RealSimple,
SimplyRecipes.host(): SimplyRecipes,
SteamyKitchen.host(): SteamyKitchen,
TastyKitchen.host(): TastyKitchen,
ThePioneerWoman.host(): ThePioneerWoman,
TheVintageMixer.host(): TheVintageMixer,
TwoPeasAndTheirPod.host(): TwoPeasAndTheirPod,
WhatsGabyCooking.host(): WhatsGabyCooking,
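The `SCRAPERS` dict keys each scraper class by its `host()` value, so picking a scraper is just a dictionary lookup on the URL's domain. A minimal sketch of that dispatch pattern (the `ToyScraper` class and `scrape_me` helper here are invented for illustration, not part of the package):

```python
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

class ToyScraper(object):
    """Hypothetical stand-in for a scraper class such as BudgetBytes."""
    def __init__(self, url):
        self.url = url

    @classmethod
    def host(cls):
        return 'budgetbytes.com'

# Same shape as the package's SCRAPERS dict: host string -> scraper class.
SCRAPERS = {ToyScraper.host(): ToyScraper}

def scrape_me(url):
    """Look up the scraper registered for the URL's domain and instantiate it."""
    domain = urlparse(url).netloc.replace('www.', '')
    return SCRAPERS[domain](url)

scraper = scrape_me('http://www.budgetbytes.com/some-recipe/')
print(type(scraper).__name__)  # prints "ToyScraper"
```

A `KeyError` from the lookup simply means no scraper is registered for that site, which is why the commit also prunes the README's site list to match the registered classes.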
38 changes: 34 additions & 4 deletions recipe_scrapers/_abstract.py
@@ -1,4 +1,10 @@
-from urllib import request
+#!/usr/bin/env python
+# encoding: utf-8
+try:
+    from urllib import request
+except ImportError:
+    from urllib2 import urlopen as request
+    from urllib2 import Request

 from bs4 import BeautifulSoup

@@ -9,15 +15,19 @@
 }


-class AbstractScraper():
+class AbstractScraper(object):

     def __init__(self, url, test=False):
         if test:  # when testing, we load a file
             with url:
                 self.soup = BeautifulSoup(url.read(), "html.parser")
         else:
-            self.soup = BeautifulSoup(request.urlopen(
-                request.Request(url, headers=HEADERS)).read(), "html.parser")
+            try:
+                self.soup = BeautifulSoup(request.urlopen(
+                    request.Request(url, headers=HEADERS)).read(), "html.parser")
+            except AttributeError:
+                self.soup = BeautifulSoup(request(
+                    Request(url, headers=HEADERS)).read(), "html.parser")

def host(self):
""" get the host of the url, so we can use the correct scraper (check __init__.py) """
@@ -26,6 +36,9 @@ def host(self):

     def title(self):
         raise NotImplementedError("This should be implemented.")

+    def servings(self):
+        raise NotImplementedError("This should be implemented.")
+
     def total_time(self):
         """ total time it takes to preparate the recipe in minutes """
         raise NotImplementedError("This should be implemented.")
@@ -35,3 +48,20 @@ def ingredients(self):

     def instructions(self):
         raise NotImplementedError("This should be implemented.")
+
+    def description(self):
+        raise NotImplementedError("This should be implemented.")
+
+    def image(self):
+        raise NotImplementedError("This should be implemented.")
+
+    def data(self):
+        return {
+            'title': self.title(),
+            'servings': self.servings(),
+            'total_time': self.total_time(),
+            'ingredients': self.ingredients(),
+            'instructions': self.instructions(),
+            'description': self.description(),
+            'image': self.image(),
+        }
8 changes: 5 additions & 3 deletions recipe_scrapers/_utils.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# encoding: utf-8
 import re


@@ -23,7 +25,7 @@ def normalize_string(string):
     return re.sub(
         r'\s+', ' ',
         string.replace(
-            '\xa0', ' ').replace(  # &nbsp;
-            '\n', ' ').replace(
-            '\t', ' ').strip()
+            u'\xa0', u' ').replace(  # &nbsp;
+            u'\n', u' ').replace(
+            u'\t', u' ').strip()
     )
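The `u''` prefixes added here make `normalize_string` behave the same on Python 2 and 3: it collapses non-breaking spaces, newlines, tabs, and runs of whitespace into single spaces. A self-contained copy of the function as it stands after this commit, with a sample input:

```python
# -*- coding: utf-8 -*-
import re

def normalize_string(string):
    # Replace NBSP, newlines, and tabs with spaces, trim the ends,
    # then squeeze any remaining whitespace runs down to one space.
    return re.sub(
        r'\s+', ' ',
        string.replace(
            u'\xa0', u' ').replace(
            u'\n', u' ').replace(
            u'\t', u' ').strip()
    )

print(normalize_string(u'1\xa0cup\n\tlentils  '))  # prints "1 cup lentils"
```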
4 changes: 3 additions & 1 deletion recipe_scrapers/allrecipes.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# encoding: utf-8
 from ._abstract import AbstractScraper
 from ._utils import get_minutes, normalize_string


@@ -15,7 +17,7 @@ def total_time(self):
         return get_minutes(self.soup.find('span', {'class': 'ready-in-time'}))

     def ingredients(self):
-        ingredients_html = self.soup.findAll('li', {'class': "checkList__line"})
+        ingredients_html = self.soup.findAll('span', {'class': "recipe-ingred_txt added"})

         return [
             normalize_string(ingredient.get_text())
32 changes: 0 additions & 32 deletions recipe_scrapers/bbcfood.py

This file was deleted.

44 changes: 0 additions & 44 deletions recipe_scrapers/bbcgoodfood.py

This file was deleted.

2 changes: 2 additions & 0 deletions recipe_scrapers/bonappetit.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# encoding: utf-8
 from ._abstract import AbstractScraper
 from ._utils import get_minutes, normalize_string

