diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..e3d7cd0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,74 @@ +### Linux template +*~ + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +*.pyc +*.egg +*.egg-info +*.dmp +*.zip +.DS_Store diff --git a/.travis.yml b/.travis.yml new file mode 100755 index 0000000..e002ddb --- /dev/null +++ b/.travis.yml @@ -0,0 +1,25 @@ +language: python +sudo: false +python: + - '3.6' + - '3.7' +matrix: + allow_failures: + - python: '3.7' +deploy: + provider: pypi + user: beni + password: + secure: XfUbc5Tnjq8mUHnv/rrQvcQ5m+k7mvk2sAwhS1Hzi2NFXaiPQF0YR2er0BDDQOFYba+MBd57l4zHdyti8Y39uVI2ZfY10c5KYio3VzXDU2doycLf7hY8cqKs8UioabVehrPU96GErVEUyA2Jj1cqrsIUX7Smj8qby0DfX+igJtM= + on: + tags: true + repo: abenassi/Google-Search-API +install: +- pip install -r requirements.txt +- python setup.py install +- pip install coveralls +script: +- nosetests +after_success: coveralls +os: +- linux diff --git a/BeautifulSoup.py b/BeautifulSoup.py deleted file mode 100644 index 4b17b85..0000000 --- a/BeautifulSoup.py +++ /dev/null @@ -1,2014 +0,0 @@ -"""Beautiful Soup -Elixir and Tonic -"The Screen-Scraper's Friend" -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup parses a (possibly invalid) XML or HTML document into a -tree representation. It provides methods and Pythonic idioms that make -it easy to navigate, search, and modify the tree. - -A well-formed XML/HTML document yields a well-formed data -structure. An ill-formed XML/HTML document yields a correspondingly -ill-formed data structure. If your document is only locally -well-formed, you can use this library to find and process the -well-formed part of it. - -Beautiful Soup works with Python 2.2 and up. It has no external -dependencies, but you'll have more success at converting data to UTF-8 -if you also install these three packages: - -* chardet, for auto-detecting character encodings - http://chardet.feedparser.org/ -* cjkcodecs and iconv_codec, which add more encodings to the ones supported - by stock Python. - http://cjkpython.i18n.org/ - -Beautiful Soup defines classes for two main parsing strategies: - - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific - language that kind of looks like XML. - - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid - or invalid. This class has web browser-like heuristics for - obtaining a sensible parse tree in the face of common HTML errors. - -Beautiful Soup also defines a class (UnicodeDammit) for autodetecting -the encoding of an HTML or XML document, and converting it to -Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 
- -For more than you ever wanted to know about Beautiful Soup, see the -documentation: -http://www.crummy.com/software/BeautifulSoup/documentation.html - -Here, have some legalese: - -Copyright (c) 2004-2010, Leonard Richardson - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the the Beautiful Soup Consortium and All - Night Kosher Bakery nor the names of its contributors may be - used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. - -""" -from __future__ import generators - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.2.0" -__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson" -__license__ = "New-style BSD" - -from sgmllib import SGMLParser, SGMLParseError -import codecs -import markupbase -import types -import re -import sgmllib -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} -try: - set -except NameError: - from sets import Set as set - -#These hacks make Beautiful Soup able to parse XML with namespaces -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match - -DEFAULT_OUTPUT_ENCODING = "utf-8" - -def _match_css_class(str): - """Build a RE to match the given CSS class.""" - return re.compile(r"(^|.*\s)%s($|\s)" % str) - -# First, the classes that represent markup elements. - -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - def setup(self, parent=None, previous=None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - self.previous = previous - self.next = None - self.previousSibling = None - self.nextSibling = None - if self.parent and self.parent.contents: - self.previousSibling = self.parent.contents[-1] - self.previousSibling.nextSibling = self - - def replaceWith(self, replaceWith): - oldParent = self.parent - myIndex = self.parent.index(self) - if hasattr(replaceWith, "parent")\ - and replaceWith.parent is self.parent: - # We're replacing this element with one of its siblings. 
- index = replaceWith.parent.index(replaceWith) - if index and index < myIndex: - # Furthermore, it comes before this element. That - # means that when we extract it, the index of this - # element will change. - myIndex = myIndex - 1 - self.extract() - oldParent.insert(myIndex, replaceWith) - - def replaceWithChildren(self): - myParent = self.parent - myIndex = self.parent.index(self) - self.extract() - reversedChildren = list(self.contents) - reversedChildren.reverse() - for child in reversedChildren: - myParent.insert(myIndex, child) - - def extract(self): - """Destructively rips this element out of the tree.""" - if self.parent: - try: - del self.parent.contents[self.parent.index(self)] - except ValueError: - pass - - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. - lastChild = self._lastRecursiveChild() - nextElement = lastChild.next - - if self.previous: - self.previous.next = nextElement - if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None - - self.parent = None - if self.previousSibling: - self.previousSibling.nextSibling = self.nextSibling - if self.nextSibling: - self.nextSibling.previousSibling = self.previousSibling - self.previousSibling = self.nextSibling = None - return self - - def _lastRecursiveChild(self): - "Finds the last element beneath this object to be parsed." - lastChild = self - while hasattr(lastChild, 'contents') and lastChild.contents: - lastChild = lastChild.contents[-1] - return lastChild - - def insert(self, position, newChild): - if isinstance(newChild, basestring) \ - and not isinstance(newChild, NavigableString): - newChild = NavigableString(newChild) - - position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent is not None: - # We're 'inserting' an element that's already one - # of this object's children. - if newChild.parent is self: - index = self.index(newChild) - if index > position: - # Furthermore we're moving it further down the - # list of this object's children. That means that - # when we extract this element, our target index - # will jump down one. - position = position - 1 - newChild.extract() - - newChild.parent = self - previousChild = None - if position == 0: - newChild.previousSibling = None - newChild.previous = self - else: - previousChild = self.contents[position-1] - newChild.previousSibling = previousChild - newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._lastRecursiveChild() - if newChild.previous: - newChild.previous.next = newChild - - newChildsLastElement = newChild._lastRecursiveChild() - - if position >= len(self.contents): - newChild.nextSibling = None - - parent = self - parentsNextSibling = None - while not parentsNextSibling: - parentsNextSibling = parent.nextSibling - parent = parent.parent - if not parent: # This is the last element in the document. 
- break - if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling - else: - newChildsLastElement.next = None - else: - nextChild = self.contents[position] - newChild.nextSibling = nextChild - if newChild.nextSibling: - newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild - - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement - self.contents.insert(position, newChild) - - def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.insert(len(self.contents), tag) - - def findNext(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._findOne(self.findAllNext, name, attrs, text, **kwargs) - - def findAllNext(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.nextGenerator, - **kwargs) - - def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._findOne(self.findNextSiblings, name, attrs, text, - **kwargs) - - def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.nextSiblingGenerator, **kwargs) - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x - - def findPrevious(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" - return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) - - def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, self.previousGenerator, - **kwargs) - fetchPrevious = findAllPrevious # Compatibility with pre-3.x - - def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._findOne(self.findPreviousSiblings, name, attrs, text, - **kwargs) - - def findPreviousSiblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._findAll(name, attrs, text, limit, - self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x - - def findParent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments. 
- r = None - l = self.findParents(name, attrs, 1) - if l: - r = l[0] - return r - - def findParents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" - - return self._findAll(name, attrs, None, limit, self.parentGenerator, - **kwargs) - fetchParents = findParents # Compatibility with pre-3.x - - #These methods do the real heavy lifting. - - def _findOne(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _findAll(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if isinstance(name, SoupStrainer): - strainer = name - # (Possibly) special case some findAll*(...) searches - elif text is None and not limit and not attrs and not kwargs: - # findAll*(True) - if name is True: - return [element for element in generator() - if isinstance(element, Tag)] - # findAll*('tag-name') - elif isinstance(name, basestring): - return [element for element in generator() - if isinstance(element, Tag) and - element.name == name] - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - # Build a SoupStrainer - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - g = generator() - while True: - try: - i = g.next() - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #These Generators can be used to navigate starting from both - #NavigableStrings and Tags. - def nextGenerator(self): - i = self - while i is not None: - i = i.next - yield i - - def nextSiblingGenerator(self): - i = self - while i is not None: - i = i.nextSibling - yield i - - def previousGenerator(self): - i = self - while i is not None: - i = i.previous - yield i - - def previousSiblingGenerator(self): - i = self - while i is not None: - i = i.previousSibling - yield i - - def parentGenerator(self): - i = self - while i is not None: - i = i.parent - yield i - - # Utility methods - def substituteEncoding(self, str, encoding=None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) - - def toEncoding(self, s, encoding=None): - """Encodes an object to a string in some encoding, or to Unicode. - .""" - if isinstance(s, unicode): - if encoding: - s = s.encode(encoding) - elif isinstance(s, str): - if encoding: - s = s.encode(encoding) - else: - s = unicode(s) - else: - if encoding: - s = self.toEncoding(str(s), encoding) - else: - s = unicode(s) - return s - -class NavigableString(unicode, PageElement): - - def __new__(cls, value): - """Create a new NavigableString. - - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. - """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) - - def __getnewargs__(self): - return (NavigableString.__str__(self),) - - def __getattr__(self, attr): - """text.string gives you text. 
This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - - def __unicode__(self): - return str(self).decode(DEFAULT_OUTPUT_ENCODING) - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - if encoding: - return self.encode(encoding) - else: - return self - -class CData(NavigableString): - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class ProcessingInstruction(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - output = self - if "%SOUP-ENCODING%" in output: - output = self.substituteEncoding(output, encoding) - return "" % self.toEncoding(output, encoding) - -class Comment(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class Declaration(NavigableString): - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "" % NavigableString.__str__(self, encoding) - -class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" - - def _invert(h): - "Cheap function to invert a hash." - i = {} - for k,v in h.items(): - i[v] = k - return i - - XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", - "quot" : '"', - "amp" : "&", - "lt" : "<", - "gt" : ">" } - - XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) - - def _convertEntities(self, match): - """Used in a call to re.sub to replace HTML, XML, and numeric - entities with the appropriate Unicode characters. If HTML - entities are being converted, any unrecognized entities are - escaped.""" - x = match.group(1) - if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) - elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: - if self.convertXMLEntities: - return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] - else: - return u'&%s;' % x - elif len(x) > 0 and x[0] == '#': - # Handle numeric entities - if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) - else: - return unichr(int(x[1:])) - - elif self.escapeUnrecognizedEntities: - return u'&%s;' % x - else: - return u'&%s;' % x - - def __init__(self, parser, name, attrs=None, parent=None, - previous=None): - "Basic constructor." - - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected - self.parserClass = parser.__class__ - self.isSelfClosing = parser.isSelfClosingTag(name) - self.name = name - if attrs is None: - attrs = [] - elif isinstance(attrs, dict): - attrs = attrs.items() - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - self.containsSubstitutions = False - self.convertHTMLEntities = parser.convertHTMLEntities - self.convertXMLEntities = parser.convertXMLEntities - self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities - - # Convert any HTML, XML, or numeric entities in the attribute values. 
- convert = lambda(k, val): (k, - re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", - self._convertEntities, - val)) - self.attrs = map(convert, self.attrs) - - def getString(self): - if (len(self.contents) == 1 - and isinstance(self.contents[0], NavigableString)): - return self.contents[0] - - def setString(self, string): - """Replace the contents of the tag with a string""" - self.clear() - self.append(string) - - string = property(getString, setString) - - def getText(self, separator=u""): - if not len(self.contents): - return u"" - stopNode = self._lastRecursiveChild().next - strings = [] - current = self.contents[0] - while current is not stopNode: - if isinstance(current, NavigableString): - strings.append(current.strip()) - current = current.next - return separator.join(strings) - - text = property(getText) - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self._getAttrMap().get(key, default) - - def clear(self): - """Extract all children.""" - for child in self.contents[:]: - child.extract() - - def index(self, element): - for i, child in enumerate(self.contents): - if child is element: - return i - raise ValueError("Tag.index: element not in tag") - - def has_key(self, key): - return self._getAttrMap().has_key(key) - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" - return self._getAttrMap()[key] - - def __iter__(self): - "Iterating over a tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __nonzero__(self): - "A tag is non-None even if it has no contents." - return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self._getAttrMap() - self.attrMap[key] = value - found = False - for i in range(0, len(self.attrs)): - if self.attrs[i][0] == key: - self.attrs[i] = (key, value) - found = True - if not found: - self.attrs.append((key, value)) - self._getAttrMap()[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - for item in self.attrs: - if item[0] == key: - self.attrs.remove(item) - #We don't break because bad HTML can define the same - #attribute multiple times. - self._getAttrMap() - if self.attrMap.has_key(key): - del self.attrMap[key] - - def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - findAll() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return apply(self.findAll, args, kwargs) - - def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.find(tag[:-3]) - elif tag.find('__') != 0: - return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) - - def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - NOTE: right now this will return false if two tags have the - same attributes in a different order. 
Should this be fixed?""" - if other is self: - return True - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): - return False - for i in range(0, len(self.contents)): - if self.contents[i] != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" - return self.__str__(encoding) - - def __unicode__(self): - return self.__str__(None) - - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - + ")") - - def _sub_entity(self, x): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" - - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Returns a string or Unicode representation of this tag and - its contents. To get Unicode, pass None for encoding. - - NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" - - encodedName = self.toEncoding(self.name, encoding) - - attrs = [] - if self.attrs: - for key, val in self.attrs: - fmt = '%s="%s"' - if isinstance(val, basestring): - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: - val = self.substituteEncoding(val, encoding) - - # The attribute value either: - # - # * Contains no embedded double quotes or single quotes. - # No problem: we enclose it in double quotes. - # * Contains embedded single quotes. No problem: - # double quotes work here too. - # * Contains embedded double quotes. No problem: - # we enclose it in single quotes. - # * Embeds both single _and_ double quotes. This - # can't happen naturally, but it can happen if - # you modify an attribute value after parsing - # the document. Now we have a bit of a - # problem. We solve it by enclosing the - # attribute in single quotes, and escaping any - # embedded single quotes to XML entities. - if '"' in val: - fmt = "%s='%s'" - if "'" in val: - # TODO: replace with apos when - # appropriate. - val = val.replace("'", "&squot;") - - # Now we're okay w/r/t quotes. But the attribute - # value might also contain angle brackets, or - # ampersands that aren't part of entities. We need - # to escape those to XML entities too. 
- val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) - - attrs.append(fmt % (self.toEncoding(key, encoding), - self.toEncoding(val, encoding))) - close = '' - closeTag = '' - if self.isSelfClosing: - close = ' /' - else: - closeTag = '' % encodedName - - indentTag, indentContents = 0, 0 - if prettyPrint: - indentTag = indentLevel - space = (' ' * (indentTag-1)) - indentContents = indentTag + 1 - contents = self.renderContents(encoding, prettyPrint, indentContents) - if self.hidden: - s = contents - else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if prettyPrint: - s.append(space) - s.append('<%s%s%s>' % (encodedName, attributeString, close)) - if prettyPrint: - s.append("\n") - s.append(contents) - if prettyPrint and contents and contents[-1] != "\n": - s.append("\n") - if prettyPrint and closeTag: - s.append(space) - s.append(closeTag) - if prettyPrint and closeTag and self.nextSibling: - s.append("\n") - s = ''.join(s) - return s - - def decompose(self): - """Recursively destroys the contents of this tree.""" - self.extract() - if len(self.contents) == 0: - return - current = self.contents[0] - while current is not None: - next = current.next - if isinstance(current, Tag): - del current.contents[:] - current.parent = None - current.previous = None - current.previousSibling = None - current.next = None - current.nextSibling = None - current = next - - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.__str__(encoding, True) - - def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Renders the contents of this tag as a string in the given - encoding. If encoding is None, returns a Unicode string..""" - s=[] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.__str__(encoding) - elif isinstance(c, Tag): - s.append(c.__str__(encoding, prettyPrint, indentLevel)) - if text and prettyPrint: - text = text.strip() - if text: - if prettyPrint: - s.append(" " * (indentLevel-1)) - s.append(text) - if prettyPrint: - s.append("\n") - return ''.join(s) - - #Soup methods - - def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r - findChild = find - - def findAll(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. 
The - same is true of the tag name.""" - generator = self.recursiveChildGenerator - if not recursive: - generator = self.childGenerator - return self._findAll(name, attrs, text, limit, generator, **kwargs) - findChildren = findAll - - # Pre-3.x compatibility methods - first = find - fetch = findAll - - def fetchText(self, text=None, recursive=True, limit=None): - return self.findAll(text=text, recursive=recursive, limit=limit) - - def firstText(self, text=None, recursive=True): - return self.find(text=text, recursive=recursive) - - #Private methods - - def _getAttrMap(self): - """Initializes a map representation of this tag's attributes, - if not already initialized.""" - if not getattr(self, 'attrMap'): - self.attrMap = {} - for (key, value) in self.attrs: - self.attrMap[key] = value - return self.attrMap - - #Generator methods - def childGenerator(self): - # Just use the iterator from the contents - return iter(self.contents) - - def recursiveChildGenerator(self): - if not len(self.contents): - raise StopIteration - stopNode = self._lastRecursiveChild().next - current = self.contents[0] - while current is not stopNode: - yield current - current = current.next - - -# Next, a couple classes to represent queries and their results. -class SoupStrainer: - """Encapsulates a number of ways of matching a markup element (tag or - text).""" - - def __init__(self, name=None, attrs={}, text=None, **kwargs): - self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = _match_css_class(attrs) - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) - else: - attrs = kwargs - self.attrs = attrs - self.text = text - - def __str__(self): - if self.text: - return self.text - else: - return "%s|%s" % (self.name, self.attrs) - - def searchTag(self, markupName=None, markupAttrs={}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) - - if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) - else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs - else: - markupAttrMap = {} - for k,v in markupAttrs: - markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False - break - if match: - if markup: - found = markup - else: - found = markupName - return found - - def search(self, markup): - #print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. - if hasattr(markup, "__iter__") \ - and not isinstance(markup, Tag): - for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): - found = element - break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. - elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. 
- elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): - if self._matches(markup, self.text): - found = markup - else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ - return found - - def _matches(self, markup, matchAgainst): - #print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst is True: - result = markup is not None - elif callable(matchAgainst): - result = matchAgainst(markup) - else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup and not isinstance(markup, basestring): - markup = unicode(markup) - #Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. - result = markup and matchAgainst.search(markup) - elif hasattr(matchAgainst, '__iter__'): # list-like - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) - else: - matchAgainst = str(matchAgainst) - - if not result: - result = matchAgainst == markup - return result - -class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" - def __init__(self, source): - list.__init__([]) - self.source = source - -# Now, some helper functions. - -def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and - NESTING_RESET_TAGS maps out of lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - #It's a map. Merge it. - for k,v in portion.items(): - built[k] = v - elif hasattr(portion, '__iter__'): # is a list - #It's a list. Map each item to the default. - for k in portion: - built[k] = default - else: - #It's a scalar. Map it to the default. - built[portion] = default - return built - -# Now, the parser classes. - -class BeautifulStoneSoup(Tag, SGMLParser): - - """This class contains the basic parser and search code. It defines - a parser that knows nothing about tag behavior except for the - following: - - You can't close a tag without closing all the tags it encloses. - That is, "" actually means - "". - - [Another possible explanation is "", but since - this class defines no SELF_CLOSING_TAGS, it will never use that - explanation.] - - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting.""" - - SELF_CLOSING_TAGS = {} - NESTABLE_TAGS = {} - RESET_NESTING_TAGS = {} - QUOTE_TAGS = {} - PRESERVE_WHITESPACE_TAGS = [] - - MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), - lambda x: x.group(1) + ' />'), - (re.compile(']*)>'), - lambda x: '') - ] - - ROOT_TAG_NAME = u'[document]' - - HTML_ENTITIES = "html" - XML_ENTITIES = "xml" - XHTML_ENTITIES = "xhtml" - # TODO: This only exists for backwards-compatibility - ALL_ENTITIES = XHTML_ENTITIES - - # Used when determining whether a text node is all whitespace and - # can be replaced with a single space. A text node that contains - # fancy Unicode spaces (usually non-breaking) should be left - # alone. 
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } - - def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, - markupMassage=True, smartQuotesTo=XML_ENTITIES, - convertEntities=None, selfClosingTags=None, isHTML=False): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser. - - sgmllib will process most bad HTML, and the BeautifulSoup - class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data - if your data uses self-closing tags or declarations - incorrectly. - - By default, Beautiful Soup uses regexes to sanitize input, - avoiding the vast majority of these problems. If the problems - don't apply to you, pass in False for markupMassage, and - you'll get better performance. - - The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: - -
(No space between name of closing tag and tag close) - (Extraneous whitespace in declaration) - - You can pass in a custom list of (RE object, replace method) - tuples to get Beautiful Soup to scrub your input the way you - want.""" - - self.parseOnlyThese = parseOnlyThese - self.fromEncoding = fromEncoding - self.smartQuotesTo = smartQuotesTo - self.convertEntities = convertEntities - # Set the rules for how we'll deal with the entities we - # encounter - if self.convertEntities: - # It doesn't make sense to convert encoded characters to - # entities even while you're converting entities to Unicode. - # Just convert it all to Unicode. - self.smartQuotesTo = None - if convertEntities == self.HTML_ENTITIES: - self.convertXMLEntities = False - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = True - elif convertEntities == self.XHTML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = False - elif convertEntities == self.XML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - else: - self.convertXMLEntities = False - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - - self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) - SGMLParser.__init__(self) - - if hasattr(markup, 'read'): # It's a file-type object. - markup = markup.read() - self.markup = markup - self.markupMassage = markupMassage - try: - self._feed(isHTML=isHTML) - except StopParsing: - pass - self.markup = None # The markup can now be GCed - - def convert_charref(self, name): - """This method fixes a bug in Python's SGMLParser.""" - try: - n = int(name) - except ValueError: - return - if not 0 <= n <= 127 : # ASCII ends at 127, not 255 - return - return self.convert_codepoint(n) - - def _feed(self, inDocumentEncoding=None, isHTML=False): - # Convert the document to Unicode. - markup = self.markup - if isinstance(markup, unicode): - if not hasattr(self, 'originalEncoding'): - self.originalEncoding = None - else: - dammit = UnicodeDammit\ - (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) - markup = dammit.unicode - self.originalEncoding = dammit.originalEncoding - self.declaredHTMLEncoding = dammit.declaredHTMLEncoding - if markup: - if self.markupMassage: - if not hasattr(self.markupMassage, "__iter__"): - self.markupMassage = self.MARKUP_MASSAGE - for fix, m in self.markupMassage: - markup = fix.sub(m, markup) - # TODO: We get rid of markupMassage so that the - # soup object can be deepcopied later on. Some - # Python installations can't copy regexes. If anyone - # was relying on the existence of markupMassage, this - # might cause problems. - del(self.markupMassage) - self.reset() - - SGMLParser.feed(self, markup) - # Close out any unfinished strings and close all the open tags. 
- self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - #print "__getattr__ called on %s.%s" % (self.__class__, methodName) - - if methodName.startswith('start_') or methodName.startswith('end_') \ - or methodName.startswith('do_'): - return SGMLParser.__getattr__(self, methodName) - elif not methodName.startswith('__'): - return Tag.__getattr__(self, methodName) - else: - raise AttributeError - - def isSelfClosingTag(self, name): - """Returns true iff the given string is the name of a - self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) - - def reset(self): - Tag.__init__(self, self, self.ROOT_TAG_NAME) - self.hidden = 1 - SGMLParser.reset(self) - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.quoteStack = [] - self.pushTag(self) - - def popTag(self): - tag = self.tagStack.pop() - - #print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - #print "Push", tag.name - if self.currentTag: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - - def endData(self, containerClass=NavigableString): - if self.currentData: - currentData = u''.join(self.currentData) - if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and - not set([tag.name for tag in self.tagStack]).intersection( - self.PRESERVE_WHITESPACE_TAGS)): - if '\n' in currentData: - currentData = '\n' - else: - currentData = ' ' - self.currentData = [] - if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): - return - o = containerClass(currentData) - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - - - def _popToTag(self, name, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - #print "Popping to %s" % name - if name == self.ROOT_TAG_NAME: - return - - numPops = 0 - mostRecentTag = None - for i in range(len(self.tagStack)-1, 0, -1): - if name == self.tagStack[i].name: - numPops = len(self.tagStack)-i - break - if not inclusivePop: - numPops = numPops - 1 - - for i in range(0, numPops): - mostRecentTag = self.popTag() - return mostRecentTag - - def _smartPop(self, name): - - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: -

<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
-         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
-         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
-
-         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
-         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
-         <td><tr><td> *<td>
    ** should pop to 'tr', not the first 'td' - """ - - nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) - popTo = None - inclusive = True - for i in range(len(self.tagStack)-1, 0, -1): - p = self.tagStack[i] - if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. - popTo = name - break - if (nestingResetTriggers is not None - and p.name in nestingResetTriggers) \ - or (nestingResetTriggers is None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): - - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. - popTo = p.name - inclusive = False - break - p = p.parent - if popTo: - self._popToTag(popTo, inclusive) - - def unknown_starttag(self, name, attrs, selfClosing=0): - #print "Start tag %s: %s" % (name, attrs) - if self.quoteStack: - #This is not a real tag. - #print "<%s> is not real!" % name - attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) - self.handle_data('<%s%s>' % (name, attrs)) - return - self.endData() - - if not self.isSelfClosingTag(name) and not selfClosing: - self._smartPop(name) - - if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): - return - - tag = Tag(self, name, attrs, self.currentTag, self.previous) - if self.previous: - self.previous.next = tag - self.previous = tag - self.pushTag(tag) - if selfClosing or self.isSelfClosingTag(name): - self.popTag() - if name in self.QUOTE_TAGS: - #print "Beginning quote (%s)" % name - self.quoteStack.append(name) - self.literal = 1 - return tag - - def unknown_endtag(self, name): - #print "End tag %s" % name - if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print " is not real!" % name - self.handle_data('' % name) - return - self.endData() - self._popToTag(name) - if self.quoteStack and self.quoteStack[-1] == name: - self.quoteStack.pop() - self.literal = (len(self.quoteStack) > 0) - - def handle_data(self, data): - self.currentData.append(data) - - def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.endData() - self.handle_data(text) - self.endData(subclass) - - def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" - if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" - self._toStringSubclass(text, ProcessingInstruction) - - def handle_comment(self, text): - "Handle comments as Comment objects." - self._toStringSubclass(text, Comment) - - def handle_charref(self, ref): - "Handle character references as data." 
- if self.convertEntities: - data = unichr(int(ref)) - else: - data = '&#%s;' % ref - self.handle_data(data) - - def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML and/or XML entity references to the corresponding Unicode - characters.""" - data = None - if self.convertHTMLEntities: - try: - data = unichr(name2codepoint[ref]) - except KeyError: - pass - - if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) - - if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref - if not data: - # This case is different from the one above, because we - # haven't already gone through a supposedly comprehensive - # mapping of entities to Unicode characters. We might not - # have gone through any mapping at all. So the chances are - # very high that this is a real entity, and not a - # misplaced ampersand. - data = "&%s;" % ref - self.handle_data(data) - - def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." - self._toStringSubclass(data, Declaration) - - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" - j = None - if self.rawdata[i:i+9] == '', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) - else: - try: - j = SGMLParser.parse_declaration(self, i) - except SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - -class BeautifulSoup(BeautifulStoneSoup): - - """This parser knows the following facts about HTML: - - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. - - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. - - * Tag nesting rules: - - Most tags can't be nested at all. For instance, the occurance of - a

<p> tag should implicitly close the previous <p> tag.
-
-       <p>Para1<p>Para2
-      should be transformed into:
-       <p>Para1</p><p>Para2
-
-      Some tags can be nested arbitrarily. For instance, the occurance
-      of a <blockquote> tag should _not_ implicitly close the previous
-      <blockquote> tag.
-
-       Alice said: <blockquote>Bob said: <blockquote>Blah
-      should NOT be transformed into:
-       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
-
-      Some tags can be nested, but the nesting is reset by the
-      interposition of other tags. For instance, a <tr> tag should
-      implicitly close the previous <tr> tag within the same <table>,
-      but not close a <tr> tag in another table.
-
-       <table><tr>Blah<tr>Blah
-      should be transformed into:
-       <table><tr>Blah</tr><tr>Blah
-      but,
-       <tr>Blah<table><tr>Blah
-      should NOT be transformed into
-       <tr>Blah<table><tr></tr><tr>
    Blah - - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try ICantBelieveItsBeautifulSoup, MinimalSoup, or - BeautifulStoneSoup before writing your own subclass.""" - - def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): - kwargs['smartQuotesTo'] = self.HTML_ENTITIES - kwargs['isHTML'] = True - BeautifulStoneSoup.__init__(self, *args, **kwargs) - - SELF_CLOSING_TAGS = buildTagMap(None, - ('br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base', 'col')) - - PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) - - QUOTE_TAGS = {'script' : None, 'textarea' : None} - - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', - 'center') - - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') - - #Lists can contain other lists, but there are restrictions. - NESTABLE_LIST_TAGS = { 'ol' : [], - 'ul' : [], - 'li' : ['ul', 'ol'], - 'dl' : [], - 'dd' : ['dl'], - 'dt' : ['dl'] } - - #Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table' : [], - 'tr' : ['table', 'tbody', 'tfoot', 'thead'], - 'td' : ['tr'], - 'th' : ['tr'], - 'thead' : ['table'], - 'tbody' : ['table'], - 'tfoot' : ['table'], - } - - NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') - - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. - RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', - NON_NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, - NESTABLE_TABLE_TAGS) - - NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, - NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) - - # Used to detect the charset in a META tag; see start_meta - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - - def start_meta(self, attrs): - """Beautiful Soup can detect a charset included in a META tag, - try to convert the document to that charset, and re-parse the - document from the beginning.""" - httpEquiv = None - contentType = None - contentTypeIndex = None - tagNeedsEncodingSubstitution = False - - for i in range(0, len(attrs)): - key, value = attrs[i] - key = key.lower() - if key == 'http-equiv': - httpEquiv = value - elif key == 'content': - contentType = value - contentTypeIndex = i - - if httpEquiv and contentType: # It's an interesting meta tag. - match = self.CHARSET_RE.search(contentType) - if match: - if (self.declaredHTMLEncoding is not None or - self.originalEncoding == self.fromEncoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. - def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - newAttr = self.CHARSET_RE.sub(rewrite, contentType) - attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], - newAttr) - tagNeedsEncodingSubstitution = True - else: - # This is our first pass through the document. 
- # Go through it again with the encoding information. - newCharset = match.group(3) - if newCharset and newCharset != self.originalEncoding: - self.declaredHTMLEncoding = newCharset - self._feed(self.declaredHTMLEncoding) - raise StopParsing - pass - tag = self.unknown_starttag("meta", attrs) - if tag and tagNeedsEncodingSubstitution: - tag.containsSubstitutions = True - -class StopParsing(Exception): - pass - -class ICantBelieveItsBeautifulSoup(BeautifulSoup): - - """The BeautifulSoup class is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: - - FooBar - - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "FooBar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" - - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big') - - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) - - NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, - I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) - -class MinimalSoup(BeautifulSoup): - """The MinimalSoup class is for parsing HTML that contains - pathologically bad markup. 
It makes no assumptions about tag - nesting, but it does know which tags are self-closing, that - ", html) - if match: - init = unicode(match.group(1), errors="ignore") - tokens = init.split('],[') - for token in tokens: - res = ImageResult() - res.page = i - res.index = j - toks = token.split(",") - - # should be 32 or 33, but seems to change, so just make sure no exceptions - # will be thrown by the indexing - if (len(toks) > 22): - for t in range(len(toks)): - toks[t] = toks[t].replace('\\x3cb\\x3e','').replace('\\x3c/b\\x3e','').replace('\\x3d','=').replace('\\x26','&') - match = re.search("imgurl=(?P[^&]+)&imgrefurl", toks[0]) - if match: - res.link = match.group("link") - res.name = toks[6].replace('"', '') - res.thumb = toks[21].replace('"', '') - res.format = toks[10].replace('"', '') - res.domain = toks[11].replace('"', '') - match = re.search("(?P[0-9]+) × (?P[0-9]+) - (?P[^ ]+)", toks[9].replace('"', '')) - if match: - res.width = match.group("width") - res.height = match.group("height") - res.filesize = match.group("size") - results.append(res) - j = j + 1 - return results - - @staticmethod - def search_images(query, image_options = None, pages = 1): - results = [] - for i in range(pages): - url = get_image_search_url(query, image_options, i) - html = get_html(url) - if html: - if Google.DEBUG_MODE: - write_html_to_file(html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i)) - soup = BeautifulSoup(html) - j = 0 - tds = soup.findAll("td") - for td in tds: - a = td.find("a") - if a and a["href"].find("imgurl") != -1: - res = ImageResult() - res.page = i - res.index = j - tokens = a["href"].split("&") - match = re.search("imgurl=(?P[^&]+)", tokens[0]) - if match: - res.link = match.group("link") - res.format = res.link[res.link.rfind(".") + 1:] - img = td.find("img") - if img: - res.thumb = img["src"] - res.thumb_width = img["width"] - res.thumb_height = img["height"] - match = re.search("(?P[0-9]+) × (?P[0-9]+) - (?P[^&]+)", td.text) - if match: - res.width = match.group("width") - res.name = td.text[:td.text.find(res.width)] - res.height = match.group("height") - res.filesize = match.group("size") - cite = td.find("cite") - if cite: - res.domain = cite["title"] - results.append(res) - j = j + 1 - return results - - @staticmethod - def shopping(query, pages=1): - results = [] - for i in range(pages): - url = get_shopping_url(query, i) - html = get_html(url) - if html: - if Google.DEBUG_MODE: - write_html_to_file(html, "shopping_{0}_{1}.html".format(query.replace(" ", "_"), i)) - j = 0 - soup = BeautifulSoup(html) - - products = soup.findAll("li", "g") - for prod in products: - res = ShoppingResult() - - divs = prod.findAll("div") - for div in divs: - match = re.search("from (?P[0-9]+) stores", div.text.strip()) - if match: - res.store_count = match.group("count") - break - - h3 = prod.find("h3", "r") - if h3: - a = h3.find("a") - if a: - res.compare_url = a["href"] - res.name = h3.text.strip() - - psliimg = prod.find("div", "psliimg") - if psliimg: - img = psliimg.find("img") - if img: - res.thumb = img["src"] - - f = prod.find("div", "f") - if f: - res.subtext = f.text.strip() - - price = prod.find("div", "psliprice") - if price: - res.min_price = price.text.strip() - - results.append(res) - j = j + 1 - return results - - """ - Converts one currency to another. 
- [amount] from_curreny = [return_value] to_currency - """ - @staticmethod - def convert_currency(amount, from_currency, to_currency): - if from_currency == to_currency: - return 1.0 - conn = httplib.HTTPSConnection("www.google.com") - req_url = "/ig/calculator?hl=en&q={0}{1}=?{2}".format(amount, from_currency.replace(" ", "%20"), to_currency.replace(" ", "%20")) - headers = { "User-Agent": "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" } - conn.request("GET", req_url, "", headers) - response = conn.getresponse() - rval = response.read().decode("utf-8").replace(u"\xa0", "") - conn.close() - rhs = rval.split(",")[1].strip() - s = rhs[rhs.find('"')+1:] - rate = s[:s.find(" ")] - return float(rate) - - """ - Gets the exchange rate of one currency to another. - 1 from_curreny = [return_value] to_currency - """ - @staticmethod - def exchange_rate(from_currency, to_currency): - return Google.convert_currency(1, from_currency, to_currency) - - """ - Attempts to use google calculator to calculate the result of expr - """ - @staticmethod - def calculate(expr): - conn = httplib.HTTPSConnection("www.google.com") - req_url = "/ig/calculator?hl=en&q={0}".format(expr.replace(" ", "%20")) - headers = { "User-Agent": "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" } - conn.request("GET", req_url, "", headers) - response = conn.getresponse() - j = response.read().decode("utf-8").replace(u"\xa0", "") - conn.close() - j = re.sub(r"{\s*'?(\w)", r'{"\1', j) - j = re.sub(r",\s*'?(\w)", r',"\1', j) - j = re.sub(r"(\w)'?\s*:", r'\1":', j) - j = re.sub(r":\s*'(\w)'\s*([,}])", r':"\1"\2', j) - js = json.loads(j) - return parse_calc_result(js["lhs"] + " = " + js["rhs"]) - -def normalize_query(query): - return query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+") - -def get_search_url(query, page = 0, per_page = 10): - # note: num per page might not be supported by google anymore (because of google instant) - return "http://www.google.com/search?hl=en&q=%s&start=%i&num=%i" % (normalize_query(query), page * per_page, per_page) - -def get_shopping_url(query, page=0, per_page=10): - return "http://www.google.com/search?hl=en&q={0}&tbm=shop&start={1}&num={2}".format(normalize_query(query), page * per_page, per_page) - -class ImageType: - NONE = None - FACE = "face" - PHOTO = "photo" - CLIPART = "clipart" - LINE_DRAWING = "lineart" - -class SizeCategory: - NONE = None - ICON = "i" - LARGE = "l" - MEDIUM = "m" - SMALL = "s" - LARGER_THAN = "lt" - EXACTLY = "ex" - -class LargerThan: - NONE = None - QSVGA = "qsvga" # 400 x 300 - VGA = "vga" # 640 x 480 - SVGA = "svga" # 800 x 600 - XGA = "xga" # 1024 x 768 - MP_2 = "2mp" # 2 MP (1600 x 1200) - MP_4 = "4mp" # 4 MP (2272 x 1704) - MP_6 = "6mp" # 6 MP (2816 x 2112) - MP_8 = "8mp" # 8 MP (3264 x 2448) - MP_10 = "10mp" # 10 MP (3648 x 2736) - MP_12 = "12mp" # 12 MP (4096 x 3072) - MP_15 = "15mp" # 15 MP (4480 x 3360) - MP_20 = "20mp" # 20 MP (5120 x 3840) - MP_40 = "40mp" # 40 MP (7216 x 5412) - MP_70 = "70mp" # 70 MP (9600 x 7200) - -class ColorType: - NONE = None - COLOR = "color" - BLACK_WHITE = "gray" - SPECIFIC = "specific" - -def get_image_search_url(query, image_options=None, page=0, per_page=20): - query = query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+") - url = "http://images.google.com/images?q=%s&sa=N&start=%i&ndsp=%i&sout=1" % (query, page * per_page, per_page) - if image_options: - tbs = image_options.get_tbs() - if tbs: - url = url + tbs - return url 
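For reference, the tbs fragment that these image-search helpers append to the URL composes one name:value filter at a time: the first filter opens "&tbs=", and later filters are comma-appended. The same logic survives in the new googleapi/modules/images.py as ImageOptions._add_to_tbs. A minimal runnable sketch of the composition:

    def add_to_tbs(tbs, name, value):
        # the first filter opens the parameter; later ones are comma-appended
        if tbs:
            return "%s,%s:%s" % (tbs, name, value)
        return "&tbs=%s:%s" % (name, value)

    tbs = None
    tbs = add_to_tbs(tbs, "itp", "clipart")   # image type
    tbs = add_to_tbs(tbs, "isz", "lt")        # size category: larger-than
    tbs = add_to_tbs(tbs, "islt", "4mp")      # larger than 4 MP
    print(tbs)  # -> &tbs=itp:clipart,isz:lt,islt:4mp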
- -def add_to_tbs(tbs, name, value): - if tbs: - return "%s,%s:%s" % (tbs, name, value) - else: - return "&tbs=%s:%s" % (name, value) - -def parse_calc_result(string): - result = CalculatorResult() - result.fullstring = string - string = string.strip().replace(u"\xa0", " ") - if string.find("=") != -1: - result.expr = string[:string.rfind("=")].strip() - string = string[string.rfind("=") + 2:] - result.result = string - tokens = string.split(" ") - if len(tokens) > 0: - result.value = "" - for token in tokens: - if is_number(token): - result.value = result.value + token - else: - if result.unit: - result.unit = result.unit + " " + token - else: - result.unit = token - return result - return None - -def is_number(s): - try: - float(s) - return True - except ValueError: - return False - -def get_html(url): - try: - request = urllib2.Request(url) - request.add_header("User-Agent", "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101") - html = urllib2.urlopen(request).read() - return html - except: - print "Error accessing:", url - return None - -def write_html_to_file(html, filename): - of = open(filename, "w") - of.write(html) - of.flush() - of.close() - -def test(): - search = Google.search("github") - if search is None or len(search) == 0: - print "ERROR: No Search Results!" - else: - print "PASSED: {0} Search Results".format(len(search)) - - shop = Google.shopping("Disgaea 4") - if shop is None or len(shop) == 0: - print "ERROR: No Shopping Results!" - else: - print "PASSED: {0} Shopping Results".format(len(shop)) - - options = ImageOptions() - options.image_type = ImageType.CLIPART - options.larger_than = LargerThan.MP_4 - options.color = "green" - images = Google.search_images("banana", options) - if images is None or len(images) == 0: - print "ERROR: No Image Results!" - else: - print "PASSED: {0} Image Results".format(len(images)) - - calc = Google.calculate("157.3kg in grams") - if calc is not None and int(calc.value) == 157300: - print "PASSED: Calculator passed" - else: - print "ERROR: Calculator failed!" - - euros = Google.convert_currency(5.0, "USD", "EUR") - if euros is not None and euros > 0.0: - print "PASSED: Currency convert passed" - else: - print "ERROR: Currency convert failed!" - -def main(): - if len(sys.argv) > 1 and sys.argv[1] == "--debug": - Google.DEBUG_MODE = True - print "DEBUG_MODE ENABLED" - test() - -if __name__ == "__main__": - main() - \ No newline at end of file diff --git a/googleapi/__init__.py b/googleapi/__init__.py new file mode 100644 index 0000000..3597c21 --- /dev/null +++ b/googleapi/__init__.py @@ -0,0 +1,3 @@ +from __future__ import absolute_import +from .modules import calculator, currency, images, utils +from .modules import standard_search, shopping_search diff --git a/googleapi/google.py b/googleapi/google.py new file mode 100644 index 0000000..cd14f37 --- /dev/null +++ b/googleapi/google.py @@ -0,0 +1,29 @@ +from __future__ import unicode_literals +from __future__ import absolute_import + +from .modules import images +from .modules import currency +from .modules import calculator +from .modules import standard_search +# from modules import shopping_search + +__author__ = "Anthony Casagrande , " + \ + "Agustin Benassi " +__version__ = "1.1.0" + + +"""Defines the public inteface of the API.""" + +search = standard_search.search +search_images = images.search +convert_currency = currency.convert +exchange_rate = currency.exchange_rate +calculate = calculator.calculate + +# TODO: This method is not working anymore! 
There is a new GET +# link for this kind of search +# shopping = shopping_search.shopping + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/googleapi/modules/__init__.py b/googleapi/modules/__init__.py new file mode 100644 index 0000000..0440d06 --- /dev/null +++ b/googleapi/modules/__init__.py @@ -0,0 +1,6 @@ +from __future__ import print_function +from . import calculator +from . import currency +from . import images +from . import shopping_search +from . import standard_search diff --git a/googleapi/modules/calculator.py b/googleapi/modules/calculator.py new file mode 100644 index 0000000..2a10f1d --- /dev/null +++ b/googleapi/modules/calculator.py @@ -0,0 +1,91 @@ +from __future__ import unicode_literals +from __future__ import absolute_import +from builtins import object +from unidecode import unidecode + +from .utils import get_html_from_dynamic_site +from .utils import _get_search_url +from bs4 import BeautifulSoup + + +class CalculatorResult(object): + + """Represents a result returned from google calculator.""" + + def __init__(self): + self.value = None # Result value (eg. 157300.0) + self.from_value = None # Initial value (eg. 157.3) + self.unit = None # Result unit (eg. u'grams') (NOT implemented yet) + # Initial unit (eg. u'kilograms') (NOT implemented yet) + self.from_unit = None + # Initial expression (eg. u'157.3 grams') (NOT implemented yet) + self.expr = None + # Result expression (eg. u'157300 kilograms') (NOT implemented yet) + self.result = None + # Complete expression (eg. u'157.3 kilograms = 157300 grams') (NOT + # implemented yet) + self.fullstring = None + + def __repr__(self): + return unidecode(self.value) + + +# PUBLIC +def calculate(expr): + """Search for a calculation expression in google. + + Attempts to search google calculator for the result of an expression. + Returns a `CalculatorResult` if successful or `None` if it fails. + + Args: + expr: Calculation expression (eg. "cos(25 pi) / 17.4" or + "157.3kg in grams") + + Returns: + CalculatorResult object.""" + + url = _get_search_url(expr) + html = get_html_from_dynamic_site(url) + bs = BeautifulSoup(html) + + cr = CalculatorResult() + cr.value = _get_to_value(bs) + cr.from_value = _get_from_value(bs) + cr.unit = _get_to_unit(bs) + cr.from_unit = _get_from_unit(bs) + cr.expr = _get_expr(bs) + cr.result = _get_result(bs) + cr.fullstring = _get_fullstring(bs) + + return cr + + +# PRIVATE +def _get_to_value(bs): + input_node = bs.find("div", {"id": "_Cif"}) + return float(input_node.find("input")["value"]) + + +def _get_from_value(bs): + input_node = bs.find("div", {"id": "_Aif"}) + return float(input_node.find("input")["value"]) + + +def _get_to_unit(bs): + return None + + +def _get_from_unit(bs): + return None + + +def _get_expr(bs): + return None + + +def _get_result(bs): + return None + + +def _get_fullstring(bs): + return None diff --git a/googleapi/modules/currency.py b/googleapi/modules/currency.py new file mode 100644 index 0000000..3c12bd0 --- /dev/null +++ b/googleapi/modules/currency.py @@ -0,0 +1,54 @@ +from __future__ import unicode_literals +from __future__ import absolute_import + +from .utils import get_html +from bs4 import BeautifulSoup + + +# PUBLIC +def convert(amount, from_currency, to_currency): + """Method to convert currency. 
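+    Scrapes the Google Finance converter page and returns the converted amount as a float.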
+ + Args: + amount: numeric amount to convert + from_currency: currency denomination of the amount to convert + to_currency: target currency denomination to convert to + """ + + # same currency, no conversion + if from_currency == to_currency: + return amount * 1.0 + + req_url = _get_currency_req_url(amount, + from_currency, to_currency) + response = get_html(req_url) + rate = _parse_currency_response(response, to_currency) + + return rate + + +def exchange_rate(from_currency, to_currency): + """Gets the exchange rate of one currency to another. + + Args: + from_currency: starting currency denomination (1) + to_currency: target currency denomination to convert to (rate) + + Returns: + rate / 1 to convert from_currency in to_currency + """ + return convert(1, from_currency, to_currency) + + +# PRIVATE +def _get_currency_req_url(amount, from_currency, to_currency): + return "https://www.google.com/finance/converter?a={0}&from={1}&to={2}".format( + amount, from_currency.replace(" ", "%20"), + to_currency.replace(" ", "%20")) + + +def _parse_currency_response(response, to_currency): + bs = BeautifulSoup(response) + str_rate = bs.find(id="currency_converter_result").span.get_text() + rate = float(str_rate.replace(to_currency, "").strip()) + return rate diff --git a/googleapi/modules/images.py b/googleapi/modules/images.py new file mode 100644 index 0000000..6490a93 --- /dev/null +++ b/googleapi/modules/images.py @@ -0,0 +1,565 @@ +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import absolute_import +from future import standard_library +standard_library.install_aliases() +from builtins import str +from builtins import range +from builtins import object +from unidecode import unidecode + +from .utils import get_browser_with_url, write_html_to_file, measure_time +from bs4 import BeautifulSoup +import urllib.parse +import sys +import requests +import shutil +import os +import threading +import queue + + +IMAGE_FORMATS = ["bmp", "gif", "jpg", "png", "psd", "pspimage", "thm", + "tif", "yuv", "ai", "drw", "eps", "ps", "svg", "tiff", + "jpeg", "jif", "jfif", "jp2", "jpx", "j2k", "j2c", "fpx", + "pcd", "png", "pdf"] + + +# AUXILIARY CLASSES +class ImageType(object): + NONE = None + FACE = "face" + PHOTO = "photo" + CLIPART = "clipart" + LINE_DRAWING = "lineart" + + +class SizeCategory(object): + NONE = None + ICON = "i" + LARGE = "l" + MEDIUM = "m" + SMALL = "s" + LARGER_THAN = "lt" + EXACTLY = "ex" + + +class LargerThan(object): + NONE = None + QSVGA = "qsvga" # 400 x 300 + VGA = "vga" # 640 x 480 + SVGA = "svga" # 800 x 600 + XGA = "xga" # 1024 x 768 + MP_2 = "2mp" # 2 MP (1600 x 1200) + MP_4 = "4mp" # 4 MP (2272 x 1704) + MP_6 = "6mp" # 6 MP (2816 x 2112) + MP_8 = "8mp" # 8 MP (3264 x 2448) + MP_10 = "10mp" # 10 MP (3648 x 2736) + MP_12 = "12mp" # 12 MP (4096 x 3072) + MP_15 = "15mp" # 15 MP (4480 x 3360) + MP_20 = "20mp" # 20 MP (5120 x 3840) + MP_40 = "40mp" # 40 MP (7216 x 5412) + MP_70 = "70mp" # 70 MP (9600 x 7200) + + +class ColorType(object): + NONE = None + COLOR = "color" + BLACK_WHITE = "gray" + SPECIFIC = "specific" + + +class License(object): + NONE = None + REUSE = "fc" + REUSE_WITH_MOD = "fmc" + REUSE_NON_COMMERCIAL = "f" + REUSE_WITH_MOD_NON_COMMERCIAL = "fm" + + +class ImageOptions(object): + + """Allows passing options to filter a google images search.""" + + def __init__(self): + self.image_type = None + self.size_category = None + self.larger_than = None + self.exact_width = None + self.exact_height = None + self.color_type = None + 
self.color = None + self.license = None + + def __repr__(self): + return unidecode(self.__dict__) + + def get_tbs(self): + tbs = None + if self.image_type: + # clipart + tbs = self._add_to_tbs(tbs, "itp", self.image_type) + if self.size_category and not (self.larger_than or (self.exact_width and self.exact_height)): + # i = icon, l = large, m = medium, lt = larger than, ex = exact + tbs = self._add_to_tbs(tbs, "isz", self.size_category) + if self.larger_than: + # qsvga,4mp + tbs = self._add_to_tbs(tbs, "isz", SizeCategory.LARGER_THAN) + tbs = self._add_to_tbs(tbs, "islt", self.larger_than) + if self.exact_width and self.exact_height: + tbs = self._add_to_tbs(tbs, "isz", SizeCategory.EXACTLY) + tbs = self._add_to_tbs(tbs, "iszw", self.exact_width) + tbs = self._add_to_tbs(tbs, "iszh", self.exact_height) + if self.color_type and not self.color: + # color = color, gray = black and white, specific = user defined + tbs = self._add_to_tbs(tbs, "ic", self.color_type) + if self.color: + tbs = self._add_to_tbs(tbs, "ic", ColorType.SPECIFIC) + tbs = self._add_to_tbs(tbs, "isc", self.color) + if self.license: + tbs = self._add_to_tbs(tbs, "sur", self.license) + return tbs + + def _add_to_tbs(self, tbs, name, value): + if tbs: + return "%s,%s:%s" % (tbs, name, value) + else: + return "&tbs=%s:%s" % (name, value) + + +class ImageResult(object): + + """Represents a google image search result.""" + + ROOT_FILENAME = "img" + DEFAULT_FORMAT = "jpg" + + def __init__(self): + self.name = None + self.file_name = None + self.link = None + self.thumb = None + self.thumb_width = None + self.thumb_height = None + self.width = None + self.height = None + self.filesize = None + self.format = None + self.domain = None + self.page = None + self.index = None + self.site = None + + def __eq__(self, other): + return self.link == other.link + + def __hash__(self): + return id(self.link) + + def __repr__(self): + string = "ImageResult(index={i}, page={p}, domain={d}, link={l})".format( + i=str(self.index), + p=str(self.page), + d=unidecode(self.domain) if self.domain else None, + l=unidecode(self.link) if self.link else None + ) + return string + + def download(self, path="images"): + """Download an image to a given path.""" + + self._create_path(path) + # print path + + try: + response = requests.get(self.link, stream=True) + # request a protected image (adding a referer to the request) + # referer = self.domain + # image = self.link + + # req = urllib2.Request(image) + # req.add_header('Referer', referer) # here is the trick + # response = urllib2.urlopen(req) + + if "image" in response.headers['content-type']: + path_filename = self._get_path_filename(path) + with open(path_filename, 'wb') as output_file: + shutil.copyfileobj(response.raw, output_file) + # output_file.write(response.content) + else: + print("\r\rskiped! cached image") + + del response + + except Exception as inst: + print(self.link, "has failed:") + print(inst) + + def _get_path_filename(self, path): + """Build the filename to download. + + Checks that filename is not already in path. Otherwise looks for + another name. 
+ + >>> ir = ImageResult() + >>> ir._get_path_filename("test") + 'test\\\img3.jpg' + >>> ir.name = "pirulo" + >>> ir.format = "jpg" + >>> ir._get_path_filename("test") + 'test\\\pirulo.jpg' + """ + + path_filename = None + + # preserve the original name + if self.file_name: + original_filename = self.file_name + path_filename = os.path.join(path, original_filename) + + # create a default name if there is no original name + if not path_filename or os.path.isfile(path_filename): + + # take the format of the file, or use default + if self.format: + file_format = self.format + else: + file_format = self.DEFAULT_FORMAT + + # create root of file, until reaching a non existent one + i = 1 + default_filename = self.ROOT_FILENAME + str(i) + "." + file_format + path_filename = os.path.join(path, default_filename) + while os.path.isfile(path_filename): + i += 1 + default_filename = self.ROOT_FILENAME + str(i) + "." + \ + file_format + path_filename = os.path.join(path, default_filename) + + return path_filename + + def _create_path(self, path): + """Create a path, if it doesn't exists.""" + + if not os.path.isdir(path): + os.mkdir(path) + + +# PRIVATE +def _parse_image_format(image_link): + """Parse an image format from a download link. + + Args: + image_link: link to download an image. + + >>> link = "http://blogs.elpais.com/.a/6a00d8341bfb1653ef01a73dbb4a78970d-pi" + >>> Google._parse_image_format(link) + + >>> link = "http://minionslovebananas.com/images/gallery/preview/Chiquita-DM2-minion-banana-3.jpg%3Fw%3D300%26h%3D429" + >>> Google._parse_image_format(link) + + """ + parsed_format = image_link[image_link.rfind(".") + 1:] + + # OLD: identify formats even with weird final characters + if parsed_format not in IMAGE_FORMATS: + for image_format in IMAGE_FORMATS: + if image_format in parsed_format: + parsed_format = image_format + break + + if parsed_format not in IMAGE_FORMATS: + parsed_format = None + + return parsed_format + + +def _get_images_req_url(query, image_options=None, page=0, + per_page=20): + query = query.strip().replace(":", "%3A").replace( + "+", "%2B").replace("&", "%26").replace(" ", "+") + + url = "https://www.google.com.ar/search?q={}".format(query) + \ + "&es_sm=122&source=lnms" + \ + "&tbm=isch&sa=X&ei=DDdUVL-fE4SpNq-ngPgK&ved=0CAgQ_AUoAQ" + \ + "&biw=1024&bih=719&dpr=1.25" + + if image_options: + tbs = image_options.get_tbs() + if tbs: + url = url + tbs + + return url + + +def _find_divs_with_images(soup): + + try: + div_container = soup.find("div", {"id": "rg_s"}) + divs = div_container.find_all("div", {"class": "rg_di"}) + except: + divs = None + return divs + + +def _get_file_name(link): + + temp_name = link.rsplit('/', 1)[-1] + image_format = _parse_image_format(link) + + if image_format and temp_name.rsplit(".", 1)[-1] != image_format: + file_name = temp_name.rsplit(".", 1)[0] + "." + image_format + + else: + file_name = temp_name + + return file_name + + +def _get_name(): + pass + + +def _get_filesize(): + pass + + +def _get_image_data(res, a): + """Parse image data and write it to an ImageResult object. + + Args: + res: An ImageResult object. + a: An "a" html tag. 
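+           (its href carries the imgurl, imgrefurl, w and h query parameters that are parsed below).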
+ """ + google_middle_link = a["href"] + url_parsed = urllib.parse.urlparse(google_middle_link) + qry_parsed = urllib.parse.parse_qs(url_parsed.query) + res.name = _get_name() + res.link = qry_parsed["imgurl"][0] + res.file_name = _get_file_name(res.link) + res.format = _parse_image_format(res.link) + res.width = qry_parsed["w"][0] + res.height = qry_parsed["h"][0] + res.site = qry_parsed["imgrefurl"][0] + res.domain = urllib.parse.urlparse(res.site).netloc + res.filesize = _get_filesize() + + +def _get_thumb_data(res, img): + """Parse thumb data and write it to an ImageResult object. + + Args: + res: An ImageResult object. + a: An "a" html tag. + """ + try: + res.thumb = img[0]["src"] + except: + res.thumb = img[0]["data-src"] + + try: + img_style = img[0]["style"].split(";") + img_style_dict = {i.split(":")[0]: i.split(":")[-1] for i in img_style} + res.thumb_width = img_style_dict["width"] + res.thumb_height = img_style_dict["height"] + except: + exc_type, exc_value, exc_traceback = sys.exc_info() + print(exc_type, exc_value, "index=", res.index) + + +# PUBLIC +def search_old(query, image_options=None, pages=1): + results = [] + for i in range(pages): + url = get_image_search_url(query, image_options, i) + html = get_html(url) + if html: + if Google.DEBUG_MODE: + write_html_to_file( + html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i)) + j = 0 + soup = BeautifulSoup(html) + match = re.search("dyn.setResults\((.+)\);", html) + if match: + init = str(match.group(1), errors="ignore") + tokens = init.split('],[') + for token in tokens: + res = ImageResult() + res.page = i + res.index = j + toks = token.split(",") + + # should be 32 or 33, but seems to change, so just make sure no exceptions + # will be thrown by the indexing + if (len(toks) > 22): + for t in range(len(toks)): + toks[t] = toks[t].replace('\\x3cb\\x3e', '').replace( + '\\x3c/b\\x3e', '').replace('\\x3d', '=').replace('\\x26', '&') + match = re.search( + "imgurl=(?P[^&]+)&imgrefurl", toks[0]) + if match: + res.link = match.group("link") + res.name = toks[6].replace('"', '') + res.thumb = toks[21].replace('"', '') + res.format = toks[10].replace('"', '') + res.domain = toks[11].replace('"', '') + match = re.search( + "(?P[0-9]+) × (?P[0-9]+) - (?P[^ ]+)", toks[9].replace('"', '')) + if match: + res.width = match.group("width") + res.height = match.group("height") + res.filesize = match.group("size") + results.append(res) + j = j + 1 + return results + + +def search(query, image_options=None, num_images=50): + """Search images in google. + + Search images in google filtering by image type, size category, resolution, + exact width, exact height, color type or color. A simple search can be + performed without passing options. To filter the search, an ImageOptions + must be built with the different filter categories and passed. 
+ + Args: + query: string to search in google images + image_options: an ImageOptions object to filter the search + num_images: number of images to be scraped + + Returns: + A list of ImageResult objects + """ + + results = set() + curr_num_img = 1 + page = 0 + browser = get_browser_with_url("about:home") + while curr_num_img <= num_images: + + page += 1 + url = _get_images_req_url(query, image_options, page) + # html = get_html_from_dynamic_site(url) + browser.get(url) + html = browser.page_source + + if html: + soup = BeautifulSoup(html) + + # iterate over the divs containing images in one page + divs = _find_divs_with_images(soup) + + # empty search result page case + if not divs: + break + + for div in divs: + + res = ImageResult() + + # store indexing paramethers + res.page = page + res.index = curr_num_img + + # get url of image and its secondary data + a = div.find("a") + if a: + _get_image_data(res, a) + + # get url of thumb and its size paramethers + img = a.find_all("img") + if img: + _get_thumb_data(res, img) + + # increment image counter only if a new image was added + prev_num_results = len(results) + results.add(res) + curr_num_results = len(results) + + if curr_num_results > prev_num_results: + curr_num_img += 1 + + # break the loop when limit of images is reached + if curr_num_img >= num_images: + break + + browser.quit() + + return list(results) + + +def _download_image(image_result, path): + + if image_result.format: + if path: + image_result.download(path) + else: + image_result.download() + + +@measure_time +def download(image_results, path=None): + """Download a list of images. + + Args: + images_list: a list of ImageResult instances + path: path to store downloaded images. + """ + + total_images = len(image_results) + i = 1 + for image_result in image_results: + + progress = "".join(["Downloading image ", str(i), + " (", str(total_images), ")"]) + print(progress) + sys.stdout.flush() + + _download_image(image_result, path) + + i += 1 + + +class ThreadUrl(threading.Thread): + + """Threaded Url Grab""" + + def __init__(self, queue, path, total): + threading.Thread.__init__(self) + self.queue = queue + self.path = path + self.total = total + + def run(self): + while True: + # grabs host from queue + image_result = self.queue.get() + + counter = self.total - self.queue.qsize() + progress = "".join(["Downloading image ", str(counter), + " (", str(self.total), ")"]) + print(progress) + sys.stdout.flush() + _download_image(image_result, self.path) + + # signals to queue job is done + self.queue.task_done() + + +@measure_time +def fast_download(image_results, path=None, threads=10): + # print path + queue = queue.Queue() + total = len(image_results) + + for image_result in image_results: + queue.put(image_result) + + # spawn a pool of threads, and pass them queue instance + for i in range(threads): + t = ThreadUrl(queue, path, total) + t.setDaemon(True) + t.start() + + # wait on the queue until everything has been processed + queue.join() diff --git a/googleapi/modules/shopping_search.py b/googleapi/modules/shopping_search.py new file mode 100644 index 0000000..aa4bd9b --- /dev/null +++ b/googleapi/modules/shopping_search.py @@ -0,0 +1,81 @@ +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import absolute_import + +from builtins import range +from builtins import object + +from .utils import get_html, normalize_query +from bs4 import BeautifulSoup +import re +from unidecode import unidecode + + +class ShoppingResult(object): + + 
"""Represents a shopping result.""" + + def __init__(self): + self.name = None + self.link = None + self.thumb = None + self.subtext = None + self.description = None + self.compare_url = None + self.store_count = None + self.min_price = None + + def __repr__(self): + return unidecode(self.name) + + +def shopping(query, pages=1): + results = [] + for i in range(pages): + url = _get_shopping_url(query, i) + html = get_html(url) + if html: + j = 0 + soup = BeautifulSoup(html) + + products = soup.findAll("div", "g") + print("yoooo", products) + for prod in products: + res = ShoppingResult() + + divs = prod.findAll("div") + for div in divs: + match = re.search( + "from (?P[0-9]+) stores", div.text.strip()) + if match: + res.store_count = match.group("count") + break + + h3 = prod.find("h3", "r") + if h3: + a = h3.find("a") + if a: + res.compare_url = a["href"] + res.name = h3.text.strip() + + psliimg = prod.find("div", "psliimg") + if psliimg: + img = psliimg.find("img") + if img: + res.thumb = img["src"] + + f = prod.find("div", "f") + if f: + res.subtext = f.text.strip() + + price = prod.find("div", "psliprice") + if price: + res.min_price = price.text.strip() + + results.append(res) + j = j + 1 + return results + + +def _get_shopping_url(query, page=0, per_page=10): + return "http://www.google.com/search?hl=en&q={0}&tbm=shop&start={1}&num={2}".format(normalize_query(query), page * per_page, per_page) diff --git a/googleapi/modules/standard_search.py b/googleapi/modules/standard_search.py new file mode 100644 index 0000000..7862940 --- /dev/null +++ b/googleapi/modules/standard_search.py @@ -0,0 +1,236 @@ +from __future__ import unicode_literals +from __future__ import absolute_import + +from future import standard_library +standard_library.install_aliases() +from builtins import range +from builtins import object +from .utils import _get_search_url, get_html +from bs4 import BeautifulSoup +import urllib.parse +from urllib.parse import unquote, parse_qs, urlparse +from unidecode import unidecode +from re import match, findall + + +class GoogleResult(object): + + """Represents a google search result.""" + + def __init__(self): + self.name = None # The title of the link + self.link = None # The external link + self.google_link = None # The google link + self.description = None # The description of the link + self.thumb = None # Thumbnail link of website (NOT implemented yet) + self.cached = None # Cached version link of page + self.page = None # Results page this one was on + self.index = None # What index on this page it was on + self.number_of_results = None # The total number of results the query returned + self.is_pdf = None # This boolean is true if google thinks this result leads to a PDF file + + def __repr__(self): + name = self._limit_str_size(self.name, 55) + description = self._limit_str_size(self.description, 49) + + list_google = ["GoogleResult(", + "name={}".format(name), "\n", " " * 13, + "description={}".format(description)] + + return "".join(list_google) + + def _limit_str_size(self, str_element, size_limit): + """Limit the characters of the string, adding .. at the end.""" + if not str_element: + return None + + elif len(str_element) > size_limit: + return unidecode(str_element[:size_limit]) + ".." + + else: + return unidecode(str_element) + + +# PUBLIC +def search(query, pages=1, lang='en', area='com', ncr=False, void=True, time_period=False, sort_by_date=False, first_page=0): + """Returns a list of GoogleResult. + + Args: + query: String to search in google. 
+ pages: Number of pages where results must be taken. + area : Area of google homepages. + first_page : First page. + + TODO: add support to get the google results. + Returns: + A GoogleResult object.""" + + results = [] + for i in range(first_page, first_page + pages): + url = _get_search_url(query, i, lang=lang, area=area, ncr=ncr, time_period=time_period, sort_by_date=sort_by_date) + html = get_html(url) + + if html: + soup = BeautifulSoup(html, "html.parser") + divs = soup.findAll("div", attrs={"class": "g"}) + + results_div = soup.find("div", attrs={"id": "resultStats"}) + number_of_results = _get_number_of_results(results_div) + + j = 0 + for li in divs: + res = GoogleResult() + + res.page = i + res.index = j + + res.name = _get_name(li) + res.link = _get_link(li) + res.google_link = _get_google_link(li) + res.description = _get_description(li) + res.thumb = _get_thumb() + res.cached = _get_cached(li) + res.number_of_results = number_of_results + res.is_pdf = _get_is_pdf(li) + + if void is True: + if res.description is None: + continue + results.append(res) + j += 1 + return results + + +# PRIVATE +def _get_name(li): + """Return the name of a google search.""" + a = li.find("a") + # return a.text.encode("utf-8").strip() + if a is not None: + return a.text.strip() + return None + + +def _filter_link(link): + '''Filter links found in the Google result pages HTML code. + Returns None if the link doesn't yield a valid result. + ''' + try: + # Valid results are absolute URLs not pointing to a Google domain + # like images.google.com or googleusercontent.com + o = urlparse(link, 'http') + # link type-1 + # >>> "https://www.gitbook.com/book/ljalphabeta/python-" + if o.netloc and 'google' not in o.netloc: + return link + # link type-2 + # >>> "http://www.google.com/url?url=http://python.jobbole.com/84108/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggUMAA&usg=AFQjCNHPws5Buru5Z71wooRLHT6mpvnZlA" + if o.netloc and o.path.startswith('/url'): + try: + link = parse_qs(o.query)['url'][0] + o = urlparse(link, 'http') + if o.netloc and 'google' not in o.netloc: + return link + except KeyError: + pass + # Decode hidden URLs. + if link.startswith('/url?'): + try: + # link type-3 + # >>> "/url?q=http://python.jobbole.com/84108/&sa=U&ved=0ahUKEwjFw6Txg4_UAhVI5IMKHfqVAykQFggUMAA&usg=AFQjCNFOTLpmpfqctpIn0sAfaj5U5gAU9A" + link = parse_qs(o.query)['q'][0] + # Valid results are absolute URLs not pointing to a Google domain + # like images.google.com or googleusercontent.com + o = urlparse(link, 'http') + if o.netloc and 'google' not in o.netloc: + return link + except KeyError: + # link type-4 + # >>> "/url?url=https://machine-learning-python.kspax.io/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggfMAI&usg=AFQjCNEfkUI0RP_RlwD3eI22rSfqbYM_nA" + link = parse_qs(o.query)['url'][0] + o = urlparse(link, 'http') + if o.netloc and 'google' not in o.netloc: + return link + + # Otherwise, or on error, return None. 
+ except Exception: + pass + return None + + +def _get_link(li): + """Return external link from a search.""" + try: + a = li.find("a") + link = a["href"] + except Exception: + return None + return _filter_link(link) + + +def _get_google_link(li): + """Return google link from a search.""" + try: + a = li.find("a") + link = a["href"] + except Exception: + return None + + if link.startswith("/url?") or link.startswith("/search?"): + return urllib.parse.urljoin("http://www.google.com", link) + + else: + return None + + +def _get_description(li): + """Return the description of a google search. + + TODO: There are some text encoding problems to resolve.""" + + sdiv = li.find("div", attrs={"class": "IsZvec"}) + if sdiv: + stspan = sdiv.find("span", attrs={"class": "aCOpRe"}) + if stspan is not None: + # return stspan.text.encode("utf-8").strip() + return stspan.text.strip() + else: + return None + + +def _get_thumb(): + """Return the link to a thumbnail of the website.""" + pass + + +def _get_cached(li): + """Return a link to the cached version of the page.""" + links = li.find_all("a") + if len(links) > 1 and links[1].text == "Cached": + link = links[1]["href"] + if link.startswith("/url?") or link.startswith("/search?"): + return urllib.parse.urljoin("http://www.google.com", link) + return None + +def _get_is_pdf(li): + """Return if the link is marked by google as PDF""" + sdiv = li.find("span", attrs={"class": "ZGwO7 C0kchf NaCKVc"}) + return True if sdiv else False + +def _get_number_of_results(results_div): + """Return the total number of results of the google search. + Note that the returned value will be the same for all the GoogleResult + objects from a specific query.""" + try: + results_div_text = results_div.get_text() + if results_div_text: + regex = r"((?:\d+[,\.])*\d+)" + m = findall(regex, results_div_text) + + # Clean up the number. 
+ num = m[0].replace(",", "").replace(".", "") + + results = int(num) + return results + except Exception as e: + return 0 diff --git a/googleapi/modules/utils.py b/googleapi/modules/utils.py new file mode 100644 index 0000000..0f18be6 --- /dev/null +++ b/googleapi/modules/utils.py @@ -0,0 +1,560 @@ +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import division + +from future import standard_library +standard_library.install_aliases() +from builtins import range +from past.utils import old_div +import time +from selenium import webdriver +import urllib.request +import urllib.error +import urllib.parse +from functools import wraps +# import requests +from urllib.parse import urlencode +from fake_useragent import UserAgent +import sys + +class AreaError(KeyError): + pass + + +def measure_time(fn): + + def decorator(*args, **kwargs): + start = time.time() + + res = fn(*args, **kwargs) + + elapsed = time.time() - start + print(fn.__name__, "took", elapsed, "seconds") + + return res + + return decorator + + +def normalize_query(query): + return query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+") + + +def _get_search_url(query, page=0, per_page=10, lang='en', area='com', ncr=False, time_period=False, sort_by_date=False): + # note: num per page might not be supported by google anymore (because of + # google instant) + + params = { + 'nl': lang, + 'q': query.encode('utf8'), + 'start': page * per_page, + 'num': per_page + } + + time_mapping = { + 'hour': 'qdr:h', + 'week': 'qdr:w', + 'month': 'qdr:m', + 'year': 'qdr:y' + } + + + tbs_param = [] + # Set time period for query if given + if time_period and time_period in time_mapping: + tbs_param.append(time_mapping[time_period]) + + if sort_by_date: + tbs_param.append('sbd:1') + params['tbs'] = ','.join(tbs_param) + + # This will allow to search Google with No Country Redirect + if ncr: + params['gl'] = 'us' # Geographic Location: US + params['pws'] = '0' # 'pws' = '0' disables personalised search + params['gws_rd'] = 'cr' # Google Web Server ReDirect: CountRy. + + params = urlencode(params) + + url = u"https://www.google.com/search?" + params + + # @author JuaniFilardo: + # Workaround to switch between http and https, since this maneuver + # seems to avoid the 503 error when performing a lot of queries. + # Weird, but it works. + # You may also wanna wait some time between queries, say, randint(50,65) + # between each query, and randint(180,240) every 100 queries, which is + # what I found useful. + https = int(time.time()) % 2 == 0 + bare_url = u"https://www.google.com/search?" if https else u"http://www.google.com/search?" + url = bare_url + params + + # return u"http://www.google.com/search?hl=%s&q=%s&start=%i&num=%i" % + # (lang, normalize_query(query), page * per_page, per_page) + if not ncr: + if area == 'com': + url = u"http://www.google.com/search?" + elif area == 'is': + url = 'http://www.google.is/search?' + elif area == 'dk': + url = 'http://www.google.dk/search?' + elif area == 'no': + url = 'http://www.google.no/search?' + elif area == 'se': + url = 'http://www.google.se/search?' + elif area == 'fi': + url = 'http://www.google.fi/search?' + elif area == 'ee': + url = 'http://www.google.ee/search?' + elif area == 'lv': + url = 'http://www.google.lv/search?' + elif area == 'lt': + url = 'http://www.google.lt/search?' + elif area == 'ie': + url = 'http://www.google.ie/search?' + elif area == 'uk': + url = 'http://www.google.co.uk/search?' 
+ elif area == 'gg': + url = 'http://www.google.gg/search?' + elif area == 'je': + url = 'http://www.google.je/search?' + elif area == 'im': + url = 'http://www.google.im/search?' + elif area == 'fr': + url = 'http://www.google.fr/search?' + elif area == 'nl': + url = 'http://www.google.nl/search?' + elif area == 'be': + url = 'http://www.google.be/search?' + elif area == 'lu': + url = 'http://www.google.lu/search?' + elif area == 'de': + url = 'http://www.google.de/search?' + elif area == 'at': + url = 'http://www.google.at/search?' + elif area == 'ch': + url = 'http://www.google.ch/search?' + elif area == 'li': + url = 'http://www.google.li/search?' + elif area == 'pt': + url = 'http://www.google.pt/search?' + elif area == 'es': + url = 'http://www.google.es/search?' + elif area == 'gi': + url = 'http://www.google.com.gi/search?' + elif area == 'ad': + url = 'http://www.google.ad/search?' + elif area == 'it': + url = 'http://www.google.it/search?' + elif area == 'mt': + url = 'http://www.google.com.mt/search?' + elif area == 'sm': + url = 'http://www.google.sm/search?' + elif area == 'gr': + url = 'http://www.google.gr/search?' + elif area == 'ru': + url = 'http://www.google.ru/search?' + elif area == 'by': + url = 'http://www.google.com.by/search?' + elif area == 'ua': + url = 'http://www.google.com.ua/search?' + elif area == 'pl': + url = 'http://www.google.pl/search?' + elif area == 'cz': + url = 'http://www.google.cz/search?' + elif area == 'sk': + url = 'http://www.google.sk/search?' + elif area == 'hu': + url = 'http://www.google.hu/search?' + elif area == 'si': + url = 'http://www.google.si/search?' + elif area == 'hr': + url = 'http://www.google.hr/search?' + elif area == 'ba': + url = 'http://www.google.ba/search?' + elif area == 'me': + url = 'http://www.google.me/search?' + elif area == 'rs': + url = 'http://www.google.rs/search?' + elif area == 'mk': + url = 'http://www.google.mk/search?' + elif area == 'bg': + url = 'http://www.google.bg/search?' + elif area == 'ro': + url = 'http://www.google.ro/search?' + elif area == 'md': + url = 'http://www.google.md/search?' + elif area == 'hk': + url = 'http://www.google.com.hk/search?' + elif area == 'mn': + url = 'http://www.google.mn/search?' + elif area == 'kr': + url = 'http://www.google.co.kr/search?' + elif area == 'jp': + url = 'http://www.google.co.jp/search?' + elif area == 'vn': + url = 'http://www.google.com.vn/search?' + elif area == 'la': + url = 'http://www.google.la/search?' + elif area == 'kh': + url = 'http://www.google.com.kh/search?' + elif area == 'th': + url = 'http://www.google.co.th/search?' + elif area == 'my': + url = 'http://www.google.com.my/search?' + elif area == 'sg': + url = 'http://www.google.com.sg/search?' + elif area == 'bn': + url = 'http://www.google.com.bn/search?' + elif area == 'ph': + url = 'http://www.google.com.ph/search?' + elif area == 'id': + url = 'http://www.google.co.id/search?' + elif area == 'tp': + url = 'http://www.google.tp/search?' + elif area == 'kz': + url = 'http://www.google.kz/search?' + elif area == 'kg': + url = 'http://www.google.kg/search?' + elif area == 'tj': + url = 'http://www.google.com.tj/search?' + elif area == 'uz': + url = 'http://www.google.co.uz/search?' + elif area == 'tm': + url = 'http://www.google.tm/search?' + elif area == 'af': + url = 'http://www.google.com.af/search?' + elif area == 'pk': + url = 'http://www.google.com.pk/search?' + elif area == 'np': + url = 'http://www.google.com.np/search?' 
+ elif area == 'in': + url = 'http://www.google.co.in/search?' + elif area == 'bd': + url = 'http://www.google.com.bd/search?' + elif area == 'lk': + url = 'http://www.google.lk/search?' + elif area == 'mv': + url = 'http://www.google.mv/search?' + elif area == 'kw': + url = 'http://www.google.com.kw/search?' + elif area == 'sa': + url = 'http://www.google.com.sa/search?' + elif area == 'bh': + url = 'http://www.google.com.bh/search?' + elif area == 'ae': + url = 'http://www.google.ae/search?' + elif area == 'om': + url = 'http://www.google.com.om/search?' + elif area == 'jo': + url = 'http://www.google.jo/search?' + elif area == 'il': + url = 'http://www.google.co.il/search?' + elif area == 'lb': + url = 'http://www.google.com.lb/search?' + elif area == 'tr': + url = 'http://www.google.com.tr/search?' + elif area == 'az': + url = 'http://www.google.az/search?' + elif area == 'am': + url = 'http://www.google.am/search?' + elif area == 'ls': + url = 'http://www.google.co.ls/search?' + elif area == 'eg': + url = 'http://www.google.com.eg/search?' + elif area == 'ly': + url = 'http://www.google.com.ly/search?' + elif area == 'dz': + url = 'http://www.google.dz/search?' + elif area == 'ma': + url = 'http://www.google.co.ma/search?' + elif area == 'sn': + url = 'http://www.google.sn/search?' + elif area == 'gm': + url = 'http://www.google.gm/search?' + elif area == 'ml': + url = 'http://www.google.ml/search?' + elif area == 'bf': + url = 'http://www.google.bf/search?' + elif area == 'sl': + url = 'http://www.google.com.sl/search?' + elif area == 'ci': + url = 'http://www.google.ci/search?' + elif area == 'gh': + url = 'http://www.google.com.gh/search?' + elif area == 'tg': + url = 'http://www.google.tg/search?' + elif area == 'bj': + url = 'http://www.google.bj/search?' + elif area == 'ne': + url = 'http://www.google.ne/search?' + elif area == 'ng': + url = 'http://www.google.com.ng/search?' + elif area == 'sh': + url = 'http://www.google.sh/search?' + elif area == 'cm': + url = 'http://www.google.cm/search?' + elif area == 'td': + url = 'http://www.google.td/search?' + elif area == 'cf': + url = 'http://www.google.cf/search?' + elif area == 'ga': + url = 'http://www.google.ga/search?' + elif area == 'cg': + url = 'http://www.google.cg/search?' + elif area == 'cd': + url = 'http://www.google.cd/search?' + elif area == 'ao': + url = 'http://www.google.it.ao/search?' + elif area == 'et': + url = 'http://www.google.com.et/search?' + elif area == 'dj': + url = 'http://www.google.dj/search?' + elif area == 'ke': + url = 'http://www.google.co.ke/search?' + elif area == 'ug': + url = 'http://www.google.co.ug/search?' + elif area == 'tz': + url = 'http://www.google.co.tz/search?' + elif area == 'rw': + url = 'http://www.google.rw/search?' + elif area == 'bi': + url = 'http://www.google.bi/search?' + elif area == 'mw': + url = 'http://www.google.mw/search?' + elif area == 'mz': + url = 'http://www.google.co.mz/search?' + elif area == 'mg': + url = 'http://www.google.mg/search?' + elif area == 'sc': + url = 'http://www.google.sc/search?' + elif area == 'mu': + url = 'http://www.google.mu/search?' + elif area == 'zm': + url = 'http://www.google.co.zm/search?' + elif area == 'zw': + url = 'http://www.google.co.zw/search?' + elif area == 'bw': + url = 'http://www.google.co.bw/search?' + elif area == 'na': + url = 'http://www.google.com.na/search?' + elif area == 'za': + url = 'http://www.google.co.za/search?' + elif area == 'au': + url = 'http://www.google.com.au/search?' 
+ elif area == 'nf': + url = 'http://www.google.com.nf/search?' + elif area == 'nz': + url = 'http://www.google.co.nz/search?' + elif area == 'sb': + url = 'http://www.google.com.sb/search?' + elif area == 'fj': + url = 'http://www.google.com.fj/search?' + elif area == 'fm': + url = 'http://www.google.fm/search?' + elif area == 'ki': + url = 'http://www.google.ki/search?' + elif area == 'nr': + url = 'http://www.google.nr/search?' + elif area == 'tk': + url = 'http://www.google.tk/search?' + elif area == 'ws': + url = 'http://www.google.ws/search?' + elif area == 'as': + url = 'http://www.google.as/search?' + elif area == 'to': + url = 'http://www.google.to/search?' + elif area == 'nu': + url = 'http://www.google.nu/search?' + elif area == 'ck': + url = 'http://www.google.co.ck/search?' + elif area == 'do': + url = 'http://www.google.com.do/search?' + elif area == 'tt': + url = 'http://www.google.tt/search?' + elif area == 'co': + url = 'http://www.google.com.co/search?' + elif area == 'ec': + url = 'http://www.google.com.ec/search?' + elif area == 've': + url = 'http://www.google.co.ve/search?' + elif area == 'gy': + url = 'http://www.google.gy/search?' + elif area == 'pe': + url = 'http://www.google.com.pe/search?' + elif area == 'bo': + url = 'http://www.google.com.bo/search?' + elif area == 'py': + url = 'http://www.google.com.py/search?' + elif area == 'br': + url = 'http://www.google.com.br/search?' + elif area == 'uy': + url = 'http://www.google.com.uy/search?' + elif area == 'ar': + url = 'http://www.google.com.ar/search?' + elif area == 'cl': + url = 'http://www.google.cl/search?' + elif area == 'gl': + url = 'http://www.google.gl/search?' + elif area == 'ca': + url = 'http://www.google.ca/search?' + elif area == 'mx': + url = 'http://www.google.com.mx/search?' + elif area == 'gt': + url = 'http://www.google.com.gt/search?' + elif area == 'bz': + url = 'http://www.google.com.bz/search?' + elif area == 'sv': + url = 'http://www.google.com.sv/search?' + elif area == 'hn': + url = 'http://www.google.hn/search?' + elif area == 'ni': + url = 'http://www.google.com.ni/search?' + elif area == 'cr': + url = 'http://www.google.co.cr/search?' + elif area == 'pa': + url = 'http://www.google.com.pa/search?' + elif area == 'bs': + url = 'http://www.google.bs/search?' + elif area == 'cu': + url = 'http://www.google.com.cu/search?' + elif area == 'jm': + url = 'http://www.google.com.jm/search?' + elif area == 'ht': + url = 'http://www.google.ht/search?' + else: + raise AreaError('invalid name, no area found') + url += params + return url + + +def get_html(url): + ua = UserAgent() + header = ua.random + + try: + request = urllib.request.Request(url) + request.add_header("User-Agent", header) + html = urllib.request.urlopen(request).read() + return html + except urllib.error.HTTPError as e: + print("Error accessing:", url) + print(e) + if e.code == 503 and 'CaptchaRedirect' in e.read(): + print("Google is requiring a Captcha. " + "For more information check: 'https://support.google.com/websearch/answer/86640'") + if e.code == 503: + sys.exit("503 Error: service is currently unavailable. 
Program will exit.") + return None + except Exception as e: + print("Error accessing:", url) + print(e) + return None + + +def write_html_to_file(html, filename): + of = open(filename, "w") + of.write(html.encode("utf-8")) + # of.flush() + of.close() + + +def get_browser_with_url(url, timeout=120, driver="firefox"): + """Returns an open browser with a given url.""" + + # choose a browser + if driver == "firefox": + browser = webdriver.Firefox() + elif driver == "ie": + browser = webdriver.Ie() + elif driver == "chrome": + browser = webdriver.Chrome() + else: + print("Driver choosen is not recognized") + + # set maximum load time + browser.set_page_load_timeout(timeout) + + # open a browser with given url + browser.get(url) + + time.sleep(0.5) + + return browser + + +def get_html_from_dynamic_site(url, timeout=120, + driver="firefox", attempts=10): + """Returns html from a dynamic site, opening it in a browser.""" + + RV = "" + + # try several attempts + for i in range(attempts): + try: + # load browser + browser = get_browser_with_url(url, timeout, driver) + + # get html + time.sleep(2) + content = browser.page_source + + # try again if there is no content + if not content: + browser.quit() + raise Exception("No content!") + + # if there is content gets out + browser.quit() + RV = content + break + + except: + print("\nTry ", i, " of ", attempts, "\n") + time.sleep(5) + + return RV + + +def timeit(func=None, loops=1, verbose=False): + if func: + def inner(*args, **kwargs): + + sums = 0.0 + mins = 1.7976931348623157e+308 + maxs = 0.0 + print('====%s Timing====' % func.__name__) + for i in range(0, loops): + t0 = time.time() + result = func(*args, **kwargs) + dt = time.time() - t0 + mins = dt if dt < mins else mins + maxs = dt if dt > maxs else maxs + sums += dt + if verbose: + print('\t%r ran in %2.9f sec on run %s' % + (func.__name__, dt, i)) + print('%r min run time was %2.9f sec' % (func.__name__, mins)) + print('%r max run time was %2.9f sec' % (func.__name__, maxs)) + print('%r avg run time was %2.9f sec in %s runs' % + (func.__name__, old_div(sums, loops), loops)) + print('==== end ====') + return result + + return inner + else: + def partial_inner(func): + return timeit(func, loops, verbose) + return partial_inner + + +def timing(f): + @wraps(f) + def wrap(*args, **kw): + ts = time.time() + result = f(*args, **kw) + te = time.time() + print('func:%r args:[%r, %r] took: %2.4f sec' % + (f.__name__, args, kw, te - ts)) + return result + return wrap diff --git a/googleapi/tests/__init__.py b/googleapi/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/googleapi/tests/html_files/test_calculator.html b/googleapi/tests/html_files/test_calculator.html new file mode 100644 index 0000000..bb51366 --- /dev/null +++ b/googleapi/tests/html_files/test_calculator.html @@ -0,0 +1,303 @@ + +2+2 - Buscar con Google
    Usuario lector de pantalla, clic aquí para desact. Google Instant.
    Agustín

    Cerca de 25,270,000,000 resultados (0.36 segundos) 

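The fixture above is a recorded result page for the query "2+2". A minimal sketch of the value extraction that calculator.py (earlier in this diff) performs on such a page, using a hand-written snippet that only mimics the relevant markup (the real page embeds the computed value in an input inside the "_Cif" div):

    from bs4 import BeautifulSoup

    # stand-in for the fixture's result markup; only the "_Cif" shape matters
    html = '<div id="_Cif"><input value="4"/></div>'
    bs = BeautifulSoup(html, "html.parser")
    value = float(bs.find("div", {"id": "_Cif"}).find("input")["value"])
    print(value)  # -> 4.0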
    diff --git a/googleapi/tests/html_files/test_convert_currency.html b/googleapi/tests/html_files/test_convert_currency.html new file mode 100644 index 0000000..afdbc5c --- /dev/null +++ b/googleapi/tests/html_files/test_convert_currency.html @@ -0,0 +1,368 @@ + + + +Currency Converter - Google Finance + + + + +
    +
    +
    + +
    +
    + +
    +
    to
    +
    + +
    +  +
    5 USD = 4.4585 EUR + +
    + +
    + +
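This fixture records the converter page for 5 USD ("5 USD = 4.4585 EUR"). A sketch of the parse done by currency._parse_currency_response() earlier in this diff, run against a hand-written snippet shaped like that result line (on the real page the converted amount sits in a span inside the currency_converter_result div):

    from bs4 import BeautifulSoup

    html = ('<div id="currency_converter_result">'
            '5 USD = <span class="bld">4.4585 EUR</span></div>')
    bs = BeautifulSoup(html, "html.parser")
    str_rate = bs.find(id="currency_converter_result").span.get_text()
    rate = float(str_rate.replace("EUR", "").strip())
    print(rate)  # -> 4.4585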
    + diff --git a/googleapi/tests/html_files/test_exchange_rate.html b/googleapi/tests/html_files/test_exchange_rate.html new file mode 100644 index 0000000..291b55f --- /dev/null +++ b/googleapi/tests/html_files/test_exchange_rate.html @@ -0,0 +1,368 @@ + + + +Currency Converter - Google Finance + + + + +
    +
    +
    + +
    +
    + +
    +
    to
    +
    + +
    +  +
    1 USD = 0.8913 EUR + +
    + +
    + +
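This fixture records 1 USD = 0.8913 EUR, which is exactly what exchange_rate() returns, since it is convert() with an amount of 1. A usage sketch (a live call needs the old finance converter endpoint to answer; the tests replay it from a vcr cassette instead):

    from googleapi import currency

    rate = currency.exchange_rate("USD", "EUR")  # same as convert(1, "USD", "EUR")
    print(rate)  # -> 0.8913 at recording time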
    + diff --git a/googleapi/tests/html_files/test_search_images.html b/googleapi/tests/html_files/test_search_images.html new file mode 100644 index 0000000..6c17b4a --- /dev/null +++ b/googleapi/tests/html_files/test_search_images.html @@ -0,0 +1,337 @@ + +apple - Buscar con Google
    Usuario lector de pantalla, clic aquí para desact. Google Instant.
    \ No newline at end of file diff --git a/googleapi/tests/html_files/test_shopping_search.html b/googleapi/tests/html_files/test_shopping_search.html new file mode 100644 index 0000000..b1f610c --- /dev/null +++ b/googleapi/tests/html_files/test_shopping_search.html @@ -0,0 +1 @@ +Disgaea 4 - Google Search

     
    \ No newline at end of file diff --git a/googleapi/tests/html_files/test_standard_search.html b/googleapi/tests/html_files/test_standard_search.html new file mode 100644 index 0000000..2b9279a --- /dev/null +++ b/googleapi/tests/html_files/test_standard_search.html @@ -0,0 +1,10 @@ +github - Google Search

     
    About 123,000,000 results
    \ No newline at end of file diff --git a/googleapi/tests/test_google.py b/googleapi/tests/test_google.py new file mode 100644 index 0000000..ef70f36 --- /dev/null +++ b/googleapi/tests/test_google.py @@ -0,0 +1,175 @@ +from builtins import object +import unittest +import nose +from googleapi import google +from googleapi import currency, images, shopping_search +from mock import Mock +import os +import vcr + +BASE_DIR = os.path.dirname(__file__) + + +def load_html_file(path): + """Call test with a html file of the same name. + + Args: + path: Relative path where the html file is located.""" + + def test_decorator(fn): + base_path = os.path.join(os.path.dirname(__file__), path) + file_name = fn.__name__ + ".html" + file_path = os.path.join(base_path, file_name) + + html_f = open(file_path, "r") + + def test_decorated(self): + fn(self, html_f) + + return test_decorated + return test_decorator + + +# HELPERS +def get_dir_vcr(name): + return os.path.join(BASE_DIR, "vcr_cassetes", name) + + +class GoogleTest(unittest.TestCase): + + @load_html_file("html_files") + # @unittest.skip("skip") + def test_search_images(self, html_f): + """Test method to search images.""" + + class MockBrowser(object): + + """Mock browser to replace selenium driver.""" + + def __init__(self, html_f): + self.page_source = html_f.read() + self.page_source = self.page_source.decode('utf8') if 'decode' in dir(self.page_source) else self.page_source + + def get(self, url): + pass + + def quit(self): + pass + + google.images.get_browser_with_url = \ + Mock(return_value=MockBrowser(html_f)) + + res = google.search_images("apple", num_images=10) + self.assertEqual(len(res), 10) + + # @load_html_file("html_files") + # def test_calculator(self, html_f): + @unittest.skip("skip") + def test_calculator(self): + """Test method to calculate in google.""" + + # replace method to get html from a test html file + # google.calculator.get_html_from_dynamic_site = \ + # Mock(return_value=html_f.read().decode('utf8')) + + calc = google.calculate("157.3kg in grams") + self.assertEqual(calc.value, 157300) + + # @load_html_file("html_files") + @vcr.use_cassette(get_dir_vcr("test_exchange_rate.yaml")) + def test_exchange_rate(self): + """Test method to get an exchange rate in google.""" + + usd_to_eur = google.exchange_rate("USD", "EUR") + self.assertGreater(usd_to_eur, 0.0) + + # @load_html_file("html_files") + @vcr.use_cassette(get_dir_vcr("test_convert_currency.yaml")) + def test_convert_currency(self): + """Test method to convert currency in google.""" + + euros = google.convert_currency(5.0, "USD", "EUR") + self.assertGreater(euros, 0.0) + + # @load_html_file("html_files") + @vcr.use_cassette(get_dir_vcr("test_standard_search.yaml")) + def test_standard_search(self): + """Test method to search in google.""" + + search = google.search("github") + self.assertNotEqual(len(search), 0) + + # @load_html_file("html_files") + @vcr.use_cassette(get_dir_vcr("test_shopping_search.yaml")) + @unittest.skip("skip") + def test_shopping_search(self): + """Test method for google shopping.""" + + shop = google.shopping_search("Disgaea 4") + self.assertNotEqual(len(shop), 0) + + +class ConvertCurrencyTest(unittest.TestCase): + + def test_get_currency_req_url(self): + """Test method to get currency conversion request url.""" + + amount = 10 + from_currency = "USD" + to_currency = "EUR" + req_url = currency._get_currency_req_url(amount, from_currency, + to_currency) + + exp_req_url = "https://www.google.com/finance/converter?a=10&from=USD&to=EUR" + + 
self.assertEqual(req_url, exp_req_url) + + # @unittest.skip("skip") + def test_parse_currency_response(self): + """Test method to parse currency response. TODO!""" + pass + +# @unittest.skip("skip") + + +class SearchImagesTest(unittest.TestCase): + + def test_get_images_req_url(self): + + query = "banana" + options = images.ImageOptions() + options.image_type = images.ImageType.CLIPART + options.larger_than = images.LargerThan.MP_4 + options.color = "green" + options.license = images.License.REUSE_WITH_MOD + + req_url = images._get_images_req_url(query, options) + + exp_req_url = 'https://www.google.com.ar/search?q=banana&es_sm=122&source=lnms&tbm=isch&sa=X&ei=DDdUVL-fE4SpNq-ngPgK&ved=0CAgQ_AUoAQ&biw=1024&bih=719&dpr=1.25&tbs=itp:clipart,isz:lt,islt:4mp,ic:specific,isc:green,sur:fmc' + + self.assertEqual(req_url, exp_req_url) + + def test_repr(self): + res = images.ImageResult() + assert repr( + res) == 'ImageResult(index=None, page=None, domain=None, link=None)' + res.page = 1 + res.index = 11 + res.name = 'test' + res.thumb = 'test' + res.format = 'test' + res.domain = 'test' + res.link = 'http://aa.com' + assert repr( + res) == 'ImageResult(index=11, page=1, domain=test, link=http://aa.com)' + + def test_download(self): + pass + + def test_fast_download(self): + pass + + +if __name__ == '__main__': + # nose.main() + nose.run(defaultTest=__name__) diff --git a/googleapi/tests/test_utils.py b/googleapi/tests/test_utils.py new file mode 100644 index 0000000..c2a3eb1 --- /dev/null +++ b/googleapi/tests/test_utils.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""Tests helper methods.""" + +from __future__ import print_function +from __future__ import with_statement +import unittest +import nose + +from googleapi.modules.utils import _get_search_url + + +class UtilsTestCase(unittest.TestCase): + """Tests for helper methods.""" + @unittest.skip('Don\t know why but it not work. Skipping for now') + def test_get_search_url(self): + url = _get_search_url("apple", 0, 10, "en", "jp") + exp_url = "http://www.google.co.jp/search?q=apple&start=0&num=10&hl=en" + self.assertEqual(url, exp_url) + + +if __name__ == '__main__': + nose.run(defaultTest=__name__) diff --git a/googleapi/tests/vcr_cassetes/test_convert_currency.yaml b/googleapi/tests/vcr_cassetes/test_convert_currency.yaml new file mode 100644 index 0000000..e9a363b --- /dev/null +++ b/googleapi/tests/vcr_cassetes/test_convert_currency.yaml @@ -0,0 +1,9308 @@ +interactions: +- request: + body: null + headers: + Connection: [close] + Host: [!!python/unicode 'www.w3schools.com'] + User-Agent: [Python-urllib/2.7] + method: GET + uri: https://www.w3schools.com/browsers/default.asp + response: + body: {string: !!python/unicode "\r\n\r\n\r\ + \n\r\nBrowser Statistics\r\n\r\ + \n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\ + \n\r\n\r\n\ + \r\n\r\n
    \r\n \r\ + \n
    THE WORLD'S LARGEST WEB DEVELOPER SITE
    \r\n\ +
    \r\n\r\n\r\n