Skip to content

Commit ba191ab

Browse files
committed
Initial commit
0 parents  commit ba191ab

File tree

749 files changed

+7934
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

749 files changed

+7934
-0
lines changed

MANIFEST.in

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
recursive-include wappalyzer/data *

README.md

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# wappalyzer-python
2+
Python wrapper for [Wappalyzer](https://wappalyzer.com/) (a utility that uncovers the technologies used on websites)
3+
4+
# How to use it
5+
6+
```python
7+
>>> from wappalyzer import Wappalyzer
8+
>>> w = Wappalyzer()
9+
10+
>>> w.analyze('http://wikipedia.org')
11+
{u'Apache': {u'confidence': 100, u'version': u'', u'categories': [u'web-servers']},
12+
u'Varnish': {u'confidence': 100, u'version': u'', u'categories': [u'cache-tools']}}
13+
14+
>>> w.analyze('http://tripadvisor.com')
15+
{u'Apache': {u'confidence': 100, u'version': u'', u'categories': [u'web-servers']},
16+
u'Google Analytics': {u'confidence': 100, u'version': u'', u'categories': [u'analytics']},
17+
u'comScore': {u'confidence': 100, u'version': u'', u'categories': [u'analytics']}}
18+
19+
>>> w.analyze('http://facebook.com')
20+
{u'reCAPTCHA': {u'confidence': 100, u'version': u'', u'categories': [u'captchas']}}
21+
```
22+
23+
You can specify the User-Agent to use:
24+
```python
25+
>>> w.analyze('http://www.google.com', user_agent='your_user_agent')
26+
```
27+
28+
Or analyze pages you have already downloaded (in this case you also need to provide the URL and the response headers):
29+
```python
30+
>>> w.analyze_from_data(url=the_url, html=the_html, headers=the_response_headers)
31+
```
32+
33+
Apps and Categories are available as dict objects:
34+
```python
35+
>>> w.apps
36+
{u'Google Wallet': {u'website': u'wallet.google.com', u'cats': [41], u'script': [u'checkout\\.google\\.com',
37+
u'wallet\\.google\\.com']}, u'Lockerz Share': ...}
38+
39+
>>> w.categories
40+
{u'42': u'tag-managers', u'48': u'network-storage', u'43': u'paywalls', u'49': u'feed-readers', u'24':
41+
u'rich-text-editors', u'25': u'javascript-graphics', u'26': u'mobile-frameworks', ...}
42+
43+
```
44+
45+
46+
Data can also be updated with the latest version available from the [Wappalyzer GitHub repo](https://github.com/AliasIO/Wappalyzer):
47+
48+
```python
49+
>>> from wappalyzer import updater
50+
>>> updater.update_all()
51+
```
52+
By default, app icons are saved to the `data/icons` folder. If you need them somewhere else, you can specify the destination folder:
53+
54+
```python
55+
>>> from wappalyzer import updater
56+
>>> updater.update_all(icons_folder='your_icons_folder')
57+
```
58+
59+
Or update them individually:
60+
61+
```python
62+
>>> updater.update_icons(icons_folder='your_icons_folder')
63+
```
64+
65+
# Requirements
66+
67+
* [Requests](https://github.com/kennethreitz/requests)
68+
* [PyV8](https://github.com/okoye/PyV8)
69+
* [lxml](https://github.com/lxml/lxml)
70+
71+
# Install
72+
73+
Using setup:
74+
75+
```bash
76+
$ python setup.py install
77+
```
78+
79+
Using pypi:
80+
81+
```bash
82+
$ pip install wappalyzer-python
83+
```
84+

requirements.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
requests
2+
PyV8
3+
lxml

setup.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from setuptools import setup, find_packages
2+
3+
# Package metadata, gathered in one mapping and handed to setuptools below.
_METADATA = {
    # Distribution identity.
    'name': 'wappalyzer-python',
    'version': '0.1.2',
    'url': 'https://github.com/gatufo/wappalyzer-python',
    'description': 'Python wrapper for Wappalyzer (utility that uncovers the technologies used on websites)',
    'author': 'Javier Casas',
    'author_email': '[email protected]',

    # Ship every discovered package together with its bundled data files
    # (the JSON/JS data and the app icons).
    'packages': find_packages(),
    'package_data': {'': ['data/*.*', 'data/icons/*.*']},
    'include_package_data': True,

    # Trove classifiers and search keywords.
    'classifiers': [
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2.7',
        'Topic :: Internet :: WWW/HTTP',
    ],
    'keywords': ['wappalyzer', 'scraping', 'crawling', 'site'],

    # Runtime dependencies (mirrors requirements.txt).
    'install_requires': [
        'requests',
        'PyV8',
        'lxml',
    ],
}

setup(**_METADATA)
28+
29+

wappalyzer/__init__.py

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from urlparse import urlparse
2+
import logging
3+
4+
import requests
5+
try:
6+
import PyV8
7+
except ImportError:
8+
from pyv8 import PyV8
9+
try:
10+
import json
11+
except ImportError:
12+
import simplejson as json
13+
14+
from . import settings
15+
16+
17+
logger = logging.getLogger(__name__)
18+
19+
20+
class Wappalyzer(object):
    """Detect the technologies used by a website with the Wappalyzer JS engine.

    The app and category definitions are read once, at construction time,
    from the JSON file named by ``settings.FILENAME_APPS_JSON`` and exposed
    as the ``apps`` and ``categories`` attributes.
    """

    def __init__(self):
        """Load the app and category definitions from the bundled JSON file."""
        logger.debug('Initializing Wappalyzer...')
        with open(settings.FILENAME_APPS_JSON, 'r') as f:
            data = json.load(f)
        self.categories = data['categories']
        self.apps = data['apps']

    def analyze(self, url, user_agent=None):
        """Fetch *url* over HTTP and analyze the response.

        :param url: address of the page to download and inspect.
        :param user_agent: value for the ``User-Agent`` request header;
            falls back to ``settings.USER_AGENT`` when empty or ``None``.
        :returns: whatever :meth:`analyze_from_data` returns for the
            downloaded page.
        """
        logger.debug('Fetching: %s', url)

        response = requests.get(
            url=url,
            headers={'User-Agent': user_agent or settings.USER_AGENT}
        )
        return self.analyze_from_data(
            url=url,
            html=response.text,
            headers=dict(response.headers))

    def analyze_from_data(self, url, html, headers):
        """Analyze an already downloaded page.

        :param url: address the page was fetched from; its hostname is
            passed to the JS driver as ``host``.
        :param html: body of the page.
        :param headers: response headers (must be JSON-serializable).
        :returns: the JSON-decoded result of evaluating ``w.driver.init()``
            in the JS context.
        """
        logger.debug('Analyzing: %s', url)

        data = {
            'host': urlparse(url).hostname,
            'url': url,
            'html': html,
            'headers': headers
        }
        # All Python values are injected as JSON literals, so page content
        # reaches the JS engine as data rather than as executable code.
        script = "w.apps={apps}; w.categories={categories}; w.driver.data={data}; w.driver.init();".format(
            apps=json.dumps(self.apps),
            categories=json.dumps(self.categories),
            data=json.dumps(data)
        )

        ctxt = PyV8.JSContext()
        ctxt.enter()
        try:
            # Load the Wappalyzer engine first, then the driver that hooks
            # into it.
            for filename in (settings.FILENAME_WAPPALIZER_JS,
                             settings.FILENAME_DRIVER_JS):
                with open(filename) as f:
                    ctxt.eval(f.read())

            return json.loads(ctxt.eval(script))
        finally:
            # Always leave the context: previously it stayed entered after
            # every call, including when evaluation raised.
            ctxt.leave()

0 commit comments

Comments
 (0)