Commit

Just For Book
imiyoo2010 committed Nov 7, 2017
1 parent f2d38aa commit 5240775
Showing 317 changed files with 66,150 additions and 0 deletions.
10 changes: 10 additions & 0 deletions LogManager.py
@@ -0,0 +1,10 @@
#coding=utf-8
'''
LogManager.py
'''
import logging

logging.basicConfig(format='%(name)s[%(levelname)s/%(process)d]:%(asctime)s:%(module)s.%(funcName)s.%(lineno)d - %(message)s')
log = logging.getLogger("TScanner")
log.setLevel(logging.INFO)
#log.addHandler(logging.StreamHandler())
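
A minimal usage sketch (not part of this commit) of the shared "TScanner" logger this module exposes; crawler.py below consumes it the same way via "from LogManager import log as om".

# hypothetical_usage.py -- illustration only
from LogManager import log

log.info("scan started")      # shown: INFO is at or above the configured level
log.debug("verbose detail")   # suppressed: DEBUG is below INFO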
Binary file added LogManager.pyc
16 changes: 16 additions & 0 deletions README.md
@@ -0,0 +1,16 @@
# Reference code for the book 《白帽子讲Web扫描》 (Web Scanning Explained by a White Hat)

The code has been sanitized and is intended mainly for learning and research; feel free to leave a comment if you have any questions.

# Where to buy the book
* Dangdang

http://product.dangdang.com/25092638.html

* Amazon

https://www.amazon.cn/%E7%99%BD%E5%B8%BD%E5%AD%90%E8%AE%B2Web%E6%89%AB%E6%8F%8F-%E5%88%98%E6%BC%A9/dp/B071G3X4VN/

* JD.com

https://item.jd.com/12094365.html?dist=jd
276 changes: 276 additions & 0 deletions crawler.py
@@ -0,0 +1,276 @@
# coding=utf-8
'''
crawler.py
'''
import sys
import traceback
import itertools
import time
from Queue import Queue

from LogManager import log as om

# HTTPRequest
from teye_web.http.URL import URL
from teye_web.http.Request import Request
from teye_web.http.Response import Response

# url function
from teye_web.http.function import is_similar_url

# Document Parser
import teye_web.parser.dpCache as dpCache

# wCurl
from wCurl import wcurl

# 404 Check
from teye_util.page_404 import is_404


class Crawler(object):
def __init__(self, depth_limit=1, time_limit=30, req_limit=100, filter_similar=True):
        '''
        depth_limit: max link depth from the root URL; time_limit: max crawl time in minutes;
        req_limit: max number of requests to send; filter_similar: drop similar-looking URLs.
        '''
self.root = ''

self._target_domain = ''

self.depth_limit = depth_limit
self.time_limit = time_limit
self.req_limit = req_limit

self._sleeptime = 1

self._url_list = []

self._already_visit_url = set()

self._already_seen_urls = set()

self._already_send_reqs = set()

self._relate_ext = ['html', 'shtm', 'htm', 'shtml']

self._white_ext = ['asp', 'aspx', 'jsp', 'php', 'do', 'action']

self._black_ext = ["ico", "jpg", "gif", "js", "png", "bmp", "css", "zip", "rar", "ttf"]

self._blockwords = ['mailto:', 'javascript:', 'file://', 'tel:']

self.num_urls = 0

self.num_reqs = 0

self._wRequestList = []

self._start_time = None

self._other_domains = set()

def get_discovery_time(self):
'''
        Elapsed crawl time, in minutes.
'''
now = time.time()
diff = now - self._start_time

return diff / 60

def _do_with_reqs(self, reqs):
        '''
        Keep one representative from each group of similar URLs and drop the rest.
        '''
        result = []

        if not reqs:
            return result

        count = len(reqs)

for i in xrange(count):
filter = False
filter_url = reqs[i].get_url()
for j in xrange(count - i - 1):
k = i + j + 1
store_url = reqs[k].get_url()
if is_similar_url(filter_url, store_url):
filter = True
break
if not filter:
result.append(reqs[i])

return result

def _get_reqs_from_resp(self, response):
        '''
        Parse the response and build new in-scope requests from its links and forms.
        '''
new_reqs = []

try:
doc_parser = dpCache.dpc.getDocumentParserFor(response)

except Exception, e:

pass

else:

re_urls, tag_urls = doc_parser.get_get_urls()

form_reqs = doc_parser.get_form_reqs()

seen = set()

for new_url in itertools.chain(re_urls, tag_urls):

if new_url in seen:
continue

seen.add(new_url)

if new_url.get_host() != self._target_domain:
if new_url.get_host() not in self._other_domains:
self._other_domains.add(new_url.get_host())
continue

if new_url not in self._url_list:

self._url_list.append(new_url)

wreq = self._url_to_req(new_url, response)

if wreq not in self._wRequestList:
new_reqs.append(wreq)

self._wRequestList.append(wreq)

for item in form_reqs:

if item not in self._wRequestList:
new_reqs.append(item)

self._wRequestList.append(item)

return new_reqs

def _url_to_req(self, new_url, response, method="GET"):
        '''
        Build a Request for new_url, inheriting referer and cookies from the response.
        '''
req = Request(new_url)
req.set_method(method)

new_referer = response.get_url()
req.set_referer(new_referer)

new_cookies = response.get_cookies()
req.set_cookies(new_cookies)

return req

def crawl(self, root_url):
'''
        Breadth-first crawl from root_url; (request, depth) pairs are queued and fetched in order.
'''
if not isinstance(root_url, URL):
root_url_obj = URL(root_url)
else:
root_url_obj = root_url

self._target_domain = root_url_obj.get_host()

self._url_list.append(root_url_obj)

root_req = Request(root_url_obj)

q = Queue()

q.put((root_req, 0))

self._start_time = time.time()

while True:

if q.empty():
print "reqs empty break"
break

this_req, depth = q.get()

            # Skip static resources by file extension
if this_req.get_url().get_ext() in self._black_ext:
continue

            # Enforce the crawl depth limit
if depth > self.depth_limit:
print "depth limit break"
break
            # Enforce the crawl time limit (minutes)
if self.get_discovery_time() > self.time_limit:
print "time limit break"
break

            # Cap the number of requests to keep memory usage bounded
if self.num_reqs > self.req_limit:
print "reqs num limit break"
break

if this_req in self._already_send_reqs:
continue

try:
self._already_send_reqs.add(this_req)

om.info("%s Request:%s" % (this_req.get_method(), this_req.get_url().url_string))

response = None

try:
response = wcurl._send_req(this_req)

except Exception, e:
print str(e)
pass

                if response is None:
                    continue

                if is_404(response):
                    continue

                # Extract new requests from the HTTP response
                new_reqs = self._get_reqs_from_resp(response)
                # Filter out similar and near-duplicate requests
                filter_reqs = self._do_with_reqs(new_reqs)

depth = depth + 1
for req in filter_reqs:
q.put((req, depth))

self.num_reqs = len(self._already_send_reqs)
om.info("Already Send Reqs:" + str(self.num_reqs) + " Left Reqs:" + str(q.qsize()))

except Exception, e:
traceback.print_exc()
om.info("ERROR: Can't process url '%s' (%s)" % (this_req.get_url(), e))
continue

time.sleep(self._sleeptime)

return self._do_with_reqs(self._wRequestList)


if __name__ == "__main__":
'''
'''
# url1="http://testphp.vulnweb.com/showimage.php"
# url2="http://testphp.vulnweb.com/showimage.php?id=1"
# print is_similar_url(url1,url2)
# sys.exit()
w = Crawler()
wurl = "http://192.168.1.105:8080/wavsep/active/index-sql.jsp"
a = w.crawl(wurl)
for item in a:
# print "\r\n"
# print item
print item.get_url()

print "Found URL Num:" + str(len(a))
sys.exit()
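
A short driver sketch (not part of the commit) showing the limits the Crawler constructor exposes; the target URL and limit values below are placeholders, and crawler.py is assumed to be importable from the working directory.

# crawl_example.py -- hypothetical driver script
from crawler import Crawler

# depth_limit: link depth from the root; time_limit: minutes; req_limit: max requests sent
c = Crawler(depth_limit=2, time_limit=10, req_limit=200, filter_similar=True)
reqs = c.crawl("http://testphp.vulnweb.com/")  # accepts a URL string or a URL object

for req in reqs:
    print req.get_method(), req.get_url()
print "Requests discovered:" + str(len(reqs))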
Binary file added crawler.pyc
53 changes: 53 additions & 0 deletions env_config.py
@@ -0,0 +1,53 @@
# coding=utf-8
'''
env_config.py
Set up the environment required for scanning.
'''
import os
import platform
import teye_config as Settings

os_info = platform.platform()  # OS name and version, e.g. 'Windows-7-6.1.7601-SP1'
sys_info = platform.system()   # 'Linux', 'Windows' or 'Darwin'
'''
def nmap_search_path():
pass
syslist = {
"win": ["windows"],
"linux": ['ubuntu', 'centos', 'debian'],
"mac": ["darwin"]
}
if os_info.lower() in syslist.get("mac"): # Mac
# Install brew
cmd = 'ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" < /dev/null 2 > /dev/null'
os.system(cmd)
# Install Nmap
cmd = "brew install nmap"
os.system(cmd)
# Install python-nmap
cmd = "pip install python-nmap"
os.system(cmd)
if os_info.lower() in syslist.get("linux"): # Linux
# python-dev,python-pip
os.system("apt-get -y install python-dev")
os.system("apt-get install python-pip")
# lxml
os.system("apt-get -y install libxml2 libxml2-dev")
os.system("apt-get -y install python-libxml2")
os.system("apt-get -y install python-lxml")
# nmap
os.system("apt-get -y install nmap")
# python-paramiko
os.system("apt-get -y install python-paramiko")
if os_info.lower() in syslist.get("win"): # windows
pass
'''
# pip install -r requirements.txt
piprequire = Settings.ROOT_PATH + "/requirements.txt"
os.system("pip install -r " + piprequire)
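
One note on the commented-out dispatch above: the test "os_info.lower() in syslist.get(...)" checks whether the entire platform string is an element of the list, which a value like 'Windows-7-6.1.7601-SP1' never is. A substring-based check, sketched here purely as an illustration and not as part of the commit:

# os_dispatch_sketch.py -- illustrative only
import platform

syslist = {
    "win": ["windows"],
    "linux": ["ubuntu", "centos", "debian"],
    "mac": ["darwin"],
}

os_info = platform.platform().lower()  # e.g. 'windows-7-6.1.7601-sp1'

def match_os(key):
    # True if any known name for this OS family appears in the platform string
    return any(name in os_info for name in syslist[key])

if match_os("mac"):
    print "would run the brew/nmap install steps"
elif match_os("linux"):
    print "would run the apt-get install steps"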
Empty file added misc/__init__.py
Binary file added misc/__init__.pyc