-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathhu_utils.py
243 lines (220 loc) · 8.15 KB
/
hu_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:吉祥鸟
# datetime:2018/10/22 14:41
# software: PyCharm
import time
import MySQLdb
import requests
import random
import sys
reload(sys)
sys.setdefaultencoding('utf8')
"""
个人常用工具库(此库不是所有的代码都会使用)
注意:此工具库适用于python2,与python3不兼容
请求模块:get_url_html()
连接本地库(默认spider):open_local_db()
连接线上库(默认lz_datastore):open_line_db()
数据库查询信息:select_one()(需修改)
单条信息插入或更新:insert_update_one()
多条信息插入或更新insert_update_many()
单条信息更新:update_one()
获取格式化时间:now_time()
"""
def get_url_html(url, ip=False):
    """
    Fetch a URL with an HTTP GET request, retrying on failure.

    Up to 9 attempts are made, each with a 10 second timeout. One User-Agent
    is picked at random per call and reused for every attempt of that call.
    An attempt counts as failed when ``requests`` raises a
    ``RequestException`` or the response status code is not 200.

    :param url: address to request
    :param ip: when truthy, send the request through the hard-coded proxy;
        defaults to False (direct connection)
    :return: ``response.text`` of the first attempt answered with status 200,
        or None when all attempts fail
    """
    user_agents = (
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    )
    headers = {
        'User-Agent': random.choice(user_agents)
    }

    # proxies=None is what requests.get() uses by default, i.e. no proxy.
    proxies = None
    if ip:
        proxy = "***********"
        proxies = {
            "http": "http://" + proxy,
            "https": "https://" + proxy,
        }

    for attempt in range(1, 10):
        # Written without a newline so the outcome printed below lands on
        # the same output line as the attempt counter.
        sys.stdout.write("%s %s " % (now_time(), "第%s次请求" % attempt))
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException:
            # Only network/HTTP level failures are retried; KeyboardInterrupt
            # and programming errors are allowed to propagate.
            print("请求出错,再次尝试请求......")
            continue
        if response.status_code == 200:
            print('本次请求成功')
            return response.text
        print("请求出错,再次尝试请求......")

    print("%s %s" % (now_time(), "请求超过最大次数,跳过"))
    return None
def open_local_db(db="spider"):
    """
    Open a connection to the local MySQL server.

    Prints a timestamped notice, then connects to 192.168.1.4:3306 with the
    credentials hard-coded below and the ``gbk`` connection charset.

    :param db: name of the database to select (defaults to "spider")
    :return: the connection object returned by ``MySQLdb.connect``
    """
    print("%s %s" % (now_time(), '连接本地数据库%s' % db))
    settings = {
        'host': '192.168.1.4',
        'user': 'root',
        'passwd': '666',
        'db': db,
        'port': 3306,
        'charset': 'gbk',
    }
    return MySQLdb.connect(**settings)
def open_line_db(db="lz_datastore"):
    """
    Open a connection to the online (production) MySQL server.

    Prints a timestamped notice, then connects on port 3306 with the
    host/user/password configured below and the ``gbk`` connection charset.

    :param db: name of the database to select (defaults to "lz_datastore")
    :return: the connection object returned by ``MySQLdb.connect``
    """
    print("%s %s" % (now_time(), '连接线上数据库%s' % db))
    settings = {
        'host': '***',
        'user': '***',
        'passwd': '***',
        'db': db,
        'port': 3306,
        'charset': 'gbk',
    }
    return MySQLdb.connect(**settings)
def select_one(conn):
    """
    Run the fixed sample query and return its rows.

    Executes ``select etid,etname from et_info limit 3`` on *conn*. The
    statement is hard-coded; edit it here to query something else.

    :param conn: open database connection (as returned by open_local_db /
        open_line_db)
    :return: the value of ``cursor.fetchall()`` on success; None when the
        query fails, in which case the connection has been rolled back and
        closed
    """
    cursor = conn.cursor()
    sql = "select etid,etname from et_info limit 3"
    try:
        cursor.execute(sql)
        rows = cursor.fetchall()
        conn.commit()
    except MySQLdb.Error as exc:
        # Use the same formatted timestamp as the rest of the module
        # (previously a raw time.time() float) and report the real cause.
        print("%s %s %s" % (now_time(), '查询数据失败,回滚', exc))
        conn.rollback()
        conn.close()
        return None
    return rows
def insert_update_one(conn, item, table_name):
    """
    Insert one row, or update it when its key already exists.

    Builds ``insert into <table>(<cols>) values(<placeholders>)on duplicate
    key update <col>=values(<col>),...`` from the keys of *item* and executes
    it with the values of *item* as query parameters. The item must include
    the table's key column(s) for the "on duplicate key" part to apply.

    The connection is always closed before this function returns or raises,
    so it cannot be reused by the caller afterwards.

    :param conn: open database connection
    :param item: dict mapping column name -> value for the row
    :param table_name: name of the target table (interpolated into the SQL
        as-is, so it must come from trusted code)
    :return: None
    :raises ValueError: if *item* is empty
    """
    print("%s %s" % (now_time(), "更新数据....."))
    try:
        if not item:
            raise ValueError("item must contain at least one column")

        columns = list(item.keys())
        item_values = [item[column] for column in columns]
        # NOTE(review): legacy behaviour kept as-is - the value in the second
        # position is re-encoded to gbk. Which column that is depends on the
        # dict's iteration order; confirm the intended column with callers.
        if len(item_values) > 1:
            item_values[1] = str(item_values[1]).encode("gbk")

        sql = "insert into %s(%s) values(%s)on duplicate key update %s" % (
            table_name,
            ",".join(columns),
            ",".join(["%s"] * len(columns)),
            ",".join("%s=values(%s)" % (column, column) for column in columns),
        )
        cursor = conn.cursor()
        cursor.execute(sql, item_values)
        conn.commit()
    finally:
        # Close on failure as well as on success so a failed statement does
        # not leak the connection; closing without commit discards the
        # pending transaction.
        conn.close()
    print("%s %s" % (now_time(), "数据更新成功"))
def insert_update_many(conn, items, table_name, batch_size=1000):
    """
    Insert many rows, updating those whose key already exists.

    The column list is taken from the keys of ``items[0]``; every row's
    values are then read in exactly that column order, so all dicts must
    contain the same keys. Rows are sent with ``executemany`` in batches and
    each batch is committed on its own.

    On a database error the current transaction is rolled back and the
    connection is closed; batches committed before the error stay committed.
    On success the connection is left open.

    :param conn: open database connection
    :param items: list of dicts, each mapping column name -> value
    :param table_name: name of the target table (interpolated into the SQL
        as-is, so it must come from trusted code)
    :param batch_size: number of rows per ``executemany`` call / commit
        (defaults to 1000)
    :return: None
    :raises KeyError: if a dict lacks one of the columns of ``items[0]``
    """
    if not items:
        print("%s %s" % (now_time(), '一共需要处理数据%s条' % 0))
        return

    columns = list(items[0].keys())
    sql = "insert into %s(%s) values(%s)on duplicate key update %s" % (
        table_name,
        ",".join(columns),
        ",".join(["%s"] * len(columns)),
        ",".join("%s=values(%s)" % (column, column) for column in columns),
    )
    # Index by column name instead of relying on each dict's own value order,
    # which is not guaranteed to match the order of items[0].
    rows = [[item[column] for column in columns] for item in items]
    total = len(rows)
    print("%s %s" % (now_time(), '一共需要处理数据%s条' % total))
    print(sql)

    cursor = conn.cursor()
    try:
        for start in range(0, total, batch_size):
            end = min(total, start + batch_size)
            cursor.executemany(sql, rows[start:end])
            conn.commit()
            print("%s %s" % (now_time(), "当前已经处理%s条数据" % end))
    except MySQLdb.Error as exc:
        print("%s %s %s" % (now_time(), '更新数据失败,回滚', exc))
        conn.rollback()
        conn.close()
def update_one(db, item, table_name):
    """
    Update the single row identified by ``item['etid']``.

    Every key of *item* other than ``etid`` becomes a ``column = value``
    assignment; ``etid`` is used for the ``where`` clause. Values are passed
    to the driver as query parameters rather than being pasted into the SQL
    text, so quotes inside values are handled by the driver.

    On a database error the transaction is rolled back and the connection is
    closed. On success the change is committed and the connection is left
    open.

    :param db: open database connection
    :param item: dict of column name -> new value; must contain ``etid`` and
        at least one other column
    :param table_name: name of the target table (interpolated into the SQL
        as-is, so it must come from trusted code)
    :return: None
    :raises ValueError: if *item* has no ``etid`` key or nothing to update
    """
    if 'etid' not in item:
        raise ValueError("item must contain the 'etid' key for the where clause")
    columns = [key for key in item if key != 'etid']
    if not columns:
        raise ValueError("item must contain at least one column besides 'etid'")

    assignments = ",".join("%s = %%s" % column for column in columns)
    sql = "update %s set %s where etid = %%s" % (table_name, assignments)
    params = [item[column] for column in columns]
    params.append(item['etid'])

    cursor = db.cursor()
    try:
        cursor.execute(sql, params)
        # commit() lives on the connection; cursors do not provide it.
        db.commit()
    except MySQLdb.Error as exc:
        print("%s %s %s" % (now_time(), '更新数据失败,回滚', exc))
        db.rollback()
        db.close()
def now_time():
    """
    Return the current local time as a formatted string.

    :return: timestamp in the form ``YYYY-MM-DD HH:MM:SS``
    """
    whole_seconds = int(time.time())
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(whole_seconds))