|
# -*- coding: utf-8 -*-
"""Save the detail pages linked from association list pages on service.moj.gov.tw.

For every selected list page the script opens it in Chrome, pulls the entry
names out of the page text, follows the link whose text contains each name,
and writes the resulting page source to ``<label>_<name>.txt`` in the working
directory.
"""
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import os
import time

# change to test folder
os.chdir('/Users/easonchan/test')

LIST_URL = 'http://service.moj.gov.tw/lawer/associList.asp?associName='

# (output-file label, percent-encoded associName value) for each list page.
# Keeping the two side by side guarantees a page is never saved under another
# association's label.
ASSOCIATIONS = [
    ('taichung', '%A5x%A4%A4%AB%DF%AEv%A4%BD%B7|'),
    ('taipei', '%A5x%A5_%AB%DF%AEv%A4%BD%B7|'),
    ('taidung', '%A5x%AAF%AB%DF%AEv%A4%BD%B7|'),
    ('tainan', '%A5x%ABn%AB%DF%AEv%A4%BD%B7|'),
    ('ilan', '%A9y%C4%F5%AB%DF%AEv%A4%BD%B7|'),
    ('hualien', '%AA%E1%BD%AC%AB%DF%AEv%A4%BD%B7|'),
    ('nantao', '%ABn%A7%EB%AB%DF%AEv%A4%BD%B7|'),
    ('pintung', '%AB%CC%AAF%AB%DF%AEv%A4%BD%B7|'),
    ('miaoli', '%AD]%AE%DF%AB%DF%AEv%A4%BD%B7|'),
    ('taoyuan', '%AE%E7%B6%E9%AB%DF%AEv%A4%BD%B7|'),
    ('kaohsiung', '%B0%AA%B6%AF%AB%DF%AEv%A4%BD%B7|'),
    ('keelung', '%B0%F2%B6%A9%AB%DF%AEv%A4%BD%B7|'),
    ('yunlin', '%B6%B3%AAL%AB%DF%AEv%A4%BD%B7|'),
    ('hsinchu', '%B7s%A6%CB%AB%DF%AEv%A4%BD%B7|'),
    ('chiayi', '%B9%C5%B8q%AB%DF%AEv%A4%BD%B7|'),
    ('chunghwa', '%B9%FC%A4%C6%AB%DF%AEv%A4%BD%B7|'),
]

# Index-aligned views of the table, kept under their original names.
website = [LIST_URL + code for _, code in ASSOCIATIONS]
websitename = [label for label, _ in ASSOCIATIONS]

# Only the last three list pages are crawled in this run.
# NOTE(review): the previous version crawled website[-3:] but labelled the
# files starting from websitename[10]; this assumes the crawled pages were the
# intended ones and the label offset was stale -- confirm.
TARGETS = ASSOCIATIONS[-3:]

# Fixed amounts of text cut from the flattened page text before splitting it
# into names. Presumably page header/footer boilerplate -- verify against the
# live page if the site layout changes.
HEADER_CHARS = 292
FOOTER_CHARS = 15


def _extract_names(page_text):
    """Return the comma-separated fragments of the list page's visible text.

    Newlines become commas, runs of two or three commas are collapsed once,
    and the fixed-size header and footer are trimmed before splitting. The
    result may still contain empty fragments.
    """
    flat = page_text.replace('\n', ',')
    flat = flat.replace(',,,', ',')
    flat = flat.replace(',,', ',')
    flat = flat[:-FOOTER_CHARS]
    flat = flat[HEADER_CHARS:]
    return flat.split(',')


def _save_member_page(driver, label, name):
    """Follow the link containing *name* and save the page it leads to.

    The output file ``<label>_<name>.txt`` is always created; it is left
    empty when no matching link is found. After saving, the browser goes back
    so the next lookup runs against the list page again.
    """
    path = label + '_' + name + '.txt'
    # `with` closes the file even if the click or navigation raises.
    with open(path, 'wb') as out:
        try:
            entry = driver.find_element_by_partial_link_text(name)
        except Exception:
            # Best effort: report and move on. Unlike a bare `except`, this
            # still lets Ctrl-C (KeyboardInterrupt) stop a long crawl.
            print('Failed at:' + name)
            return
        entry.click()
        # Wait before capturing so the detail page has time to settle.
        time.sleep(1)
        # presumably page_source is unicode text (Python 2) -- encode once at
        # the file boundary.
        out.write(driver.page_source.encode('utf8'))
        driver.back()
        time.sleep(1)


driver = webdriver.Chrome()
try:
    for label, code in TARGETS:
        driver.get(LIST_URL + code)
        soup = bs(driver.page_source)
        for name in _extract_names(soup.getText()):
            # An empty link text cannot identify one entry and would produce
            # a file called "<label>_.txt"; skip such fragments.
            if not name.strip():
                continue
            _save_member_page(driver, label, name)

        print(label + 'finished')
finally:
    # Close the browser even when the crawl is interrupted or fails.
    driver.quit()
0 commit comments