-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsearchEverything.py
91 lines (75 loc) · 3.19 KB
/
searchEverything.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import json, urllib, sys, re, socket, csv, time, argparse, http, search
# CSV files used by the script; relative names, so they are looked up in
# the current working directory when opened.
PLANTS_CSV = "plants.csv"    # input: plant names, first column of each row
ARTISTS_CSV = "artists.csv"  # input: artist names, first column of each row
RESULTS_CSV = "result.csv"   # output: rewritten on every run

# Timestamp taken when the script starts; the per-plant progress report
# measures the total elapsed time against it.
startTime = time.time()
# Get an array of plants: the first column of every non-empty row of PLANTS_CSV
plants = []
try:
    # The with-block closes the file as soon as it has been read, and
    # newline='' is what the csv module expects for file objects it parses.
    with open(PLANTS_CSV, 'r', newline='') as plantsFile:
        # csv.reader yields [] for blank lines; skip those rows instead of
        # crashing with an IndexError on row[0].
        plants = [row[0] for row in csv.reader(plantsFile) if row]
except FileNotFoundError:
    print("The plants file is not found. Quitting...")
    # sys.exit() rather than the interactive-only exit() helper; the non-zero
    # status lets a calling shell or script detect the failure.
    sys.exit(1)
# Get the array of artists: the first column of every non-empty row of ARTISTS_CSV
artists = []
try:
    # The with-block closes the file as soon as it has been read, and
    # newline='' is what the csv module expects for file objects it parses.
    with open(ARTISTS_CSV, 'r', newline='') as artistsFile:
        # csv.reader yields [] for blank lines; skip those rows instead of
        # crashing with an IndexError on row[0].
        artists = [row[0] for row in csv.reader(artistsFile) if row]
except FileNotFoundError:
    print("The artists file is not found. Quitting...")
    # sys.exit() rather than the interactive-only exit() helper; the non-zero
    # status lets a calling shell or script detect the failure.
    sys.exit(1)
# Open the results file (truncating any previous contents) and write the
# header row. The handle is deliberately left open: it is handed to
# search.resultsToCsv for every plant further down.
try:
    results_file = open(RESULTS_CSV, 'w')
    results_file.write("Plants, Artists (Ranking: higher = better), \n")
except OSError as err:
    # Opening in 'w' mode creates the file, so "not found" is rarely the real
    # problem; permission errors, a missing directory or a full disk are.
    # OSError covers all of them (FileNotFoundError included), and the
    # message reports the actual cause.
    print("The results file could not be opened for writing (" + str(err) + "). Quitting...")
    # sys.exit() rather than the interactive-only exit() helper; the non-zero
    # status lets a calling shell or script detect the failure.
    sys.exit(1)
# Main loop: for every plant, query both search engines, download the pages
# they return, count how often each artist name occurs in the combined page
# text, and hand the counts to search.resultsToCsv together with results_file.

# The artist names do not change between plants, so lowercase them only once.
artistsLower = [artist.lower() for artist in artists]

# Keywords appended to every plant name in the search query
queryAdd = " art artist painting"

try:
    for index, plant in enumerate(plants):
        plantStartTime = time.time()  # Start time for execution of script for current plant

        # The search query used: the quoted plant name followed by the keywords
        query = "\"" + plant + "\"" + queryAdd

        # Prints: "1 out of 427 : PLANT_NAME"
        print(index+1, "out of", len(plants), ":", plant)

        # Get a list of urls from both search engines
        url1 = search.getUrls(query, "google", verbose=True)
        url2 = search.getUrls(query, "duckduckgo", verbose=True)
        urls = list(set().union(url1, url2))  # union() removes duplicates

        # Search the urls for occurrences of artist names
        ## Download each url and join all page texts into one string HTMLOfPages
        pages = [search.getPageText(url, verbose=True, timeout=3) for url in urls]
        HTMLOfPages = "".join(pages)
        print("Finished downloading all pages. Searching for artists...")

        # There is an artist named "Erro" and he gets matched for every single
        # "error" in the site text. Since no artist has "error" in their name
        # we can safely get rid of "error" strings without messing up the results.
        # The text is lowercased first, so a plain replace is enough here.
        HTMLOfPages = HTMLOfPages.lower().replace("error", "")

        ## For each artist, count the number of occurrences that artist has in
        ## the text and store it in an array (counter) at the index
        ## corresponding to the index of that artist
        counter = [HTMLOfPages.count(name) for name in artistsLower]

        #search.printOccurrences(counter, artists)
        search.resultsToCsv(counter, artists, query, plant, results_file)
        # Push this plant's rows to disk right away so an interrupted run
        # keeps everything finished so far.
        results_file.flush()

        # Figure out how long the program's been running and print it out
        plantFinishTime = time.time()
        plantElapsedTime = (plantFinishTime - plantStartTime)
        totalElapsedTime = (plantFinishTime - startTime)
        print("Successfully added " + plant + " to results file in " + search.formatSeconds(plantElapsedTime) + ".")
        print("Total elapsed time: " + search.formatSeconds(totalElapsedTime))
        print("Average time per plant: " + search.formatSeconds(totalElapsedTime/(index+1)))
        print("------------------------")
        print("")
finally:
    # Close the results file even when the run is aborted by an error or Ctrl-C.
    results_file.close()