package finalproject;

import java.util.ArrayList;
import java.util.HashMap;

public class SearchEngine {
    // maps every word to the ArrayList of urls whose page contains it
    public HashMap<String, ArrayList<String>> wordIndex;
    public MyWebGraph internet;
    public XmlParser parser;

    // constructor
    public SearchEngine(String filename) throws Exception {
        this.wordIndex = new HashMap<String, ArrayList<String>>();
        this.internet = new MyWebGraph();
        this.parser = new XmlParser(filename);
    }

    /*
     * crawlAndIndex does a depth-first traversal of the web, starting at the
     * given url. For each new page seen, it updates the wordIndex, the web
     * graph, and the set of visited vertices.
     */
    public void crawlAndIndex(String url) throws Exception {
        // add this url as a vertex and mark it visited
        internet.addVertex(url);
        internet.setVisited(url, true);
        // index every word appearing in the content of this url
        ArrayList<String> mots = parser.getContent(url);
        for (String mot : mots) {
            ArrayList<String> urlList = wordIndex.get(mot);
            if (urlList == null) {
                urlList = new ArrayList<String>();
                wordIndex.put(mot, urlList);
            }
            if (!urlList.contains(url)) {
                urlList.add(url);
            }
        }
        // now analyze all the sites this page links to
        for (String link : parser.getLinks(url)) {
            if (!internet.getVisited(link)) {
                internet.addVertex(link);
                // the recursive call marks the link visited and indexes its
                // words, so neither needs to be repeated here
                crawlAndIndex(link);
            }
            internet.addEdge(url, link);
        }
    }
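
    /*
     * For example, on a hypothetical three-page web where A links to B and C
     * and B links back to A, crawlAndIndex("A") visits and indexes A, recurses
     * into B (B's link back to A is not followed, since A is already visited),
     * then recurses into C, adding the edges A->B, B->A, and A->C along the
     * way.
     */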

    /*
     * assignPageRanks computes a page rank for every vertex in the web graph.
     * It is only called after the graph has been constructed using
     * crawlAndIndex(). The implementation follows the iterative algorithm
     * described in the assignment: all ranks start at 1 and are recomputed
     * until no rank changes by epsilon or more between iterations.
     */
    public void assignPageRanks(double epsilon) {
        ArrayList<String> vertices = internet.getVertices();
        // start by setting all page ranks to 1
        for (String url : vertices) {
            internet.setPageRank(url, 1);
        }
        // keep recomputing the ranks until every one of them has converged
        boolean converged = false;
        while (!converged) {
            // remember the current ranks before recomputing them
            ArrayList<Double> prevRanks = new ArrayList<Double>();
            for (String url : vertices) {
                prevRanks.add(internet.getPageRank(url));
            }
            ArrayList<Double> newRanks = computeRanks(vertices);
            // converged only once every rank has moved by less than epsilon
            converged = true;
            for (int i = 0; i < vertices.size(); i++) {
                if (Math.abs(prevRanks.get(i) - newRanks.get(i)) >= epsilon) {
                    converged = false;
                    break;
                }
            }
        }
    }

    /*
     * computeRanks takes an ArrayList<String> of the urls in the web graph
     * and returns an ArrayList<Double> of the newly computed ranks for those
     * urls. The rank at position i of the output list corresponds to the url
     * at position i of the input list.
     */
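    // The update rule implemented below is the damped page rank recurrence,
    // with the damping factor d = 0.5 used in this assignment:
    //     PR(v) = (1 - d) + d * sum over all pages u linking into v of
    //                           PR(u) / outDegree(u)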
    public ArrayList<Double> computeRanks(ArrayList<String> vertices) {
        double dampingFactor = 0.5;
        ArrayList<Double> ranks = new ArrayList<Double>();
        // make sure every rank starts at 1 rather than the default 0
        for (String url : vertices) {
            if (internet.getPageRank(url) == 0) {
                internet.setPageRank(url, 1);
            }
        }
        for (String url : vertices) {
            // sum rank / out-degree over every page linking into this one
            double pr = 0;
            for (String linkedPage : internet.getEdgesInto(url)) {
                pr += internet.getPageRank(linkedPage) / internet.getOutDegree(linkedPage);
            }
            // finish the calculation by applying the damping factor
            pr = (1 - dampingFactor) + (dampingFactor * pr);
            // record this page's new rank and append it to the output list
            internet.setPageRank(url, pr);
            ranks.add(pr);
        }
        return ranks;
    }

    /*
     * getResults returns a list of the urls containing the query, ordered by
     * descending page rank. It returns an empty list if no web site contains
     * the query.
     */
    public ArrayList<String> getResults(String query) {
        ArrayList<String> urls = new ArrayList<String>();
        // only search if the word exists somewhere in the index
        if (wordIndex.containsKey(query)) {
            // insert each url at the position that keeps the ranks descending
            for (String url : wordIndex.get(query)) {
                int index = 0;
                while (index < urls.size()
                        && internet.getPageRank(url) <= internet.getPageRank(urls.get(index))) {
                    index += 1;
                }
                urls.add(index, url);
            }
        }
        return urls;
    }
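
    /*
     * A minimal usage sketch, not part of the assignment API. The file name,
     * start url, and query below are illustrative placeholders: substitute an
     * XML web description that XmlParser accepts and a url that appears in it.
     */
    public static void main(String[] args) throws Exception {
        SearchEngine engine = new SearchEngine("sampleWeb.xml");  // placeholder input file
        engine.crawlAndIndex("www.example.com/index.html");       // placeholder start url
        engine.assignPageRanks(0.0001);                           // iterate until ranks settle
        for (String url : engine.getResults("java")) {            // "java" is an example query
            System.out.println(url + "  rank=" + engine.internet.getPageRank(url));
        }
    }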
}