Search_Engine_Project/civilinfoRetrieval.py at main · Skyhorizon2021/Search_Engine_Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Open the local MongoDB server and select the collection to be indexed.
# Attribute access on a client/database is equivalent to the subscript form.
client = MongoClient("mongodb://localhost:27017/")
db = client.civilCrawler
collection = db.civilResearch

# Normalise raw text into a space-separated string of stemmed terms
def preprocess_text(text):
    """Tokenize, filter and stem *text*, returning the stems as one string.

    Steps, in order:
      1. Split the text with NLTK's ``word_tokenize``.
      2. Keep only purely alphabetic tokens and lower-case them.
      3. Drop tokens found in NLTK's English stop-word list.
      4. Reduce each remaining token with the Porter stemmer.

    The stems are joined with single spaces so the result can be passed
    straight to a text vectorizer.
    """
    raw_tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for token in raw_tokens:
        # Numbers, punctuation and mixed tokens are discarded.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in english_stops:
            continue
        stems.append(porter.stem(lowered))

    return ' '.join(stems)

# Prepare documents
# Fields that are skipped when building the searchable text of a record.
_EXCLUDED_FIELDS = ('_id', 'url', 'email', 'phone', 'office_location')


def _collect_text(doc):
    """Return all indexable strings of *doc* joined by single spaces.

    Top-level string fields are taken as-is.  For list fields, string items
    are taken directly and, for dictionary items, every string value is
    taken.  Values of any other type are ignored, as are the fields listed
    in ``_EXCLUDED_FIELDS``.

    Every fragment is separated from its neighbours by a space so that the
    last word of one fragment never fuses with the first word of the next.
    """
    fragments = []
    for key, value in doc.items():
        if key in _EXCLUDED_FIELDS:
            continue
        if isinstance(value, str):
            fragments.append(value)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, str):
                    fragments.append(item)
                elif isinstance(item, dict):
                    fragments.extend(v for v in item.values() if isinstance(v, str))
    return " ".join(fragments)


# Parallel lists: index i of each list refers to the same MongoDB record.
documents = []   # preprocessed text of each record
doc_ids = []     # stringified MongoDB _id of each record
professors = []  # (name, url) pair of each record, used when printing results

for doc in collection.find():
    doc_ids.append(str(doc['_id']))

    # Fall back to placeholders when a record lacks a name or homepage URL.
    professors.append((doc.get('name', 'Unknown'), doc.get('url', 'No URL provided')))

    documents.append(preprocess_text(_collect_text(doc)))


# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer()

# Generate the TF-IDF matrix (one row per entry of `documents`)
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert the TF-IDF matrix to a dense format
# (not used by the ranking below, which works on the sparse matrix)
dense_tfidf_matrix = tfidf_matrix.todense()

# Get feature names to match indices.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older releases.  The result is
# kept as a list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Add a query and transform it with the same preprocessing and vocabulary
# that were applied to the documents
query = "Understanding Public Sentiment toward I-710 Corridor Project from Social Media Based on Natural Language Processing"
processed_query = preprocess_text(query)
query_vector = vectorizer.transform([processed_query])

# Compute cosine similarity between the query and every document
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

# Rank the documents: (document index, score) pairs, highest score first
document_ranking = sorted(enumerate(cosine_similarities), key=lambda x: x[1], reverse=True)

# Display the ranking; doc_idx indexes `professors`, which is filled in the
# same order as `documents`
for rank, (doc_idx, score) in enumerate(document_ranking):
    name, url = professors[doc_idx]
    print(f"Rank {rank + 1}: Document {doc_idx + 1} (Score: {score}) - {name}, Homepage: {url}")