-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathData Analysis (1).py
More file actions
177 lines (130 loc) · 5.19 KB
/
Data Analysis (1).py
File metadata and controls
177 lines (130 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python
# coding: utf-8
# # Data Extraction
# In[9]:
import requests
from bs4 import BeautifulSoup
import openpyxl
import os
# Load the workbook that lists the articles to download and take its active
# sheet, which is read row by row further down.
input_file = r"C:\Users\Nidhi Gangwar\Downloads\Input.xlsx"
wb = openpyxl.load_workbook(input_file)
sheet = wb.active

# Create the directory the article text files are written to.
# exist_ok=True replaces the exists()-then-makedirs() check, which could fail
# if the directory appeared between the check and the call.
output_dir = 'articles'
os.makedirs(output_dir, exist_ok=True)
# Function to extract article text from a URL
def extract_article(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``html.parser``. The title is the text of the first ``<h1>`` element and
    the body is the text of every ``<p>`` element, one paragraph per line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The title, a blank line, then the paragraph text, as one string.
        ``None`` when the page could not be fetched, the server answered with
        an HTTP error status, or the page has no ``<h1>`` element. Failures
        are reported with ``print`` rather than raised, so one bad URL does
        not stop a batch run.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of saving the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the article title; report a missing heading explicitly
        # instead of through an AttributeError on None.
        heading = soup.find('h1')
        if heading is None:
            print(f"Error extracting article from {url}: no <h1> title found")
            return None
        title = heading.get_text(strip=True)

        # Extract the article text
        paragraphs = soup.find_all('p')
        article_text = '\n'.join(para.get_text(strip=True) for para in paragraphs)
        return title + '\n\n' + article_text
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip it.
        print(f"Error extracting article from {url}: {e}")
        return None
# Iterate through each row in the Excel sheet, starting at row 2.
for row in sheet.iter_rows(min_row=2, values_only=True):
    # Only the first two columns are used; slicing keeps additional columns
    # in the sheet from breaking the unpacking with a ValueError.
    url_id, url = row[:2]
    # Rows without a URL (e.g. blank trailing rows) have nothing to download.
    if not url:
        continue
    # Extract the article text
    article_content = extract_article(url)
    if article_content:
        # Save the extracted text in a .txt file with URL_ID as the filename
        file_path = os.path.join(output_dir, f'{url_id}.txt')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(article_content)
print("Extraction completed!")
# # Data Analysis
# In[10]:
# Install the third-party packages used by the analysis below. The notebook
# export called get_ipython(), which only exists inside IPython and raises
# NameError when this file is run as a plain script. Invoking pip through the
# current interpreter works in both cases and targets the interpreter that is
# actually running this code.
import subprocess
import sys

for _package in ('nltk', 'textblob', 'pandas'):
    # check=False: as with the original shell call, a failed install does not
    # stop the script at this point.
    subprocess.run([sys.executable, '-m', 'pip', 'install', _package], check=False)
# In[21]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from textblob import TextBlob
import re
import nltk
# Download the NLTK resources named below, in this order, before the
# tokenizers, stop-word corpus and POS tagger imported above are used.
for _resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
# Locations of the input workbook, the output workbook template and the
# folder holding the extracted article text files.
input_file = r"C:\Users\Nidhi Gangwar\Downloads\Input.xlsx"
output_structure_file = r"C:\Users\Nidhi Gangwar\Downloads\Output Data Structure.xlsx"
output_dir = 'articles'

# Read both sentiment word lists; every whitespace-separated entry becomes a
# member of the corresponding set.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm the encoding of the two word-list files.
with open(r"C:\Users\Nidhi Gangwar\Downloads\positive-words.txt", "r") as positive_handle:
    positive_entries = positive_handle.read().split()
positive_words = set(positive_entries)

with open(r"C:\Users\Nidhi Gangwar\Downloads\negative-words.txt", "r") as negative_handle:
    negative_entries = negative_handle.read().split()
negative_words = set(negative_entries)
def syllable_count(word):
    """Estimate the number of syllables in *word* with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``, case-insensitive) counts
    as one syllable, a trailing ``e`` removes one, and every non-empty token
    (including one without vowels) is reported as at least one syllable.

    Args:
        word: The token to measure.

    Returns:
        The estimated syllable count; ``0`` for an empty string.
    """
    word = word.lower()
    if not word:
        # Indexing word[0] on an empty string would raise IndexError.
        return 0
    vowels = "aeiouy"
    # A syllable starts wherever a vowel is not preceded by another vowel.
    count = sum(
        1
        for index, char in enumerate(word)
        if char in vowels and (index == 0 or word[index - 1] not in vowels)
    )
    if word.endswith("e"):
        count -= 1
    # Never report zero syllables for a non-empty token.
    if count == 0:
        count += 1
    return count
def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    The text is split with NLTK's ``word_tokenize`` and ``sent_tokenize``;
    every token returned by ``word_tokenize`` is counted as a word.

    Args:
        text: The article text to analyse.

    Returns:
        A dict keyed by output column name:

        * ``POSITIVE SCORE`` / ``NEGATIVE SCORE``: number of tokens found in
          ``positive_words`` / ``negative_words``.
        * ``POLARITY SCORE``: (positive - negative) / (positive + negative
          + 0.000001).
        * ``SUBJECTIVITY SCORE``: TextBlob's ``sentiment.subjectivity`` for
          the whole text.
        * ``AVG SENTENCE LENGTH`` and ``AVG NUMBER OF WORDS PER SENTENCE``:
          tokens per sentence (the same value).
        * ``PERCENTAGE OF COMPLEX WORDS``: complex tokens / tokens, as a
          fraction between 0 and 1.
        * ``FOG INDEX``: 0.4 * (avg sentence length + fraction * 100).
        * ``COMPLEX WORD COUNT``: tokens with more than two syllables
          according to ``syllable_count``.
        * ``WORD COUNT``: number of tokens.
        * ``SYLLABLE PER WORD``: mean ``syllable_count`` per token.
        * ``PERSONAL PRONOUNS``: tokens tagged PRP, PRP$, WP or WP$.
        * ``AVG WORD LENGTH``: mean token length in characters.

        Ratios whose denominator would be zero (text with no tokens or no
        sentences) are reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    word_count = len(words)
    sentence_count = len(sentences)

    # Lexicon hits. A token is also tried in lower case so that capitalised
    # words (e.g. at the start of a sentence) match lower-case entries.
    positive_score = sum(
        1 for word in words
        if word in positive_words or word.lower() in positive_words
    )
    negative_score = sum(
        1 for word in words
        if word in negative_words or word.lower() in negative_words
    )
    # The small constant keeps the denominator non-zero when there are no hits.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = TextBlob(text).sentiment.subjectivity

    # Count syllables once per token and reuse the result for both metrics.
    syllables = [syllable_count(word) for word in words]
    complex_words = sum(1 for syllable_total in syllables if syllable_total > 2)

    percentage_complex_words = complex_words / word_count if word_count else 0.0
    avg_sentence_length = word_count / sentence_count if sentence_count else 0.0
    avg_words_per_sentence = avg_sentence_length
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words * 100)
    avg_syllables_per_word = sum(syllables) / word_count if word_count else 0.0
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0.0

    pronoun_tags = ('PRP', 'PRP$', 'WP', 'WP$')
    personal_pronouns = sum(1 for _, tag in pos_tag(words) if tag in pronoun_tags)

    return {
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_words,
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": avg_syllables_per_word,
        "PERSONAL PRONOUNS": personal_pronouns,
        "AVG WORD LENGTH": avg_word_length,
    }
# Read the URL_ID list and the workbook whose columns receive the metrics.
input_data = pd.read_excel(input_file)
output_data = pd.read_excel(output_structure_file)

for row_index, record in input_data.iterrows():
    url_id = record['URL_ID']
    article_path = os.path.join(output_dir, f'{url_id}.txt')

    # Articles that were never saved are reported and skipped.
    if not os.path.exists(article_path):
        print(f"File not found: {article_path}")
        continue

    print(f"Processing file: {article_path}")
    with open(article_path, 'r', encoding='utf-8') as article_file:
        article_text = article_file.read()

    metrics = analyze_text(article_text)
    print(f"Results for {url_id}: {metrics}")

    # Write each metric into the column of the same name, on the row that has
    # the same index label as the input row.
    for column, metric_value in metrics.items():
        output_data.at[row_index, column] = metric_value

# Save the filled-in sheet (without the DataFrame index) to the output path.
output_file_path = r"C:\Users\Nidhi Gangwar\Downloads\Output Data Structure.xlsx"
output_data.to_excel(output_file_path, index=False)
print(f"Textual analysis completed and results saved to {output_file_path}!")
# In[ ]: