
Commit b97e0a6

fixing except block directly handling BaseException
1 parent 4ace637 commit b97e0a6
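
The substance of the fix is in the Times of India loop, where a bare except: was narrowed to except Exception:. A bare except catches everything derived from BaseException, including KeyboardInterrupt and SystemExit, so it can silently swallow an attempt to stop the scraper. A minimal sketch of the behavioural difference (illustrative only; flaky_download is a hypothetical stand-in for article.download(), not code from this repository):

def flaky_download():
    # hypothetical stand-in that always fails with an ordinary error
    raise ValueError("network error")

# Before: a bare except is equivalent to `except BaseException:`,
# so KeyboardInterrupt (Ctrl+C) and SystemExit are trapped as well.
try:
    flaky_download()
except:
    pass

# After: limiting the handler to Exception still absorbs the ValueError,
# but lets KeyboardInterrupt and SystemExit propagate as intended.
try:
    flaky_download()
except Exception:
    pass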

File tree

1 file changed: +132 -111 lines changed


news_articles__scraper.py

+132 -111
@@ -11,61 +11,64 @@
 
 # ! pip install newspaper3k
 
-# importing necessary libraries
-from bs4 import BeautifulSoup
-import requests
-import urllib
-import pandas as pd
-from newspaper import Article
 import pickle
 import re
 import sys
+import urllib
+
+import pandas as pd
+import requests
+
+# importing necessary libraries
+from bs4 import BeautifulSoup
+from newspaper import Article
 
 # Extracting links for all the pages (1 to 158) of boomlive fake news section
 fakearticle_links = []
 for i in range(1, 159):
-    url = 'https://www.boomlive.in/fake-news/' + str(i)
-    try:
-        # this might throw an exception if something goes wrong.
-        page=requests.get(url)
-
-        # send requests
-        page = requests.get(url)
-        soup = BeautifulSoup(page.text, 'html.parser')
-
-        # Collecting all the links in a list
-        for content in soup.find_all('h2', attrs={'class':'entry-title'}):
-            link = content.find('a')
-            fakearticle_links.append(link.get('href'))
-
-    # this describes what to do if an exception is thrown
-    except Exception as e:
-        # get the exception information
-        error_type, error_obj, error_info = sys.exc_info()
-        #print the link that cause the problem
-        print ('ERROR FOR LINK:',url)
-        #print error info and line that threw the exception
-        print (error_type, 'Line:', error_info.tb_lineno)
-        continue
+    url = "https://www.boomlive.in/fake-news/" + str(i)
+    try:
+        # this might throw an exception if something goes wrong.
+        page = requests.get(url)
+
+        # send requests
+        page = requests.get(url)
+        soup = BeautifulSoup(page.text, "html.parser")
+
+        # Collecting all the links in a list
+        for content in soup.find_all("h2", attrs={"class": "entry-title"}):
+            link = content.find("a")
+            fakearticle_links.append(link.get("href"))
+
+    # this describes what to do if an exception is thrown
+    except Exception as e:
+        # get the exception information
+        error_type, error_obj, error_info = sys.exc_info()
+        # print the link that cause the problem
+        print("ERROR FOR LINK:", url)
+        # print error info and line that threw the exception
+        print(error_type, "Line:", error_info.tb_lineno)
+        continue
 
 fakearticle_links[:5]
 
 len(fakearticle_links)
 
 fakearticle_links[1888:]
 
+import matplotlib.pyplot as plt
 import pandas as pd
+
 import numpy as np
-import matplotlib.pyplot as plt
 
 """We have to modify the links so that the links actually work as we can see that the string extracted is the last part of the url!
 
 **We have to add 'https://www.boomlive.in/fake-news' to the extracted links.**
 """
 
 # Modify the links so that it takes us to the particular website
-str1 = 'https://www.boomlive.in/fake-news'
-fakearticle_links = [str1+lnk for lnk in fakearticle_links]
+str1 = "https://www.boomlive.in/fake-news"
+fakearticle_links = [str1 + lnk for lnk in fakearticle_links]
 
 fakearticle_links[6:10]
 
@@ -75,30 +78,37 @@
 """
 
 # Create a dataset for storing the news articles
-news_dataset = pd.DataFrame(fakearticle_links, columns=['URL'])
+news_dataset = pd.DataFrame(fakearticle_links, columns=["URL"])
 
 news_dataset.head()
 
-title, text, summary, keywords, published_on, author = [], [], [], [], [], [] # Creating empty lists to store the data
+title, text, summary, keywords, published_on, author = (
+    [],
+    [],
+    [],
+    [],
+    [],
+    [],
+) # Creating empty lists to store the data
 for Url in fakearticle_links:
-    article = Article(Url)
-
-    #Call the download and parse methods to download information
-    try:
-        article.download()
-        article.parse()
-        article.nlp()
-    except Exception as error:
-        print(f"exception : {error}")
-        pass
-
-    # Scrape the contents of article
-    title.append(article.title) # extracts the title of the article
-    text.append(article.text) # extracts the whole text of article
-    summary.append(article.summary) # gives us a summary abou the article
-    keywords.append(', '.join(article.keywords)) # the main keywords used in it
-    published_on.append(article.publish_date) # the date on which it was published
-    author.append(article.authors) # the authors of the article
+    article = Article(Url)
+
+    # Call the download and parse methods to download information
+    try:
+        article.download()
+        article.parse()
+        article.nlp()
+    except Exception as error:
+        print(f"exception : {error}")
+        pass
+
+    # Scrape the contents of article
+    title.append(article.title) # extracts the title of the article
+    text.append(article.text) # extracts the whole text of article
+    summary.append(article.summary) # gives us a summary abou the article
+    keywords.append(", ".join(article.keywords)) # the main keywords used in it
+    published_on.append(article.publish_date) # the date on which it was published
+    author.append(article.authors) # the authors of the article
 
 """**Checking the lists created**"""
 
@@ -111,107 +121,118 @@
 author[6]
 
 # Adding the columns in the fake news dataset
-news_dataset['title'] = title
-news_dataset['text'] = text
-news_dataset['keywords'] = keywords
-news_dataset['published date'] = published_on
-news_dataset['author'] = author
+news_dataset["title"] = title
+news_dataset["text"] = text
+news_dataset["keywords"] = keywords
+news_dataset["published date"] = published_on
+news_dataset["author"] = author
 
 # Check the first five columns of dataset created
 news_dataset.head()
 
 """**Converting the dataset to a csv file**"""
 
-news_dataset.to_csv('Fake_news.csv')
+news_dataset.to_csv("Fake_news.csv")
 
 """**Reading the csv file**"""
 
-df = pd.read_csv('Fake_news.csv')
+df = pd.read_csv("Fake_news.csv")
 
 # Checking the last 5 rows of the csv file
 df.tail(5)
 
 """**Download the csv file in local machine**"""
 
 from google.colab import files
-files.download('Fake_news.csv')
+
+files.download("Fake_news.csv")
 
 """**Scraping news from Times of India**"""
 
-TOIarticle_links = [] # Creating an empty list of all the urls of news from Times of India site
+TOIarticle_links = (
+    []
+) # Creating an empty list of all the urls of news from Times of India site
 
 # Extracting links for all the pages (2 to 125) of boomlive fake news section
 for i in range(2, 126):
-    url = 'https://timesofindia.indiatimes.com/news/' + str(i)
-
-    try:
-        # send requests
-        page = requests.get(url)
-        soup = BeautifulSoup(page.text, 'html.parser')
-
-        # Collecting all the links in a list
-        for content in soup.find_all('span', attrs={'class':'w_tle'}):
-            link = content.find('a')
-            TOIarticle_links.append(link.get('href'))
-
-    # this describes what to do if an exception is thrown
-    except Exception as e:
-        # get the exception information
-        error_type, error_obj, error_info = sys.exc_info()
-        #print the link that cause the problem
-        print ('ERROR FOR LINK:',url)
-        #print error info and line that threw the exception
-        print (error_type, 'Line:', error_info.tb_lineno)
-        continue
+    url = "https://timesofindia.indiatimes.com/news/" + str(i)
+
+    try:
+        # send requests
+        page = requests.get(url)
+        soup = BeautifulSoup(page.text, "html.parser")
+
+        # Collecting all the links in a list
+        for content in soup.find_all("span", attrs={"class": "w_tle"}):
+            link = content.find("a")
+            TOIarticle_links.append(link.get("href"))
+
+    # this describes what to do if an exception is thrown
+    except Exception as e:
+        # get the exception information
+        error_type, error_obj, error_info = sys.exc_info()
+        # print the link that cause the problem
+        print("ERROR FOR LINK:", url)
+        # print error info and line that threw the exception
+        print(error_type, "Line:", error_info.tb_lineno)
+        continue
 
 TOIarticle_links[6:15]
 
 len(TOIarticle_links)
 
-str2 = 'https://timesofindia.indiatimes.com'
-TOIarticle_links = [str2+lnk for lnk in TOIarticle_links if lnk[0]=='/']
+str2 = "https://timesofindia.indiatimes.com"
+TOIarticle_links = [str2 + lnk for lnk in TOIarticle_links if lnk[0] == "/"]
 
 TOIarticle_links[5:8]
 
 len(TOIarticle_links)
 
-title, text, summary, keywords, published_on, author = [], [], [], [], [], [] # Creating empty lists to store the data
+title, text, summary, keywords, published_on, author = (
+    [],
+    [],
+    [],
+    [],
+    [],
+    [],
+) # Creating empty lists to store the data
 for Url in TOIarticle_links:
-    article = Article(Url)
-
-    #Call the download and parse methods to download information
-    try:
-        article.download()
-        article.parse()
-        article.nlp()
-    except:
-        pass
-
-    # Scrape the contents of article
-    title.append(article.title) # extracts the title of the article
-    text.append(article.text) # extracts the whole text of article
-    summary.append(article.summary) # gives us a summary abou the article
-    keywords.append(', '.join(article.keywords)) # the main keywords used in it
-    published_on.append(article.publish_date) # the date on which it was published
-    author.append(article.authors) # the authors of the article
+    article = Article(Url)
+
+    # Call the download and parse methods to download information
+    try:
+        article.download()
+        article.parse()
+        article.nlp()
+    except Exception:
+        pass
+
+    # Scrape the contents of article
+    title.append(article.title) # extracts the title of the article
+    text.append(article.text) # extracts the whole text of article
+    summary.append(article.summary) # gives us a summary abou the article
+    keywords.append(", ".join(article.keywords)) # the main keywords used in it
+    published_on.append(article.publish_date) # the date on which it was published
+    author.append(article.authors) # the authors of the article
 
 title[5]
 
-TOI_dataset = pd.DataFrame(TOIarticle_links, columns=['URL'])
+TOI_dataset = pd.DataFrame(TOIarticle_links, columns=["URL"])
 # Adding the columns in the TOI news dataset
-TOI_dataset['title'] = title
-TOI_dataset['text'] = text
-TOI_dataset['keywords'] = keywords
-TOI_dataset['published date'] = published_on
-TOI_dataset['author'] = author
+TOI_dataset["title"] = title
+TOI_dataset["text"] = text
+TOI_dataset["keywords"] = keywords
+TOI_dataset["published date"] = published_on
+TOI_dataset["author"] = author
 
 TOI_dataset.head()
 
-TOI_dataset.to_csv('TOI_news_dataset.csv')
+TOI_dataset.to_csv("TOI_news_dataset.csv")
 
-dt = pd.read_csv('TOI_news_dataset.csv')
+dt = pd.read_csv("TOI_news_dataset.csv")
 
 dt.tail(3)
 
 from google.colab import files
-files.download('TOI_news_dataset.csv')
+
+files.download("TOI_news_dataset.csv")
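
For reference, here is the link-harvesting pattern both loops follow after this change, reduced to a standalone snippet. It assumes only requests and beautifulsoup4 are installed; the URL and the CSS class are placeholders, and the request timeout plus the None check are defensive additions that are not in the original file:

import sys

import requests
from bs4 import BeautifulSoup

links = []
for i in range(1, 4):  # placeholder page range
    url = "https://example.com/section/" + str(i)  # placeholder URL
    try:
        # send the request and parse the page
        page = requests.get(url, timeout=10)
        soup = BeautifulSoup(page.text, "html.parser")
        # collect every headline link on the page
        for content in soup.find_all("h2", attrs={"class": "entry-title"}):
            link = content.find("a")
            if link is not None:
                links.append(link.get("href"))
    # Exception (not a bare except), so Ctrl+C still stops the loop
    except Exception:
        error_type, error_obj, error_info = sys.exc_info()
        print("ERROR FOR LINK:", url)
        print(error_type, "Line:", error_info.tb_lineno)
        continue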
