read.py (forked from belate/newsclassifier)
import re

import requests
from bs4 import BeautifulSoup

content = []


def clean(text):
    """Lowercase, strip non-word characters, and collapse whitespace."""
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
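# Minimal sanity check for clean(); the sample string below is made up:
assert clean("  Hello, World!! 123 ") == "hello world 123"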
'''
NEWSPAPER <--> together with the matching find_all for that page
--> FEED-SECTION
--> cleaned article(s) from the feed
'''
link="http://www.finanzas.com/rss/noticiasportada.xml"
#link="http://www.portafolio.co/economia/feed"
#link="http://www.banrep.gov.co/rss.xml"
#link="http://feeds.bbci.co.uk/news/politics/rss.xml"
feed = requests.get(link, timeout=20) #Link = RSS
parser='html.parser'
#Hay que cambiar el guid, en portafolio es link, pero no funciona. Tuve que usar "#comments"
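# Note: RSS is XML, and html.parser treats <link> as a void HTML element,
# which can leave url.text empty. A sketch of two possible workarounds (the
# 'xml' parser needs lxml installed; the tag mapping below is an assumption,
# not tested against each feed):
# parser = 'xml'
# FEED_LINK_TAG = {
#     "http://www.finanzas.com/rss/noticiasportada.xml": "link",
#     "http://www.portafolio.co/economia/feed": "guid",
# }
# tag = FEED_LINK_TAG.get(link, "link")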
for url in BeautifulSoup(feed.content, parser).find_all('link'):
    try:
        print(url.text)
        article = requests.get(url.text, timeout=20)
    except Exception:
        print("request failed, skipping")
        continue  # without this, `article` below could be stale or undefined
    soup = BeautifulSoup(article.content, parser)
    #body = soup.find_all('div', class_='articulo-articulo')  # for portafolio
    body = soup.find_all('div', class_='contenido_noticia')
    if body:
        body2 = clean(' '.join(t.get_text() for t in body[0].find_all(['p', 'h2'])))
        content.append(body2)
        print("articles so far: " + str(len(content)))
    else:
        print("empty body")
#print(content)
print(len(content))
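# Optional: drop duplicate articles before writing (order-preserving; this is
# a suggestion, not part of the original pipeline):
# content = list(dict.fromkeys(content))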
with open('resultado.txt', 'a') as f:
    for line in content:
        f.write(line + "\n")
        #print("wrote: " + str(line))
if False:
    # Test a single article (disabled).
    link = "http://www.portafolio.co/economia/feed"
    feed = requests.get(link, timeout=20)  # link = RSS feed URL
    parser = 'html.parser'
    find_in_rss = 'link'
    urls = BeautifulSoup(feed.content, parser).find_all(find_in_rss)
    article = requests.get(urls[1].text, timeout=20)
    soup = BeautifulSoup(article.content, parser)
    body = soup.find_all('div', class_='articulo-articulo')  # or contenido_noticia
    body2 = clean(' '.join(t.get_text() for t in body[0].find_all(['p', 'h2'])))
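# Alternative sketch using the feedparser package (an assumption: it must be
# installed separately, e.g. `pip install feedparser`). It parses RSS/Atom
# directly, so the <link>-vs-<guid> issue above goes away:
#
# import feedparser
# d = feedparser.parse("http://www.portafolio.co/economia/feed")
# for entry in d.entries:
#     print(entry.link)  # each entry exposes .link / .title when present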