Commit 25566d0 (parent 49fb8d6): 1 changed file with 291 additions and 0 deletions.
@@ -0,0 +1,291 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import PyPDF2\n", | ||
"import re\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"from bs4.element import Comment\n", | ||
"import urllib2\n", | ||
"from urllib2 import URLError\n", | ||
"from urllib2 import HTTPError\n", | ||
"import socket\n", | ||
"\n", | ||
"regex = re.compile(\n", | ||
" r'^(?:http|ftp)s?://' # http:// or https://\n", | ||
" r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\\.)+(?:[A-Z]{2,6}\\.?|[A-Z0-9-]{2,}\\.?)|' #domain...\n", | ||
" r'localhost|' #localhost...\n", | ||
" r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})' # ...or ip\n", | ||
" r'(?::\\d+)?' # optional port\n", | ||
" r'(?:/?|[/?]\\S+)$', re.IGNORECASE)\n", | ||
"\n", | ||
"def isValidUrl(url):\n", | ||
" if regex.match(url) is not None:\n", | ||
" return True;\n", | ||
" return False\n", | ||
"\n", | ||
"def tag_visible(element):\n", | ||
" if element.parent.name in ['style', 'script', 'meta', '[document]']:\n", | ||
" return False\n", | ||
" if isinstance(element, Comment):\n", | ||
" return False\n", | ||
" return True\n", | ||
"\n", | ||
"i = 1\n", | ||
"j = 1\n", | ||
"\n", | ||
"def crawlLinks(SeedUrl):\n", | ||
" global i\n", | ||
" page = SeedUrl\n", | ||
" print 'Crawling:'+page\n", | ||
" try:\n", | ||
" pagesource = urllib2.urlopen(page, timeout = 5)\n", | ||
" except HTTPError, URLError:\n", | ||
" print(\"Something bad happened\")\n", | ||
" except socket.timeout, e:\n", | ||
" print(\"Something bad happened\")\n", | ||
" else:\n", | ||
" s = pagesource.read()\n", | ||
" soup = BeautifulSoup(s)\n", | ||
" divs = soup.findAll(\"div\", {\"class\" : \"content-inner grid_9 push_3\"})\n", | ||
" file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n", | ||
" for d in divs:\n", | ||
" data1 = d.findAll(text = True)\n", | ||
" text1 = filter(tag_visible, data1)\n", | ||
" file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n", | ||
" i += 1\n", | ||
" file1.close()\n", | ||
" for d in divs:\n", | ||
" link = d.findAll('a', href = True)\n", | ||
" for li in link:\n", | ||
" if li['href'].endswith('.pdf'):\n", | ||
" if isValidUrl(li['href']):\n", | ||
" downPdf(li['href'])\n", | ||
"\n", | ||
"\n", | ||
"def downPdf(SeedUrl):\n", | ||
" global j\n", | ||
" page = SeedUrl\n", | ||
" try:\n", | ||
" pagesource = urllib2.urlopen(page, timeout = 5)\n", | ||
" except HTTPError, URLError:\n", | ||
" print(\"Something bad happened\")\n", | ||
" except socket.timeout, e:\n", | ||
" print(\"Something bad happened\")\n", | ||
" else:\n", | ||
" s = pagesource.read()\n", | ||
" file = open(\"pdf\"+str(j)+\".pdf\", \"wb\")\n", | ||
" file.write(s)\n", | ||
" file.close()\n", | ||
" file = open(\"pdf\"+str(j)+\".txt\", \"wb\")\n", | ||
" pdfFileObj = open(\"pdf\"+str(j)+\".pdf\", 'rb')\n", | ||
" pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n", | ||
" numofPages = pdfReader.numPages\n", | ||
" for n in range(0 , numofPages):\n", | ||
" pageObj = pdfReader.getPage(n)\n", | ||
" file.write(\"\".join(t.encode('ascii', 'ignore').decode('ascii') for t in pageObj.extractText()))\n", | ||
" j += 1\n", | ||
" file.close()\n", | ||
" pdfFileObj.close()\n", | ||
"\n", | ||
"\n", | ||
"def crawler(SeedUrl):\n", | ||
" global i\n", | ||
" page = SeedUrl\n", | ||
" print 'Crawling:'+page\n", | ||
" try:\n", | ||
" pagesource = urllib2.urlopen(page, timeout = 5)\n", | ||
" except HTTPError, URLError:\n", | ||
" print(\"Something bad happened\")\n", | ||
" except socket.timeout, e:\n", | ||
" print(\"Something bad happened\")\n", | ||
" else:\n", | ||
" s = pagesource.read()\n", | ||
" soup = BeautifulSoup(s)\n", | ||
" divs = soup.findAll(\"div\", { \"class\" : \"content-inner grid_9 push_3\" })\n", | ||
" file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n", | ||
" for d in divs:\n", | ||
" data1 = d.findAll(text = True)\n", | ||
" text1 = filter(tag_visible, data1)\n", | ||
" file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n", | ||
" i += 1\n", | ||
" file1.close()\n", | ||
" forlinks = soup.findAll(\"div\", {\"id\" : \"menuwrapper\"})\n", | ||
" for l in forlinks:\n", | ||
" link = l.findAll('a', href = True)\n", | ||
" for li in link:\n", | ||
" if li['href'].endswith('.pdf'):\n", | ||
" downPdf(li['href'])\n", | ||
" else:\n", | ||
" crawlLinks(li['href'])\n", | ||
"\n", | ||
"crawler('http://du.ac.in/du/index.php?page=research')" | ||
] | ||
}, | ||
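{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, illustrative check of the `isValidUrl` guard above. The sample links are hypothetical; the point is that relative hrefs (common in site menus) fail the absolute-URL regex, which is why the crawler only downloads PDFs reachable through absolute links."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative only: sample URLs (hypothetical) chosen to exercise the regex.\n",
"for u in ['http://du.ac.in/du/index.php?page=research',\n",
"          'https://example.com/paper.pdf',\n",
"          '/du/pdf/research.pdf', # relative link: rejected\n",
"          'localhost']: # no scheme: rejected\n",
"    print u, '->', isValidUrl(u)"
]
},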
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n", | ||
"import nltk\n", | ||
"import urllib2\n", | ||
"import string\n", | ||
"\n", | ||
"for i in range(1, 19):\n", | ||
" file1 = open('text'+str(i)+'.txt')\n", | ||
" line = file1.read()# Use this to read file content as a stream:\n", | ||
" sentences = nltk.sent_tokenize(line)\n", | ||
" appendFile = open('filteredtxt'+str(i)+'.txt','a')\n", | ||
" for sentence in sentences:\n", | ||
" for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n", | ||
" if (pos == 'NN' or pos == 'JJ'):\n", | ||
" appendFile.write(\" \"+word)\n", | ||
" appendFile.close()\n", | ||
" \n", | ||
"for i in range(1, 3):\n", | ||
" file1 = open('pdf'+str(i)+'.txt')\n", | ||
" line = file1.read()# Use this to read file content as a stream:\n", | ||
" sentences = nltk.sent_tokenize(line)\n", | ||
" appendFile = open('filteredpdf'+str(i)+'.txt','a')\n", | ||
" for sentence in sentences:\n", | ||
" for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n", | ||
" if (pos == 'NN' or pos == 'JJ'):\n", | ||
" appendFile.write(\" \"+word)\n", | ||
" appendFile.close()" | ||
] | ||
}, | ||
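{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sanity check, illustrative only: tag one made-up sentence and show which tokens survive the NN/JJ filter applied above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative only: a made-up sentence, not taken from the crawled pages.\n",
"sample = \"The university publishes important research in many journals.\"\n",
"tagged = nltk.pos_tag(nltk.word_tokenize(sample))\n",
"print tagged\n",
"print [word for word, pos in tagged if pos == 'NN' or pos == 'JJ']"
]
},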
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import glob\n",
"\n",
"# Build the corpus from every .txt file in the working directory\n",
"# (raw page texts, PDF texts, and the filtered versions alike).\n",
"corpus = []\n",
"for fname in glob.glob(\"*.txt\"):\n",
"    with open(fname, \"r\") as paper:\n",
"        corpus.append((fname, paper.read()))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Unigrams, bigrams, and trigrams; English stop words removed.\n",
"tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')\n",
"tfidf_matrix = tf.fit_transform([content for fname, content in corpus])"
]
},
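{
"cell_type": "markdown",
"metadata": {},
"source": [
"The matrix has one row per `.txt` file and one column per term n-gram (unigrams through trigrams). A quick look at its shape and a few vocabulary entries; the exact output depends on what the crawl produced, so this is only a sketch."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Rows = files in the corpus, columns = n-gram vocabulary.\n",
"print tfidf_matrix.shape\n",
"print tf.get_feature_names()[:10] # first few n-grams, alphabetical"
]
},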
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import linear_kernel\n",
"\n",
"def find_similar(tfidf_matrix, index, top_n=5):\n",
"    # Cosine similarity of document `index` against every document in the matrix.\n",
"    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()\n",
"    # Indices sorted by descending similarity, excluding the document itself.\n",
"    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]\n",
"    return [(idx, cosine_similarities[idx]) for idx in related_docs_indices][0:top_n]"
]
},
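{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why `linear_kernel` serves as cosine similarity here: `TfidfVectorizer` L2-normalizes each row by default (`norm='l2'`), so for document vectors $a$ and $b$,\n",
"\n",
"$$\\cos(a, b) = \\frac{a \\cdot b}{\\|a\\| \\, \\|b\\|} = a \\cdot b,$$\n",
"\n",
"which is exactly the plain dot product that `linear_kernel` computes, at lower cost than `cosine_similarity`."
]
},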
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"print(tfidf_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.713622701901 4\n",
"0.0865938285945 14\n",
"0.0437647613183 15\n",
"0.0410348448431 0\n",
"0.0368599016196 2\n"
]
}
],
"source": [
"# Top five documents most similar to document 18 in the corpus.\n",
"for index, score in find_similar(tfidf_matrix, 18):\n",
"    print score, index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(tfidf_matrix*tfidf_matrix.T)"
]
},
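{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because the rows are L2-normalized, `tfidf_matrix * tfidf_matrix.T` above is the full pairwise cosine-similarity matrix: entry (i, j) is the similarity between file i and file j, and the diagonal entries are 1."
]
},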
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}