Commit

Add files via upload
ssomnathssaha authored Mar 19, 2018
1 parent 49fb8d6 commit 25566d0
Showing 1 changed file with 291 additions and 0 deletions.
291 changes: 291 additions & 0 deletions VB_assignment.ipynb
@@ -0,0 +1,291 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import PyPDF2\n",
"import re\n",
"from bs4 import BeautifulSoup\n",
"from bs4.element import Comment\n",
"import urllib2\n",
"from urllib2 import URLError\n",
"from urllib2 import HTTPError\n",
"import socket\n",
"\n",
"regex = re.compile(\n",
" r'^(?:http|ftp)s?://' # http:// or https://\n",
" r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\\.)+(?:[A-Z]{2,6}\\.?|[A-Z0-9-]{2,}\\.?)|' #domain...\n",
" r'localhost|' #localhost...\n",
" r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})' # ...or ip\n",
" r'(?::\\d+)?' # optional port\n",
" r'(?:/?|[/?]\\S+)$', re.IGNORECASE)\n",
"\n",
"def isValidUrl(url):\n",
" if regex.match(url) is not None:\n",
" return True;\n",
" return False\n",
"\n",
"def tag_visible(element):\n",
" if element.parent.name in ['style', 'script', 'meta', '[document]']:\n",
" return False\n",
" if isinstance(element, Comment):\n",
" return False\n",
" return True\n",
"\n",
"i = 1\n",
"j = 1\n",
"\n",
"def crawlLinks(SeedUrl):\n",
" global i\n",
" page = SeedUrl\n",
" print 'Crawling:'+page\n",
" try:\n",
" pagesource = urllib2.urlopen(page, timeout = 5)\n",
" except HTTPError, URLError:\n",
" print(\"Something bad happened\")\n",
" except socket.timeout, e:\n",
" print(\"Something bad happened\")\n",
" else:\n",
" s = pagesource.read()\n",
" soup = BeautifulSoup(s)\n",
" divs = soup.findAll(\"div\", {\"class\" : \"content-inner grid_9 push_3\"})\n",
" file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n",
" for d in divs:\n",
" data1 = d.findAll(text = True)\n",
" text1 = filter(tag_visible, data1)\n",
" file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n",
" i += 1\n",
" file1.close()\n",
" for d in divs:\n",
" link = d.findAll('a', href = True)\n",
" for li in link:\n",
" if li['href'].endswith('.pdf'):\n",
" if isValidUrl(li['href']):\n",
" downPdf(li['href'])\n",
"\n",
"\n",
"def downPdf(SeedUrl):\n",
" global j\n",
" page = SeedUrl\n",
" try:\n",
" pagesource = urllib2.urlopen(page, timeout = 5)\n",
" except HTTPError, URLError:\n",
" print(\"Something bad happened\")\n",
" except socket.timeout, e:\n",
" print(\"Something bad happened\")\n",
" else:\n",
" s = pagesource.read()\n",
" file = open(\"pdf\"+str(j)+\".pdf\", \"wb\")\n",
" file.write(s)\n",
" file.close()\n",
" file = open(\"pdf\"+str(j)+\".txt\", \"wb\")\n",
" pdfFileObj = open(\"pdf\"+str(j)+\".pdf\", 'rb')\n",
" pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n",
" numofPages = pdfReader.numPages\n",
" for n in range(0 , numofPages):\n",
" pageObj = pdfReader.getPage(n)\n",
" file.write(\"\".join(t.encode('ascii', 'ignore').decode('ascii') for t in pageObj.extractText()))\n",
" j += 1\n",
" file.close()\n",
" pdfFileObj.close()\n",
"\n",
"\n",
"def crawler(SeedUrl):\n",
" global i\n",
" page = SeedUrl\n",
" print 'Crawling:'+page\n",
" try:\n",
" pagesource = urllib2.urlopen(page, timeout = 5)\n",
" except HTTPError, URLError:\n",
" print(\"Something bad happened\")\n",
" except socket.timeout, e:\n",
" print(\"Something bad happened\")\n",
" else:\n",
" s = pagesource.read()\n",
" soup = BeautifulSoup(s)\n",
" divs = soup.findAll(\"div\", { \"class\" : \"content-inner grid_9 push_3\" })\n",
" file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n",
" for d in divs:\n",
" data1 = d.findAll(text = True)\n",
" text1 = filter(tag_visible, data1)\n",
" file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n",
" i += 1\n",
" file1.close()\n",
" forlinks = soup.findAll(\"div\", {\"id\" : \"menuwrapper\"})\n",
" for l in forlinks:\n",
" link = l.findAll('a', href = True)\n",
" for li in link:\n",
" if li['href'].endswith('.pdf'):\n",
" downPdf(li['href'])\n",
" else:\n",
" crawlLinks(li['href'])\n",
"\n",
"crawler('http://du.ac.in/du/index.php?page=research')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n",
"import nltk\n",
"import urllib2\n",
"import string\n",
"\n",
"for i in range(1, 19):\n",
" file1 = open('text'+str(i)+'.txt')\n",
" line = file1.read()# Use this to read file content as a stream:\n",
" sentences = nltk.sent_tokenize(line)\n",
" appendFile = open('filteredtxt'+str(i)+'.txt','a')\n",
" for sentence in sentences:\n",
" for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n",
" if (pos == 'NN' or pos == 'JJ'):\n",
" appendFile.write(\" \"+word)\n",
" appendFile.close()\n",
" \n",
"for i in range(1, 3):\n",
" file1 = open('pdf'+str(i)+'.txt')\n",
" line = file1.read()# Use this to read file content as a stream:\n",
" sentences = nltk.sent_tokenize(line)\n",
" appendFile = open('filteredpdf'+str(i)+'.txt','a')\n",
" for sentence in sentences:\n",
" for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n",
" if (pos == 'NN' or pos == 'JJ'):\n",
" appendFile.write(\" \"+word)\n",
" appendFile.close()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import glob\n",
" \n",
"corpus = []\n",
"for file in glob.glob(\"*.txt\"):\n",
" with open(file, \"r\") as paper:\n",
" corpus.append((file, paper.read()))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')\n",
"tfidf_matrix = tf.fit_transform([content for file, content in corpus])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import linear_kernel\n",
"\n",
"def find_similar(tfidf_matrix, index, top_n = 5):\n",
" cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()\n",
" related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]\n",
" return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"print(tfidf_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.713622701901 4\n",
"0.0865938285945 14\n",
"0.0437647613183 15\n",
"0.0410348448431 0\n",
"0.0368599016196 2\n"
]
}
],
"source": [
"for index, score in find_similar(tfidf_matrix, 18):\n",
" print score, index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(tfidf_matrix*tfidf_matrix.T)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
