diff --git a/VB_assignment.ipynb b/VB_assignment.ipynb
new file mode 100644
index 0000000..2e738b9
--- /dev/null
+++ b/VB_assignment.ipynb
@@ -0,0 +1,291 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import PyPDF2\n",
+    "import re\n",
+    "from bs4 import BeautifulSoup\n",
+    "from bs4.element import Comment\n",
+    "import urllib2\n",
+    "from urllib2 import URLError\n",
+    "from urllib2 import HTTPError\n",
+    "import socket\n",
+    "\n",
+    "regex = re.compile(\n",
+    "    r'^(?:http|ftp)s?://' # http:// or https://\n",
+    "    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\\.)+(?:[A-Z]{2,6}\\.?|[A-Z0-9-]{2,}\\.?)|' # domain...\n",
+    "    r'localhost|' # localhost...\n",
+    "    r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})' # ...or ip\n",
+    "    r'(?::\\d+)?' # optional port\n",
+    "    r'(?:/?|[/?]\\S+)$', re.IGNORECASE)\n",
+    "\n",
+    "def isValidUrl(url):\n",
+    "    if regex.match(url) is not None:\n",
+    "        return True\n",
+    "    return False\n",
+    "\n",
+    "def tag_visible(element):\n",
+    "    if element.parent.name in ['style', 'script', 'meta', '[document]']:\n",
+    "        return False\n",
+    "    if isinstance(element, Comment):\n",
+    "        return False\n",
+    "    return True\n",
+    "\n",
+    "i = 1\n",
+    "j = 1\n",
+    "\n",
+    "def crawlLinks(SeedUrl):\n",
+    "    global i\n",
+    "    page = SeedUrl\n",
+    "    print 'Crawling:'+page\n",
+    "    try:\n",
+    "        pagesource = urllib2.urlopen(page, timeout = 5)\n",
+    "    except (HTTPError, URLError):\n",
+    "        print(\"Something bad happened\")\n",
+    "    except socket.timeout:\n",
+    "        print(\"Something bad happened\")\n",
+    "    else:\n",
+    "        s = pagesource.read()\n",
+    "        soup = BeautifulSoup(s, \"html.parser\")\n",
+    "        divs = soup.findAll(\"div\", {\"class\" : \"content-inner grid_9 push_3\"})\n",
+    "        file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n",
+    "        for d in divs:\n",
+    "            data1 = d.findAll(text = True)\n",
+    "            text1 = filter(tag_visible, data1)\n",
+    "            file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n",
+    "        i += 1\n",
+    "        file1.close()\n",
+    "        for d in divs:\n",
+    "            link = d.findAll('a', href = True)\n",
+    "            for li in link:\n",
+    "                if li['href'].endswith('.pdf'):\n",
+    "                    if isValidUrl(li['href']):\n",
+    "                        downPdf(li['href'])\n",
+    "\n",
+    "\n",
+    "def downPdf(SeedUrl):\n",
+    "    global j\n",
+    "    page = SeedUrl\n",
+    "    try:\n",
+    "        pagesource = urllib2.urlopen(page, timeout = 5)\n",
+    "    except (HTTPError, URLError):\n",
+    "        print(\"Something bad happened\")\n",
+    "    except socket.timeout:\n",
+    "        print(\"Something bad happened\")\n",
+    "    else:\n",
+    "        s = pagesource.read()\n",
+    "        file = open(\"pdf\"+str(j)+\".pdf\", \"wb\")\n",
+    "        file.write(s)\n",
+    "        file.close()\n",
+    "        file = open(\"pdf\"+str(j)+\".txt\", \"wb\")\n",
+    "        pdfFileObj = open(\"pdf\"+str(j)+\".pdf\", 'rb')\n",
+    "        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n",
+    "        numofPages = pdfReader.numPages\n",
+    "        for n in range(0, numofPages):\n",
+    "            pageObj = pdfReader.getPage(n)\n",
+    "            file.write(\"\".join(t.encode('ascii', 'ignore').decode('ascii') for t in pageObj.extractText()))\n",
+    "        j += 1\n",
+    "        file.close()\n",
+    "        pdfFileObj.close()\n",
+    "\n",
+    "\n",
+    "def crawler(SeedUrl):\n",
+    "    global i\n",
+    "    page = SeedUrl\n",
+    "    print 'Crawling:'+page\n",
+    "    try:\n",
+    "        pagesource = urllib2.urlopen(page, timeout = 5)\n",
+    "    except (HTTPError, URLError):\n",
+    "        print(\"Something bad happened\")\n",
+    "    except socket.timeout:\n",
+    "        print(\"Something bad happened\")\n",
+    "    else:\n",
+    "        s = pagesource.read()\n",
+    "        soup = BeautifulSoup(s, \"html.parser\")\n",
+    "        divs = soup.findAll(\"div\", { \"class\" : \"content-inner grid_9 push_3\" })\n",
+    "        file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n",
+    "        for d in divs:\n",
+    "            data1 = d.findAll(text = True)\n",
+    "            text1 = filter(tag_visible, data1)\n",
+    "            file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n",
+    "        i += 1\n",
+    "        file1.close()\n",
+    "        forlinks = soup.findAll(\"div\", {\"id\" : \"menuwrapper\"})\n",
+    "        for l in forlinks:\n",
+    "            link = l.findAll('a', href = True)\n",
+    "            for li in link:\n",
+    "                if li['href'].endswith('.pdf'):\n",
+    "                    downPdf(li['href'])\n",
+    "                else:\n",
+    "                    crawlLinks(li['href'])\n",
+    "\n",
+    "crawler('http://du.ac.in/du/index.php?page=research')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import io\n",
+    "import nltk\n",
+    "import urllib2\n",
+    "import string\n",
+    "\n",
+    "for i in range(1, 19):\n",
+    "    file1 = open('text'+str(i)+'.txt')\n",
+    "    line = file1.read()  # read the whole file into one string\n",
+    "    sentences = nltk.sent_tokenize(line)\n",
+    "    appendFile = open('filteredtxt'+str(i)+'.txt','a')\n",
+    "    for sentence in sentences:\n",
+    "        for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n",
+    "            if (pos == 'NN' or pos == 'JJ'):\n",
+    "                appendFile.write(\" \"+word)\n",
+    "    appendFile.close()\n",
+    " \n",
+    "for i in range(1, 3):\n",
+    "    file1 = open('pdf'+str(i)+'.txt')\n",
+    "    line = file1.read()  # read the whole file into one string\n",
+    "    sentences = nltk.sent_tokenize(line)\n",
+    "    appendFile = open('filteredpdf'+str(i)+'.txt','a')\n",
+    "    for sentence in sentences:\n",
+    "        for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n",
+    "            if (pos == 'NN' or pos == 'JJ'):\n",
+    "                appendFile.write(\" \"+word)\n",
+    "    appendFile.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    " \n",
+    "corpus = []\n",
+    "for file in glob.glob(\"*.txt\"):\n",
+    "    with open(file, \"r\") as paper:\n",
+    "        corpus.append((file, paper.read()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')\n",
+    "tfidf_matrix = tf.fit_transform([content for file, content in corpus])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics.pairwise import linear_kernel\n",
+    "\n",
+    "def find_similar(tfidf_matrix, index, top_n = 5):\n",
+    "    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()\n",
+    "    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]\n",
+    "    return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "print(tfidf_matrix)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.713622701901 4\n",
+      "0.0865938285945 14\n",
+      "0.0437647613183 15\n",
+      "0.0410348448431 0\n",
+      "0.0368599016196 2\n"
+     ]
+    }
+   ],
+   "source": [
+    "for index, score in find_similar(tfidf_matrix, 18):\n",
+    "    print score, index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "print(tfidf_matrix*tfidf_matrix.T)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}