Commit 25566d0 (parent 49fb8d6): 1 changed file with 291 additions and 0 deletions.
@@ -0,0 +1,291 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import PyPDF2\n", | ||
"import re\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"from bs4.element import Comment\n", | ||
"import urllib2\n", | ||
"from urllib2 import URLError\n", | ||
"from urllib2 import HTTPError\n", | ||
"import socket\n", | ||
"\n", | ||
"regex = re.compile(\n", | ||
" r'^(?:http|ftp)s?://' # http:// or https://\n", | ||
" r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\\.)+(?:[A-Z]{2,6}\\.?|[A-Z0-9-]{2,}\\.?)|' #domain...\n", | ||
" r'localhost|' #localhost...\n", | ||
" r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})' # ...or ip\n", | ||
" r'(?::\\d+)?' # optional port\n", | ||
" r'(?:/?|[/?]\\S+)$', re.IGNORECASE)\n", | ||
"\n", | ||
"def isValidUrl(url):\n", | ||
" if regex.match(url) is not None:\n", | ||
" return True;\n", | ||
" return False\n", | ||
"\n", | ||
"def tag_visible(element):\n", | ||
" if element.parent.name in ['style', 'script', 'meta', '[document]']:\n", | ||
" return False\n", | ||
" if isinstance(element, Comment):\n", | ||
" return False\n", | ||
" return True\n", | ||
"\n", | ||
"i = 1\n", | ||
"j = 1\n", | ||
"\n", | ||
"def crawlLinks(SeedUrl):\n", | ||
" global i\n", | ||
" page = SeedUrl\n", | ||
" print 'Crawling:'+page\n", | ||
" try:\n", | ||
" pagesource = urllib2.urlopen(page, timeout = 5)\n", | ||
" except HTTPError, URLError:\n", | ||
" print(\"Something bad happened\")\n", | ||
" except socket.timeout, e:\n", | ||
" print(\"Something bad happened\")\n", | ||
" else:\n", | ||
" s = pagesource.read()\n", | ||
" soup = BeautifulSoup(s)\n", | ||
" divs = soup.findAll(\"div\", {\"class\" : \"content-inner grid_9 push_3\"})\n", | ||
" file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n", | ||
" for d in divs:\n", | ||
" data1 = d.findAll(text = True)\n", | ||
" text1 = filter(tag_visible, data1)\n", | ||
" file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n", | ||
" i += 1\n", | ||
" file1.close()\n", | ||
" for d in divs:\n", | ||
" link = d.findAll('a', href = True)\n", | ||
" for li in link:\n", | ||
" if li['href'].endswith('.pdf'):\n", | ||
" if isValidUrl(li['href']):\n", | ||
" downPdf(li['href'])\n", | ||
"\n", | ||
"\n", | ||
"def downPdf(SeedUrl):\n", | ||
" global j\n", | ||
" page = SeedUrl\n", | ||
" try:\n", | ||
" pagesource = urllib2.urlopen(page, timeout = 5)\n", | ||
" except HTTPError, URLError:\n", | ||
" print(\"Something bad happened\")\n", | ||
" except socket.timeout, e:\n", | ||
" print(\"Something bad happened\")\n", | ||
" else:\n", | ||
" s = pagesource.read()\n", | ||
" file = open(\"pdf\"+str(j)+\".pdf\", \"wb\")\n", | ||
" file.write(s)\n", | ||
" file.close()\n", | ||
" file = open(\"pdf\"+str(j)+\".txt\", \"wb\")\n", | ||
" pdfFileObj = open(\"pdf\"+str(j)+\".pdf\", 'rb')\n", | ||
" pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n", | ||
" numofPages = pdfReader.numPages\n", | ||
" for n in range(0 , numofPages):\n", | ||
" pageObj = pdfReader.getPage(n)\n", | ||
" file.write(\"\".join(t.encode('ascii', 'ignore').decode('ascii') for t in pageObj.extractText()))\n", | ||
" j += 1\n", | ||
" file.close()\n", | ||
" pdfFileObj.close()\n", | ||
"\n", | ||
"\n", | ||
"def crawler(SeedUrl):\n", | ||
" global i\n", | ||
" page = SeedUrl\n", | ||
" print 'Crawling:'+page\n", | ||
" try:\n", | ||
" pagesource = urllib2.urlopen(page, timeout = 5)\n", | ||
" except HTTPError, URLError:\n", | ||
" print(\"Something bad happened\")\n", | ||
" except socket.timeout, e:\n", | ||
" print(\"Something bad happened\")\n", | ||
" else:\n", | ||
" s = pagesource.read()\n", | ||
" soup = BeautifulSoup(s)\n", | ||
" divs = soup.findAll(\"div\", { \"class\" : \"content-inner grid_9 push_3\" })\n", | ||
" file1 = open(\"text\"+str(i)+\".txt\",\"wb\")\n", | ||
" for d in divs:\n", | ||
" data1 = d.findAll(text = True)\n", | ||
" text1 = filter(tag_visible, data1)\n", | ||
" file1.write(\" \".join(t.encode('ascii','ignore').decode('ascii').strip() for t in text1))\n", | ||
" i += 1\n", | ||
" file1.close()\n", | ||
" forlinks = soup.findAll(\"div\", {\"id\" : \"menuwrapper\"})\n", | ||
" for l in forlinks:\n", | ||
" link = l.findAll('a', href = True)\n", | ||
" for li in link:\n", | ||
" if li['href'].endswith('.pdf'):\n", | ||
" downPdf(li['href'])\n", | ||
" else:\n", | ||
" crawlLinks(li['href'])\n", | ||
"\n", | ||
"crawler('http://du.ac.in/du/index.php?page=research')" | ||
] | ||
}, | ||
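{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, illustrative check of the `isValidUrl` guard above. The sample links are hypothetical; the point is that relative hrefs (common in site menus) fail the absolute-URL regex, which is why the crawler only downloads PDFs reachable through absolute links."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative only: sample URLs (hypothetical) chosen to exercise the regex.\n",
"for u in ['http://du.ac.in/du/index.php?page=research',\n",
"          'https://example.com/paper.pdf',\n",
"          '/du/pdf/research.pdf', # relative link: rejected\n",
"          'localhost']: # no scheme: rejected\n",
"    print u, '->', isValidUrl(u)"
]
},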
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n", | ||
"import nltk\n", | ||
"import urllib2\n", | ||
"import string\n", | ||
"\n", | ||
"for i in range(1, 19):\n", | ||
" file1 = open('text'+str(i)+'.txt')\n", | ||
" line = file1.read()# Use this to read file content as a stream:\n", | ||
" sentences = nltk.sent_tokenize(line)\n", | ||
" appendFile = open('filteredtxt'+str(i)+'.txt','a')\n", | ||
" for sentence in sentences:\n", | ||
" for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n", | ||
" if (pos == 'NN' or pos == 'JJ'):\n", | ||
" appendFile.write(\" \"+word)\n", | ||
" appendFile.close()\n", | ||
" \n", | ||
"for i in range(1, 3):\n", | ||
" file1 = open('pdf'+str(i)+'.txt')\n", | ||
" line = file1.read()# Use this to read file content as a stream:\n", | ||
" sentences = nltk.sent_tokenize(line)\n", | ||
" appendFile = open('filteredpdf'+str(i)+'.txt','a')\n", | ||
" for sentence in sentences:\n", | ||
" for word,pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):\n", | ||
" if (pos == 'NN' or pos == 'JJ'):\n", | ||
" appendFile.write(\" \"+word)\n", | ||
" appendFile.close()" | ||
] | ||
}, | ||
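{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sanity check, illustrative only: tag one made-up sentence and show which tokens survive the NN/JJ filter applied above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative only: a made-up sentence, not taken from the crawled pages.\n",
"sample = \"The university publishes important research in many journals.\"\n",
"tagged = nltk.pos_tag(nltk.word_tokenize(sample))\n",
"print tagged\n",
"print [word for word, pos in tagged if pos == 'NN' or pos == 'JJ']"
]
},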
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import glob\n",
"\n",
"# Build the corpus from every .txt file in the working directory\n",
"# (raw page texts, PDF texts, and the filtered versions alike).\n",
"corpus = []\n",
"for fname in glob.glob(\"*.txt\"):\n",
"    with open(fname, \"r\") as paper:\n",
"        corpus.append((fname, paper.read()))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Unigrams, bigrams, and trigrams; English stop words removed.\n",
"tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')\n",
"tfidf_matrix = tf.fit_transform([content for fname, content in corpus])"
]
},
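{
"cell_type": "markdown",
"metadata": {},
"source": [
"The matrix has one row per `.txt` file and one column per term n-gram (unigrams through trigrams). A quick look at its shape and a few vocabulary entries; the exact output depends on what the crawl produced, so this is only a sketch."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Rows = files in the corpus, columns = n-gram vocabulary.\n",
"print tfidf_matrix.shape\n",
"print tf.get_feature_names()[:10] # first few n-grams, alphabetical"
]
},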
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import linear_kernel\n",
"\n",
"def find_similar(tfidf_matrix, index, top_n=5):\n",
"    # Cosine similarity of document `index` against every document in the matrix.\n",
"    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()\n",
"    # Indices sorted by descending similarity, excluding the document itself.\n",
"    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]\n",
"    return [(idx, cosine_similarities[idx]) for idx in related_docs_indices][0:top_n]"
]
},
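{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why `linear_kernel` serves as cosine similarity here: `TfidfVectorizer` L2-normalizes each row by default (`norm='l2'`), so for document vectors $a$ and $b$,\n",
"\n",
"$$\\cos(a, b) = \\frac{a \\cdot b}{\\|a\\| \\, \\|b\\|} = a \\cdot b,$$\n",
"\n",
"which is exactly the plain dot product that `linear_kernel` computes, at lower cost than `cosine_similarity`."
]
},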
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"print(tfidf_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.713622701901 4\n",
"0.0865938285945 14\n",
"0.0437647613183 15\n",
"0.0410348448431 0\n",
"0.0368599016196 2\n"
]
}
],
"source": [
"# Top five documents most similar to document 18 in the corpus.\n",
"for index, score in find_similar(tfidf_matrix, 18):\n",
"    print score, index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(tfidf_matrix*tfidf_matrix.T)"
]
},
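{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because the rows are L2-normalized, `tfidf_matrix * tfidf_matrix.T` above is the full pairwise cosine-similarity matrix: entry (i, j) is the similarity between file i and file j, and the diagonal entries are 1."
]
},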
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}