diff --git a/data/02 Data cleaning, vector representations.ipynb b/data/02 Data cleaning, vector representations.ipynb index ec6bc75..d7a73df 100644 --- a/data/02 Data cleaning, vector representations.ipynb +++ b/data/02 Data cleaning, vector representations.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -732,7 +732,7 @@ "[1576 rows x 5 columns]" ] }, - "execution_count": 2, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -744,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -758,11 +758,11 @@ "each document represents one speaker\n", "\n", "processing document 1 of 170\n", - "took 0.0732870101928711\n", + "took 0.011493921279907227\n", "processing document 2 of 170\n", - "took 1.2330009937286377\n", + "took 0.24931097030639648\n", "processing document 3 of 170\n", - "took 1.8079171180725098\n" + "took 0.6408729553222656\n" ] }, { @@ -1093,7 +1093,7 @@ "[170 rows x 1 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -1196,20 +1196,20 @@ "common_words:\n", " None\n", "{None: 150456,\n", - " 'annoyed': 173308,\n", - " 'asterisks': 64425,\n", - " 'brunt': 59044,\n", - " 'built': 57951,\n", - " 'claimant': 84261,\n", - " 'contortions': 94617,\n", - " 'crutch': 81435,\n", - " 'drones': 83707,\n", - " 'humphery': 100032,\n", - " 'keener': 85957,\n", - " 'scrutinize': 56994,\n", - " 'sper': 74820,\n", - " 'spider': 63131,\n", - " 'tub': 70365}\n", + " 'attrition': 74820,\n", + " 'auxiliaries': 63131,\n", + " 'burghley': 64425,\n", + " 'deficiencies': 85957,\n", + " 'establishing': 81435,\n", + " 'glocestershire': 59044,\n", + " 'imitate': 70365,\n", + " 'memorialise': 57951,\n", + " 'percival': 84261,\n", + " 'robust': 56994,\n", + " 'sandgate': 173308,\n", + " 'sizarships': 83707,\n", + " 'tend': 94617,\n", + " 'thefree': 100032}\n", "common_words:\n", " None\n" ] @@ -1248,7 +1248,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "collapsed": false }, @@ -1258,8 +1258,7 @@ "output_type": "stream", "text": [ "loading existing corpus\n", - "MmCorpus(315358 documents, 45481 features, 29827150 non-zero entries)\n", - "\n" + "MmCorpus(315358 documents, 45481 features, 29827150 non-zero entries)\n" ] } ], @@ -1305,19 +1304,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(1907, 1.0), (4491, 1.0), (8955, 1.0), (13486, 1.0), (14546, 1.0), (14644, 1.0), (15567, 1.0), (16373, 1.0), (16683, 1.0), (18849, 1.0), (21106, 1.0), (24518, 1.0), (25772, 1.0), (25828, 1.0), (25944, 1.0), (28305, 1.0), (29049, 1.0), (31323, 1.0), (31863, 1.0), (32063, 2.0), (33605, 1.0), (34427, 1.0), (36316, 1.0), (37134, 1.0), (37799, 1.0), (39896, 1.0), (42602, 1.0), (43436, 1.0), (45076, 1.0)]\n" - ] - } - ], + "outputs": [], "source": [ "# documents in the BOW corpus are sparse vectors where each element represents term_id, count; respectively.\n", "print(BOW_corpus[2])" @@ -1325,26 +1316,18 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(1907, 0.14766152089841653), (4491, 0.1387403653892339), (8955, 0.10727026241684091), (13486, 0.11112409499224919), (14546, 0.29450065189975705), (14644, 0.1279928005727261), (15567, 0.2396274317588969), (16373, 0.21141867583249677), (16683, 0.17943524701582836), (18849, 0.11882169557818345), (21106, 0.1508847244229395), (24518, 0.20982305099497528), (25772, 0.08751790023430829), (25828, 0.19243793443494436), (25944, 0.14290500619637247), (28305, 0.34189690585416327), (29049, 0.1110436410668854), (31323, 0.14147048220269118), (31863, 0.07517281470377087), (32063, 0.26205615504554314), (33605, 0.3125178807326799), (34427, 0.06132360068253787), (36316, 0.2838166351646873), (37134, 0.21829363918943648), (37799, 0.056688229857271066), (39896, 0.0311257060663129), (42602, 0.06981254287994763), (43436, 0.1701679157785267), (45076, 0.24582700779476782)]\n" - ] - } - ], + "outputs": [], "source": [ "print TFIDF_corpus[2]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "collapsed": false }, diff --git a/data/03 PCA .ipynb b/data/03 PCA .ipynb index 2b770f5..6a8c8fe 100644 --- a/data/03 PCA .ipynb +++ b/data/03 PCA .ipynb @@ -21,17 +21,18 @@ }, "outputs": [ { - "ename": "ValueError", - "evalue": "total size of new array must be unchanged", + "ename": "TypeError", + "evalue": "Can't convert 'list' object to str implicitly", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msparse_tfidf_corpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmmread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./data/large_files/tfidf_corpus.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdense_tfidf_corpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msparse_tfidf_corpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mmmread\u001b[0;34m(source)\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0mMatrix\u001b[0m \u001b[0mMarket\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \"\"\"\n\u001b[0;32m---> 76\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mMMFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 77\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;31m# -----------------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, source)\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 415\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_body\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 416\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 417\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36m_parse_body\u001b[0;34m(self, stream)\u001b[0m\n\u001b[1;32m 590\u001b[0m \u001b[0mV\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimag\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflat_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 592\u001b[0;31m \u001b[0mflat_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflat_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 593\u001b[0m \u001b[0mI\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mascontiguousarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflat_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'intc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[0mJ\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mascontiguousarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflat_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'intc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: total size of new array must be unchanged" + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mmmread\u001b[0;34m(source)\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0mMatrix\u001b[0m \u001b[0mMarket\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \"\"\"\n\u001b[0;32m---> 76\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mMMFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 77\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;31m# -----------------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, source)\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 414\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 415\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_body\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 416\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36m_parse_header\u001b[0;34m(self, stream)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_parse_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0mrows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentries\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfield\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msymmetry\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 478\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 479\u001b[0m self._init_attrs(rows=rows, cols=cols, entries=entries, format=format,\n\u001b[1;32m 480\u001b[0m field=field, symmetry=symmetry)\n", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36minfo\u001b[0;34m(self, source)\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 256\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Header line not of length 3: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 257\u001b[0m \u001b[0mrows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: Can't convert 'list' object to str implicitly" ] } ], diff --git a/data/LDA Topic modelling.ipynb b/data/LDA Topic modelling.ipynb index 4dede68..839594b 100644 --- a/data/LDA Topic modelling.ipynb +++ b/data/LDA Topic modelling.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": { "collapsed": true }, @@ -19,58 +19,148 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { - "ename": "UnicodeDecodeError", - "evalue": "'ascii' codec can't decode byte 0xae in position 2: ordinal not in range(128)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdictionary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDICT_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mcorpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcorpora\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMmCorpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mCORPUS_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mlda_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mldamulticore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLdaMulticore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mLDA_MODEL_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/gensim/models/ldamodel.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(cls, fname, *args, **kwargs)\u001b[0m\n\u001b[1;32m 973\u001b[0m \"\"\"\n\u001b[1;32m 974\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'mmap'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'mmap'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 975\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mLdaModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 976\u001b[0m \u001b[0mstate_fname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_extension\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'.state'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 977\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/gensim/utils.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(cls, fname, mmap)\u001b[0m\n\u001b[1;32m 246\u001b[0m \u001b[0mcompress\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msubname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSaveLoad\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_adapt_by_suffix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 248\u001b[0;31m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0munpickle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 249\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_load_specials\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmmap\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcompress\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msubname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/gensim/utils.py\u001b[0m in \u001b[0;36munpickle\u001b[0;34m(fname)\u001b[0m\n\u001b[1;32m 910\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 911\u001b[0m \u001b[0;31m# Because of loading from S3 load can't be used (missing readline in smart_open)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 912\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_pickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 913\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 914\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xae in position 2: ordinal not in range(128)" + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary(70996 unique tokens: ['barttelot', 'gets', 'prohibiting', 'fidecarried', 'comfort']...)\n", + "MmCorpus(315358 documents, 45481 features, 29827150 non-zero entries)\n", + "LdaModel(num_terms=45481, num_topics=100, decay=0.5, chunksize=2000)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/IPython/core/formatters.py:92: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead.\n", + " def _ipython_display_formatter_default(self):\n", + "/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/IPython/core/formatters.py:669: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.\n", + " def _singleton_printers_default(self):\n", + "/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/IPython/core/formatters.py:672: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.\n", + " def _type_printers_default(self):\n", + "/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/IPython/core/formatters.py:677: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.\n", + " def _deferred_printers_default(self):\n" ] + }, + { + "data": { + "text/plain": [ + "[(87,\n", + " '0.016*38433 + 0.013*935 + 0.012*36661 + 0.010*5152 + 0.010*22699 + 0.010*34810 + 0.008*999 + 0.008*28254 + 0.008*40389 + 0.008*2257'),\n", + " (8,\n", + " '0.020*31257 + 0.018*31992 + 0.017*29640 + 0.017*38795 + 0.013*5952 + 0.012*33987 + 0.012*5314 + 0.011*5334 + 0.009*38190 + 0.009*43507'),\n", + " (96,\n", + " '0.016*34291 + 0.014*30942 + 0.013*37865 + 0.011*28943 + 0.010*8955 + 0.010*37251 + 0.010*20586 + 0.009*36779 + 0.009*40854 + 0.008*18992'),\n", + " (71,\n", + " '0.065*634 + 0.062*18562 + 0.055*25049 + 0.018*39896 + 0.018*45132 + 0.012*29433 + 0.010*16745 + 0.009*44007 + 0.007*12491 + 0.007*8643'),\n", + " (28,\n", + " '0.034*37445 + 0.031*10965 + 0.026*40014 + 0.021*40377 + 0.020*6510 + 0.017*29364 + 0.015*27949 + 0.012*25777 + 0.011*37546 + 0.010*42255'),\n", + " (41,\n", + " '0.115*25029 + 0.020*35628 + 0.014*6207 + 0.010*10162 + 0.009*34600 + 0.008*26389 + 0.008*13455 + 0.008*22699 + 0.007*33740 + 0.007*34739'),\n", + " (62,\n", + " '0.097*4525 + 0.024*42737 + 0.022*18562 + 0.013*35409 + 0.011*5688 + 0.011*10085 + 0.011*935 + 0.011*10610 + 0.010*29729 + 0.008*45404'),\n", + " (77,\n", + " '0.108*2028 + 0.032*13382 + 0.027*24001 + 0.020*14077 + 0.018*31603 + 0.013*4165 + 0.011*30340 + 0.010*2938 + 0.009*31912 + 0.009*42318'),\n", + " (48,\n", + " '0.014*13052 + 0.013*25029 + 0.010*20586 + 0.009*36779 + 0.008*38433 + 0.007*11511 + 0.007*935 + 0.006*22699 + 0.006*39195 + 0.006*6203'),\n", + " (42,\n", + " '0.042*5948 + 0.023*41226 + 0.016*13674 + 0.016*3969 + 0.015*28769 + 0.014*43495 + 0.010*39235 + 0.009*329 + 0.009*9698 + 0.009*16012')]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "dictionary = corpora.Dictionary.load(DICT_PATH)\n", "corpus = corpora.MmCorpus(CORPUS_PATH)\n", - "lda_model = models.ldamulticore.LdaMulticore.load(LDA_MODEL_PATH)\n", + "lda = models.ldamulticore.LdaMulticore.load(LDA_MODEL_PATH)\n", "print(dictionary)\n", "print(corpus)\n", - "print(lda_model)" + "print(lda)\n", + "lda.show_topics()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "IndexError", + "evalue": "index 52046 is out of bounds for axis 1 with size 45481", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgensim\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlda_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdictionary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/pyLDAvis/gensim.py\u001b[0m in \u001b[0;36mprepare\u001b[0;34m(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0mSee\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepare\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 96\u001b[0m \"\"\"\n\u001b[0;32m---> 97\u001b[0;31m \u001b[0mopts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_extract_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtopic_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdictionary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdoc_topic_dist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 98\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvis_prepare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mopts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/pyLDAvis/gensim.py\u001b[0m in \u001b[0;36m_extract_data\u001b[0;34m(topic_model, corpus, dictionary, doc_topic_dists)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0mbeta\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.01\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mfnames_argsort\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdictionary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoken2id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mint_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mterm_freqs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcorpus_csc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfnames_argsort\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0mterm_freqs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mterm_freqs\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbeta\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mdoc_lengths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcorpus_csc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: index 52046 is out of bounds for axis 1 with size 45481" + ] + } + ], + "source": [ + "import pyLDAvis.gensim\n", + "\n", + "pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from nltk.tag.stanford import StanfordNERTagger" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sent = 'The 1975 novel High-Rise depicted an apocalyptic tower that drove its inhabitants insane. As a new film adaptation hits cinemas, we wonder what the author would have made of today’s rash of skyscrapers for the megarich'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { - "ename": "ImportError", - "evalue": "No module named pyLDAvis", + "ename": "LookupError", + "evalue": "\n\n===========================================================================\n NLTK was unable to find stanford-ner.jar! Set the CLASSPATH\n environment variable.\n\n For more information, on stanford-ner.jar, see:\n \n===========================================================================", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menable_notebook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdictionary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mImportError\u001b[0m: No module named pyLDAvis" + "\u001b[0;31mLookupError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mStanfordNERTagger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/nltk/tag/stanford.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 168\u001b[0;31m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mStanfordNERTagger\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 169\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/nltk/tag/stanford.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, model_filename, path_to_jar, encoding, verbose, java_options)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_JAR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath_to_jar\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0msearchpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_stanford_url\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 53\u001b[0;31m verbose=verbose)\n\u001b[0m\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m self._stanford_model = find_file(model_filename,\n", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/nltk/__init__.py\u001b[0m in \u001b[0;36mfind_jar\u001b[0;34m(name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex)\u001b[0m\n\u001b[1;32m 717\u001b[0m searchpath=(), url=None, verbose=True, is_regex=False):\n\u001b[1;32m 718\u001b[0m return next(find_jar_iter(name_pattern, path_to_jar, env_vars,\n\u001b[0;32m--> 719\u001b[0;31m searchpath, url, verbose, is_regex))\n\u001b[0m\u001b[1;32m 720\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 721\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/nltk/__init__.py\u001b[0m in \u001b[0;36mfind_jar_iter\u001b[0;34m(name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex)\u001b[0m\n\u001b[1;32m 712\u001b[0m (name_pattern, url))\n\u001b[1;32m 713\u001b[0m \u001b[0mdiv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'='\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;36m75\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 714\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n\\n%s\\n%s\\n%s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdiv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdiv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 715\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 716\u001b[0m def find_jar(name_pattern, path_to_jar=None, env_vars=(),\n", + "\u001b[0;31mLookupError\u001b[0m: \n\n===========================================================================\n NLTK was unable to find stanford-ner.jar! Set the CLASSPATH\n environment variable.\n\n For more information, on stanford-ner.jar, see:\n \n===========================================================================" ] } ], "source": [ - "import pyLDAvis\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.gensim.prepare(lda, corpus, dictionary)" + "StanfordNERTagger(sent)" ] }, { diff --git a/data/data/large_files/lda.model b/data/data/large_files/lda.model index 9c0180a..7dcb15e 100644 Binary files a/data/data/large_files/lda.model and b/data/data/large_files/lda.model differ diff --git a/data/data/large_files/lda.model.state b/data/data/large_files/lda.model.state index 23ab942..6fb8c3a 100644 Binary files a/data/data/large_files/lda.model.state and b/data/data/large_files/lda.model.state differ diff --git a/data/generate_lda_model.py b/data/generate_lda_model.py index 2cc03e1..7084fde 100644 --- a/data/generate_lda_model.py +++ b/data/generate_lda_model.py @@ -9,10 +9,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) -if os.path.exists(LDA_MODEL_PATH): - # load dictionary - lda_model = models.ldamulticore.LdaMulticore.load(LDA_MODEL_PATH) -else: +if not os.path.exists(LDA_MODEL_PATH): dictionary = corpora.Dictionary.load(DICT_PATH) corpus = corpora.MmCorpus(CORPUS_PATH) diff --git a/data/requirements.txt b/data/requirements.txt index ddbaed3..454c1ae 100644 --- a/data/requirements.txt +++ b/data/requirements.txt @@ -6,13 +6,13 @@ CacheControl==0.11.6 contextlib2==0.5.1 cycler==0.10.0 decorator==4.0.9 -funcy==1.6 +funcy==1.7.1 future==0.15.2 gensim==0.12.4 gnureadline==6.3.3 httpretty==0.8.10 ipykernel==4.3.1 -ipython==4.1.1 +ipython==4.1.2 ipython-genutils==0.1.0 ipywidgets==4.1.1 Jinja2==2.8 @@ -34,21 +34,21 @@ nose==1.3.7 notebook==4.1.0 numexpr==2.5 numpy==1.10.4 -pandas==0.17.1 +pandas==0.18.0 path.py==8.1.2 pexpect==4.0.1 pickleshare==0.6 ptyprocess==0.5.1 py==1.4.31 Pygments==2.1.3 -pyLDAvis==1.4.1 +pyLDAvis==1.5.0 pyparsing==2.1.0 -pytest==2.8.7 -python-dateutil==2.4.2 +pytest==2.9.0 +python-dateutil==2.5.0 pytz==2015.7 pyzmq==15.2.0 qtconsole==4.2.0 -requests==2.8.1 +requests==2.9.1 scikit-bio==0.4.2 scikit-learn==0.17.1 scipy==0.17.0