Skip to content

Commit

Permalink
updated
Browse files Browse the repository at this point in the history
  • Loading branch information
timini committed Mar 14, 2016
1 parent 12a88f5 commit cfedceb
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 88 deletions.
75 changes: 29 additions & 46 deletions data/02 Data cleaning, vector representations.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {
"collapsed": false
},
Expand All @@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {
"collapsed": false
},
Expand Down Expand Up @@ -732,7 +732,7 @@
"[1576 rows x 5 columns]"
]
},
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -744,7 +744,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"collapsed": false
},
Expand All @@ -758,11 +758,11 @@
"each document represents one speaker\n",
"\n",
"processing document 1 of 170\n",
"took 0.0732870101928711\n",
"took 0.011493921279907227\n",
"processing document 2 of 170\n",
"took 1.2330009937286377\n",
"took 0.24931097030639648\n",
"processing document 3 of 170\n",
"took 1.8079171180725098\n"
"took 0.6408729553222656\n"
]
},
{
Expand Down Expand Up @@ -1093,7 +1093,7 @@
"[170 rows x 1 columns]"
]
},
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1196,20 +1196,20 @@
"common_words:\n",
" None\n",
"{None: 150456,\n",
" 'annoyed': 173308,\n",
" 'asterisks': 64425,\n",
" 'brunt': 59044,\n",
" 'built': 57951,\n",
" 'claimant': 84261,\n",
" 'contortions': 94617,\n",
" 'crutch': 81435,\n",
" 'drones': 83707,\n",
" 'humphery': 100032,\n",
" 'keener': 85957,\n",
" 'scrutinize': 56994,\n",
" 'sper': 74820,\n",
" 'spider': 63131,\n",
" 'tub': 70365}\n",
" 'attrition': 74820,\n",
" 'auxiliaries': 63131,\n",
" 'burghley': 64425,\n",
" 'deficiencies': 85957,\n",
" 'establishing': 81435,\n",
" 'glocestershire': 59044,\n",
" 'imitate': 70365,\n",
" 'memorialise': 57951,\n",
" 'percival': 84261,\n",
" 'robust': 56994,\n",
" 'sandgate': 173308,\n",
" 'sizarships': 83707,\n",
" 'tend': 94617,\n",
" 'thefree': 100032}\n",
"common_words:\n",
" None\n"
]
Expand Down Expand Up @@ -1248,7 +1248,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"collapsed": false
},
Expand All @@ -1258,8 +1258,7 @@
"output_type": "stream",
"text": [
"loading existing corpus\n",
"MmCorpus(315358 documents, 45481 features, 29827150 non-zero entries)\n",
"<gensim.interfaces.TransformedCorpus object at 0x10b215f60>\n"
"MmCorpus(315358 documents, 45481 features, 29827150 non-zero entries)\n"
]
}
],
Expand Down Expand Up @@ -1305,46 +1304,30 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1907, 1.0), (4491, 1.0), (8955, 1.0), (13486, 1.0), (14546, 1.0), (14644, 1.0), (15567, 1.0), (16373, 1.0), (16683, 1.0), (18849, 1.0), (21106, 1.0), (24518, 1.0), (25772, 1.0), (25828, 1.0), (25944, 1.0), (28305, 1.0), (29049, 1.0), (31323, 1.0), (31863, 1.0), (32063, 2.0), (33605, 1.0), (34427, 1.0), (36316, 1.0), (37134, 1.0), (37799, 1.0), (39896, 1.0), (42602, 1.0), (43436, 1.0), (45076, 1.0)]\n"
]
}
],
"outputs": [],
"source": [
"# documents in the BOW corpus are sparse vectors where each element represents term_id, count; respectively.\n",
"print(BOW_corpus[2])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1907, 0.14766152089841653), (4491, 0.1387403653892339), (8955, 0.10727026241684091), (13486, 0.11112409499224919), (14546, 0.29450065189975705), (14644, 0.1279928005727261), (15567, 0.2396274317588969), (16373, 0.21141867583249677), (16683, 0.17943524701582836), (18849, 0.11882169557818345), (21106, 0.1508847244229395), (24518, 0.20982305099497528), (25772, 0.08751790023430829), (25828, 0.19243793443494436), (25944, 0.14290500619637247), (28305, 0.34189690585416327), (29049, 0.1110436410668854), (31323, 0.14147048220269118), (31863, 0.07517281470377087), (32063, 0.26205615504554314), (33605, 0.3125178807326799), (34427, 0.06132360068253787), (36316, 0.2838166351646873), (37134, 0.21829363918943648), (37799, 0.056688229857271066), (39896, 0.0311257060663129), (42602, 0.06981254287994763), (43436, 0.1701679157785267), (45076, 0.24582700779476782)]\n"
]
}
],
"outputs": [],
"source": [
"# the same document read from the TF-IDF corpus: each element is a (term_id, weight) pair.\n",
"print(TFIDF_corpus[2])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"collapsed": false
},
Expand Down
15 changes: 8 additions & 7 deletions data/03 PCA .ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,18 @@
},
"outputs": [
{
"ename": "ValueError",
"evalue": "total size of new array must be unchanged",
"ename": "TypeError",
"evalue": "Can't convert 'list' object to str implicitly",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-2-7dd3503e11ff>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msparse_tfidf_corpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmmread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'./data/large_files/tfidf_corpus.mm'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdense_tfidf_corpus\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msparse_tfidf_corpus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mmmread\u001b[0;34m(source)\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0mMatrix\u001b[0m \u001b[0mMarket\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \"\"\"\n\u001b[0;32m---> 76\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mMMFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 77\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;31m# -----------------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, source)\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 415\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_body\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 416\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 417\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tim/Workspace/projects/hansard/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36m_parse_body\u001b[0;34m(self, stream)\u001b[0m\n\u001b[1;32m 590\u001b[0m \u001b[0mV\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimag\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflat_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 592\u001b[0;31m \u001b[0mflat_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mflat_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 593\u001b[0m \u001b[0mI\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mascontiguousarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflat_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'intc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 594\u001b[0m \u001b[0mJ\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mascontiguousarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflat_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'intc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: total size of new array must be unchanged"
"\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mmmread\u001b[0;34m(source)\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[0mMatrix\u001b[0m \u001b[0mMarket\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \"\"\"\n\u001b[0;32m---> 76\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mMMFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 77\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 78\u001b[0m \u001b[0;31m# -----------------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, source)\u001b[0m\n\u001b[1;32m 412\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 414\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 415\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_body\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 416\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36m_parse_header\u001b[0;34m(self, stream)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_parse_header\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0mrows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentries\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfield\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msymmetry\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 478\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 479\u001b[0m self._init_attrs(rows=rows, cols=cols, entries=entries, format=format,\n\u001b[1;32m 480\u001b[0m field=field, symmetry=symmetry)\n",
"\u001b[0;32m/Users/tim/Workspace/projects/hansard/data/venv/lib/python3.5/site-packages/scipy/io/mmio.py\u001b[0m in \u001b[0;36minfo\u001b[0;34m(self, source)\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 256\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Header line not of length 3: \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 257\u001b[0m \u001b[0mrows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: Can't convert 'list' object to str implicitly"
]
}
],
Expand Down
Loading

0 comments on commit cfedceb

Please sign in to comment.