Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 10 additions & 58 deletions kMeansClustering.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": false
},
Expand Down Expand Up @@ -71,31 +71,22 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Retrieving data from NIST\n",
"Last item in list retrieved: A New NIST Online Database: The NIST Polycyclic Aromatic Hydrocarbon Structure Index Recently, a new website containing a wealth of information on polycyclic aromatic hydrocarbons (PAHs) was made publicly available by NIST. PAHs are compounds that are produced during the … \n"
]
}
],
"outputs": [],
"source": [
"print(\"Retrieving data from NIST\")\n",
"\n",
"#retrieve the data from the web page\n",
"page = requests.get('http://www.nist.gov/allnews.cfm?s=01-01-2014&e=12-31-2014') \n",
"page = requests.get('https://www.nist.gov/news-events/news/search?combine=&field_campus_tid=All&term_node_tid_depth_1=All&date_filter%5Bmin%5D%5Bdate%5D=January+01%2C+2014&date_filter%5Bmax%5D%5Bdate%5D=June+30%2C+2014&items_per_page=200') \n",
"#use html module to parse it out and store in tree\n",
"tree = html.fromstring(page.content)\n",
"\n",
"#create list of news headlines and descriptions. This required obtaining the XPath of the elements by examining the web page.\n",
"list_of_headlines = tree.xpath('//div[@class=\"select_portal_module_wrapper\"]/a/strong/text()')\n",
"list_of_descriptions = tree.xpath('//div[@class=\"select_portal_module_wrapper\"]/p/text()')\n",
"list_of_headlines = tree.xpath('//h3[@class=\"nist-teaser__title\"]/a/text()')\n",
"list_of_descriptions = tree.xpath('//div[@class=\"field-body field--body nist-body nist-teaser__content\"]/text()')\n",
"#combine each headline and description into one value in a list\n",
"news=[]\n",
"for each_headline in list_of_headlines:\n",
Expand All @@ -121,23 +112,12 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting features from the training dataset using a sparse vectorizer\n",
"done in 3.459507s\n",
"n_samples: 110224, n_features: 12172\n",
"\n"
]
}
],
"outputs": [],
"source": [
"print(\"Extracting features from the training dataset using a sparse vectorizer\")\n",
"t0 = time()\n",
Expand Down Expand Up @@ -165,39 +145,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=15, n_init=10,\n",
" n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,\n",
" verbose=0)\n",
"done in 80.027s\n",
"\n",
"Top terms per cluster:\n",
"Cluster 0: final house technology standards national version 2014 institute released roadmap\n",
"Cluster 1: standards technology institute national new researchers based atomic research demonstrated\n",
"Cluster 2: standards md measurement national technology gaithersburg institute campus march employees\n",
"Cluster 3: developed new optical institute technology standards national years technique ago\n",
"Cluster 4: test metal left ptir photothermal lateral resonance combines object resolution\n",
"Cluster 5: committee technology visiting vcat primary advanced congress advisory report president\n",
"Cluster 6: device medical chip devices scale used gas click light materials\n",
"Cluster 7: manufacturing extension hollings partnership mep new technology standards institute national\n",
"Cluster 8: forensic science standards research new national technology institute committees organization\n",
"Cluster 9: draft public comment standards review technology national institute issued federal\n",
"Cluster 10: cnst nanoscale center science director 2014 edition silicon recent spring\n",
"Cluster 11: workshop developing hold 2014 workshops devoted community disaster privacy resilience\n",
"Cluster 12: health care annual hospitals healthcare performance costs 100 early better\n",
"Cluster 13: baldrige award excellence program performance malcolm organizations quality 2014 penny\n",
"Cluster 14: dimensional metrology pml semiconductor division neutron ultra colleagues challenge electron\n"
]
}
],
"outputs": [],
"source": [
"#number of clusters = 15, since NIST has 15 subject areas\n",
"k = 15\n",
Expand Down