From f9d217d6a73c869f5e4f7c866457aad021137f98 Mon Sep 17 00:00:00 2001 From: Nicole Date: Thu, 17 Nov 2016 22:24:04 -0500 Subject: [PATCH] updated data retrieval based on changes to NIST site --- kMeansClustering.ipynb | 68 +++++++----------------------------------- 1 file changed, 10 insertions(+), 58 deletions(-) diff --git a/kMeansClustering.ipynb b/kMeansClustering.ipynb index cbf3665..4578bf6 100644 --- a/kMeansClustering.ipynb +++ b/kMeansClustering.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": false }, @@ -71,31 +71,22 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Retrieving data from NIST\n", - "Last item in list retrieved: A New NIST Online Database: The NIST Polycyclic Aromatic Hydrocarbon Structure Index Recently, a new website containing a wealth of information on polycyclic aromatic hydrocarbons (PAHs) was made publicly available by NIST. PAHs are compounds that are produced during the … \n" - ] - } - ], + "outputs": [], "source": [ "print(\"Retrieving data from NIST\")\n", "\n", "#retrieve the data from the web page\n", - "page = requests.get('http://www.nist.gov/allnews.cfm?s=01-01-2014&e=12-31-2014') \n", + "page = requests.get('https://www.nist.gov/news-events/news/search?combine=&field_campus_tid=All&term_node_tid_depth_1=All&date_filter%5Bmin%5D%5Bdate%5D=January+01%2C+2014&date_filter%5Bmax%5D%5Bdate%5D=June+30%2C+2014&items_per_page=200') \n", "#use html module to parse it out and store in tree\n", "tree = html.fromstring(page.content)\n", "\n", "#create list of news headlines and descriptions. This required obtaining the XPath of the elements by examining the web page.\n", - "list_of_headlines = tree.xpath('//div[@class=\"select_portal_module_wrapper\"]/a/strong/text()')\n", - "list_of_descriptions = tree.xpath('//div[@class=\"select_portal_module_wrapper\"]/p/text()')\n", + "list_of_headlines = tree.xpath('//h3[@class=\"nist-teaser__title\"]/a/text()')\n", + "list_of_descriptions = tree.xpath('//div[@class=\"field-body field--body nist-body nist-teaser__content\"]/text()')\n", "#combine each headline and description into one value in a list\n", "news=[]\n", "for each_headline in list_of_headlines:\n", @@ -121,23 +112,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "collapsed": false, "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extracting features from the training dataset using a sparse vectorizer\n", - "done in 3.459507s\n", - "n_samples: 110224, n_features: 12172\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "print(\"Extracting features from the training dataset using a sparse vectorizer\")\n", "t0 = time()\n", @@ -165,39 +145,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=15, n_init=10,\n", - " n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,\n", - " verbose=0)\n", - "done in 80.027s\n", - "\n", - "Top terms per cluster:\n", - "Cluster 0: final house technology standards national version 2014 institute released roadmap\n", - "Cluster 1: standards technology institute national new researchers based atomic research demonstrated\n", - "Cluster 2: standards md measurement national technology gaithersburg institute campus march employees\n", - "Cluster 3: developed new optical institute technology standards national years technique ago\n", - "Cluster 4: test metal left ptir photothermal lateral resonance combines object resolution\n", - "Cluster 5: committee technology visiting vcat primary advanced congress advisory report president\n", - "Cluster 6: device medical chip devices scale used gas click light materials\n", - "Cluster 7: manufacturing extension hollings partnership mep new technology standards institute national\n", - "Cluster 8: forensic science standards research new national technology institute committees organization\n", - "Cluster 9: draft public comment standards review technology national institute issued federal\n", - "Cluster 10: cnst nanoscale center science director 2014 edition silicon recent spring\n", - "Cluster 11: workshop developing hold 2014 workshops devoted community disaster privacy resilience\n", - "Cluster 12: health care annual hospitals healthcare performance costs 100 early better\n", - "Cluster 13: baldrige award excellence program performance malcolm organizations quality 2014 penny\n", - "Cluster 14: dimensional metrology pml semiconductor division neutron ultra colleagues challenge electron\n" - ] - } - ], + "outputs": [], "source": [ "#number of clusters = 15, since NIST has 15 subject areas\n", "k = 15\n",