From 542f5a90d76af0eefe48ae1958e5235fc6334886 Mon Sep 17 00:00:00 2001
From: LiangMa
Date: Sat, 15 Jun 2019 12:37:00 +0800
Subject: [PATCH] ready to push up to github

---
 P102_roll_dice.py       | 57 +++++++++++++++++++++++++++++++++++++++++
 P111_SND.py             | 38 +++++++++++++++++++++++++++
 P12_tweet_analysis.py   | 10 ++++++++
 P18_advtise_analysis.py | 20 +++++++++++++++
 P19_JD_analysis.py      | 30 ++++++++++++++++++++++
 P19_JD_analysis_2.py    | 38 +++++++++++++++++++++++++++
 P36_data_mean_median.py | 16 ++++++++++++
 P38_data_sd_analysis.py | 40 +++++++++++++++++++++++++++++
 P66_movie.py            | 29 +++++++++++++++++++++
 P71_set.py              | 20 +++++++++++++++
 P74_Jaccard.py          | 27 +++++++++++++++++++
 P77_user_and_movie.py   | 32 +++++++++++++++++++++++
 P83_large_number.py     | 26 +++++++++++++++++++
 P83_large_number_1.py   | 31 ++++++++++++++++++++++
 Test_Matrix.py          | 29 +++++++++++++++++++++
 coin_study.py           | 32 +++++++++++++++++++++++
 prac_bs4.py             | 10 ++++++++
 17 files changed, 485 insertions(+)
 create mode 100644 P102_roll_dice.py
 create mode 100644 P111_SND.py
 create mode 100644 P12_tweet_analysis.py
 create mode 100644 P18_advtise_analysis.py
 create mode 100644 P19_JD_analysis.py
 create mode 100644 P19_JD_analysis_2.py
 create mode 100644 P36_data_mean_median.py
 create mode 100644 P38_data_sd_analysis.py
 create mode 100644 P66_movie.py
 create mode 100644 P71_set.py
 create mode 100644 P74_Jaccard.py
 create mode 100644 P77_user_and_movie.py
 create mode 100644 P83_large_number.py
 create mode 100644 P83_large_number_1.py
 create mode 100644 Test_Matrix.py
 create mode 100644 coin_study.py
 create mode 100644 prac_bs4.py

diff --git a/P102_roll_dice.py b/P102_roll_dice.py
new file mode 100644
index 0000000..313f1b9
--- /dev/null
+++ b/P102_roll_dice.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import random
+#import numpy as np
+#import pandas as pd
+from matplotlib import pyplot as plt
+
+def random_dice_roll():
+    # one fair six-sided die roll
+    return random.randint(1, 6)
+
+
+trials_list = []
+std_deviation = 0.0
+
+num_trials = 1000
+for trial in range(num_trials):
+    trials_list.append(random_dice_roll())
+
+avg = sum(trials_list) / float(num_trials)
+
+for trial in range(num_trials):
+    std_deviation += (trials_list[trial] - avg) ** 2
+
+# divide by n-1 for the sample variance, then take the square root
+std_deviation = (std_deviation / (num_trials - 1)) ** 0.5
+
+print(trials_list)
+print(avg)
+print(std_deviation)
+
+'''
+num_trials = range(100, 1000, 10)
+avgs_list = []
+std_dev_list = []
+
+for num_trial in num_trials:
+    std_deviation = 0.0
+    trials_list = []
+    for trial in range(num_trial):
+        trials_list.append(random_dice_roll())
+    avg = sum(trials_list) / float(num_trial)
+    avgs_list.append(avg)
+
+    for trial in range(num_trial):
+        std_deviation += (trials_list[trial] - avg) ** 2
+
+    std_deviation = (std_deviation / (num_trial - 1)) ** 0.5
+    std_dev_list.append(std_deviation)
+
+plt.plot(num_trials, avgs_list)
+plt.xlabel('Number of Trials')
+plt.ylabel('Average')
+plt.show()
+'''
diff --git a/P111_SND.py b/P111_SND.py
new file mode 100644
index 0000000..7031dc9
--- /dev/null
+++ b/P111_SND.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+#import random
+import numpy as np
+#import pandas as pd
+from matplotlib import pyplot as plt
+
+def normal_pdf(x, mu=0, sigma=1):
+    # normal density: exp(-(x-mu)^2 / (2*sigma^2)) / sqrt(2*pi*sigma^2)
+    return (1. / np.sqrt(2 * np.pi * sigma ** 2)) * np.exp(-(x - mu) ** 2 / (2 * sigma ** 2))
+
+
+x_values = np.linspace(-5, 5, 100)
+y_values = [normal_pdf(x) for x in x_values]
+
+plt.plot(x_values, y_values)
+plt.show()
+
+'''
+from scipy.stats import norm
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def normal_pdf(x, mean, sigma):
+    return (1. / np.sqrt(2 * np.pi * sigma ** 2)) * np.exp(-(x - mean) ** 2 / (2 * sigma ** 2))
+
+x = np.arange(-5, 5, 0.01)
+mean = 0
+sigma = 1
+y = normal_pdf(x, mean, sigma)
+
+#y = norm.pdf(x, mean, sigma)
+plt.plot(x, y)
+plt.xlabel('x')
+plt.ylabel('y')
+plt.show()
+'''
diff --git a/P12_tweet_analysis.py b/P12_tweet_analysis.py
new file mode 100644
index 0000000..4e722ad
--- /dev/null
+++ b/P12_tweet_analysis.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+tweet_msg = "RT @robdv: $TWTR now top holding for Andor, unseating $AAPL"
+
+words_in_tweet = tweet_msg.split(' ')  # split the tweet message into words
+for word in words_in_tweet:            # scan each word for a cashtag
+    if "$" in word:
+        print("This Tweet is about", word)
+
diff --git a/P18_advtise_analysis.py b/P18_advtise_analysis.py
new file mode 100644
index 0000000..69014f8
--- /dev/null
+++ b/P18_advtise_analysis.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# %matplotlib inline
+data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
+
+print(data.head())
+print(data)
+
+# scatter plots of sales against each advertising channel
+sns.set(style="ticks", color_codes=True)
+g = sns.pairplot(data, x_vars=["TV", "radio", "newspaper"], y_vars="sales")  # , height=4.5, aspect=0.7
+
+plt.show()
diff --git a/P19_JD_analysis.py b/P19_JD_analysis.py
new file mode 100644
index 0000000..12928fa
--- /dev/null
+++ b/P19_JD_analysis.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import requests  # used to grab data from the web
+
+from bs4 import BeautifulSoup  # used to parse HTML
+
+from sklearn.feature_extraction.text import CountVectorizer
+
+jd_texts = []
+
+for index in range(0, 30, 10):  # index runs over 0, 10, 20: the first three result pages
+    page = 'https://www.indeed.com/jobs?q=data+scientist&start=' + str(index)
+    print("grabbing data from web site...", index)
+
+    web_result = requests.get(page).text  # use requests to actually visit the URL specified by page
+
+    soup = BeautifulSoup(web_result, "html.parser")
+
+    for listing in soup.findAll('span', {'class': 'summary'}):
+        jd_texts.append(listing.text)
+
+print("Finished grabbing data from web site, now starting analysis...")
+
+print(type(jd_texts))
+vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
+matrix = vect.fit_transform(jd_texts)
+print(len(vect.get_feature_names()))
+
diff --git a/P19_JD_analysis_2.py b/P19_JD_analysis_2.py
new file mode 100644
index 0000000..f3099ec
--- /dev/null
+++ b/P19_JD_analysis_2.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import requests
+from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import CountVectorizer
+
+# grab postings from the web
+texts = []
+
+for i in range(0, 100, 10):  # cycle through the first ten pages of indeed job results
+
+    soup = BeautifulSoup(requests.get('http://www.indeed.com/jobs?q=data+scientist&start=' + str(i)).text, 'html.parser')
+    print("Reading the web message, range index=", i)
+    #print(soup.prettify())
+    texts += [a.text for a in soup.findAll('span', {'class': 'summary'})]
+    #print(texts)
+
+#print(type(texts))
+
+print(texts)
+#print(texts[0])  # first job description
+
+vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
+# make a count vectorizer to get basic counts
+
+matrix = vect.fit_transform(texts)
+# fit and learn the vocabulary of the corpus
+
+print(len(vect.get_feature_names()))  # how many features are there
+
+freqs = [(word, matrix.getcol(idx).sum()) for word, idx in vect.vocabulary_.items()]
+
+# sort from largest to smallest and print the 25 most frequent phrases
+for phrase, times in sorted(freqs, key=lambda x: -x[1])[:25]:
+    print(phrase, times)
diff --git a/P36_data_mean_median.py b/P36_data_mean_median.py
new file mode 100644
index 0000000..386b858
--- /dev/null
+++ b/P36_data_mean_median.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy
+
+original_data = [5,4,3,4,5,3,2,5,3,2,1,4,5,3,4,4,5,4,2,1,4,5,4,3,2,4,4,5,4,3,2,1]
+
+sorted_data = sorted(original_data)
+
+print('original data are:', original_data)
+
+print('sorted data are:', sorted_data)
+
+print('mean of data is:', numpy.mean(original_data))
+
+print('median of data is:', numpy.median(original_data))
diff --git a/P38_data_sd_analysis.py b/P38_data_sd_analysis.py
new file mode 100644
index 0000000..0f693e3
--- /dev/null
+++ b/P38_data_sd_analysis.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy
+
+temperature = [31,32,32,31,28,29,31,38,32,31,30,29,30,31,26]
+
+sorted_temp = sorted(temperature)
+mean_temp = numpy.mean(temperature)
+
+print('original temperatures are:', temperature)
+
+print('sorted temperatures are:', sorted_temp)
+
+print('mean of temperature is:', mean_temp)
+
+print('median of temperature is:', numpy.median(temperature))
+
+print('\n')
+
+squared_diff_list = []
+
+num_items = len(temperature)
+product = 1.
+
+
+for temp in temperature:
+    diff = temp - mean_temp
+    squared_diff_list.append(diff ** 2)  # squared deviation from the mean
+    product *= temp                      # running product for the geometric mean
+
+average_squared_diff = numpy.mean(squared_diff_list)
+
+standard_deviation = numpy.sqrt(average_squared_diff)
+geometric_mean = product ** (1. / num_items)
+
+print('Standard Deviation =', standard_deviation)
+print('\n')
+print('Geometric Mean =', geometric_mean)
+
diff --git a/P66_movie.py b/P66_movie.py
new file mode 100644
index 0000000..5b7915f
--- /dev/null
+++ b/P66_movie.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+user_fever_explain = np.array(['comic', 'love', 'act', 'war'])
+user_fever = np.array([5, 1, 3, 4])
+
+movie1_egen = np.array([3, 2, 3, 1])
+movie2_egen = np.array([5, 5, 1, 3])
+
+# dot product of the user's genre preferences with each movie's genre profile
+relevant_1 = user_fever.dot(movie1_egen)
+relevant_2 = user_fever.dot(movie2_egen)
+
+print('user_fever =', user_fever_explain)
+print('\n')
+print('movie1_egen =', movie1_egen)
+print('movie1 relevant score =', relevant_1)
+print('movie2_egen =', movie2_egen)
+print('movie2 relevant score =', relevant_2)
+print('\n')
+
+
+if relevant_1 > relevant_2:
+    print("movie1 is the better choice!")
+elif relevant_1 < relevant_2:
+    print("movie2 is the better choice!")