From 542f5a90d76af0eefe48ae1958e5235fc6334886 Mon Sep 17 00:00:00 2001
From: LiangMa
Date: Sat, 15 Jun 2019 12:37:00 +0800
Subject: [PATCH] ready to push up to github

---
 P102_roll_dice.py       | 57 +++++++++++++++++++++++++++++++++++++++++
 P111_SND.py             | 38 +++++++++++++++++++++++++++
 P12_tweet_analysis.py   | 10 ++++++++
 P18_advtise_analysis.py | 20 +++++++++++++++
 P19_JD_analysis.py      | 30 ++++++++++++++++++++++
 P19_JD_analysis_2.py    | 38 +++++++++++++++++++++++++++
 P36_data_mean_median.py | 16 ++++++++++++
 P38_data_sd_analysis.py | 40 +++++++++++++++++++++++++++++
 P66_movie.py            | 29 +++++++++++++++++++++
 P71_set.py              | 20 +++++++++++++++
 P74_Jaccard.py          | 27 +++++++++++++++++++
 P77_user_and_movie.py   | 32 +++++++++++++++++++++++
 P83_large_number.py     | 26 +++++++++++++++++++
 P83_large_number_1.py   | 31 ++++++++++++++++++++++
 Test_Matrix.py          | 29 +++++++++++++++++++++
 coin_study.py           | 32 +++++++++++++++++++++++
 prac_bs4.py             | 10 ++++++++
 17 files changed, 485 insertions(+)
 create mode 100644 P102_roll_dice.py
 create mode 100644 P111_SND.py
 create mode 100644 P12_tweet_analysis.py
 create mode 100644 P18_advtise_analysis.py
 create mode 100644 P19_JD_analysis.py
 create mode 100644 P19_JD_analysis_2.py
 create mode 100644 P36_data_mean_median.py
 create mode 100644 P38_data_sd_analysis.py
 create mode 100644 P66_movie.py
 create mode 100644 P71_set.py
 create mode 100644 P74_Jaccard.py
 create mode 100644 P77_user_and_movie.py
 create mode 100644 P83_large_number.py
 create mode 100644 P83_large_number_1.py
 create mode 100644 Test_Matrix.py
 create mode 100644 coin_study.py
 create mode 100644 prac_bs4.py

diff --git a/P102_roll_dice.py b/P102_roll_dice.py
new file mode 100644
index 0000000..313f1b9
--- /dev/null
+++ b/P102_roll_dice.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import random
+#import numpy as np
+#import pandas as pd
+from matplotlib import pyplot as plt
+
+def random_dice_roll():
+    # one fair six-sided die roll
+    return random.randint(1, 6)
+
+
+trials_list = []
+std_deviation = 0.0
+
+num_trials = 1000
+for trial in range(num_trials):
+    trials_list.append(random_dice_roll())
+
+avg = sum(trials_list) / float(num_trials)
+
+for trial in range(num_trials):
+    std_deviation += (trials_list[trial] - avg) ** 2
+
+# divide by n-1 for the sample variance, then take the square root
+std_deviation = (std_deviation / (num_trials - 1)) ** 0.5
+
+print(trials_list)
+print(avg)
+print(std_deviation)
+
+'''
+num_trials = range(100, 1000, 10)
+avgs_list = []
+std_dev_list = []
+
+for num_trial in num_trials:
+    std_deviation = 0.0
+    trials_list = []
+    for trial in range(num_trial):
+        trials_list.append(random_dice_roll())
+    avg = sum(trials_list) / float(num_trial)
+    avgs_list.append(avg)
+
+    for trial in range(num_trial):
+        std_deviation += (trials_list[trial] - avg) ** 2
+
+    std_deviation = (std_deviation / (num_trial - 1)) ** 0.5
+    std_dev_list.append(std_deviation)
+
+plt.plot(num_trials, avgs_list)
+plt.xlabel('Number of Trials')
+plt.ylabel('Average')
+plt.show()
+'''
diff --git a/P111_SND.py b/P111_SND.py
new file mode 100644
index 0000000..7031dc9
--- /dev/null
+++ b/P111_SND.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+#import random
+import numpy as np
+#import pandas as pd
+from matplotlib import pyplot as plt
+
+def normal_pdf(x, mu=0, sigma=1):
+    # normal density: exp(-(x-mu)^2 / (2*sigma^2)) / sqrt(2*pi*sigma^2)
+    return (1. / np.sqrt(2 * np.pi * sigma ** 2)) * np.exp(-(x - mu) ** 2 / (2 * sigma ** 2))
+
+
+x_values = np.linspace(-5, 5, 100)
+y_values = [normal_pdf(x) for x in x_values]
+
+plt.plot(x_values, y_values)
+plt.show()
+
+'''
+from scipy.stats import norm
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def normal_pdf(x, mean, sigma):
+    return (1. / np.sqrt(2 * np.pi * sigma ** 2)) * np.exp(-(x - mean) ** 2 / (2 * sigma ** 2))
+
+x = np.arange(-5, 5, 0.01)
+mean = 0
+sigma = 1
+y = normal_pdf(x, mean, sigma)
+
+#y = norm.pdf(x, mean, sigma)
+plt.plot(x, y)
+plt.xlabel('x')
+plt.ylabel('y')
+plt.show()
+'''
diff --git a/P12_tweet_analysis.py b/P12_tweet_analysis.py
new file mode 100644
index 0000000..4e722ad
--- /dev/null
+++ b/P12_tweet_analysis.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+tweet_msg = "RT @robdv: $TWTR now top holding for Andor, unseating $AAPL"
+
+words_in_tweet = tweet_msg.split(' ')  # split the tweet message into words
+for word in words_in_tweet:            # scan each word for a cashtag
+    if "$" in word:
+        print("This Tweet is about", word)
+
diff --git a/P18_advtise_analysis.py b/P18_advtise_analysis.py
new file mode 100644
index 0000000..69014f8
--- /dev/null
+++ b/P18_advtise_analysis.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# %matplotlib inline
+data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
+
+print(data.head())
+print(data)
+
+# scatter plots of sales against each advertising channel
+sns.set(style="ticks", color_codes=True)
+g = sns.pairplot(data, x_vars=["TV", "radio", "newspaper"], y_vars="sales")  # , height=4.5, aspect=0.7
+
+plt.show()
diff --git a/P19_JD_analysis.py b/P19_JD_analysis.py
new file mode 100644
index 0000000..12928fa
--- /dev/null
+++ b/P19_JD_analysis.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import requests  # used to grab data from the web
+
+from bs4 import BeautifulSoup  # used to parse HTML
+
+from sklearn.feature_extraction.text import CountVectorizer
+
+jd_texts = []
+
+for index in range(0, 30, 10):  # index runs over 0, 10, 20: the first three result pages
+    page = 'https://www.indeed.com/jobs?q=data+scientist&start=' + str(index)
+    print("grabbing data from web site...", index)
+
+    web_result = requests.get(page).text  # use requests to actually visit the URL specified by page
+
+    soup = BeautifulSoup(web_result, "html.parser")
+
+    for listing in soup.findAll('span', {'class': 'summary'}):
+        jd_texts.append(listing.text)
+
+print("Finished grabbing data from web site, now starting analysis...")
+
+print(type(jd_texts))
+vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
+matrix = vect.fit_transform(jd_texts)
+print(len(vect.get_feature_names()))
+
diff --git a/P19_JD_analysis_2.py b/P19_JD_analysis_2.py
new file mode 100644
index 0000000..f3099ec
--- /dev/null
+++ b/P19_JD_analysis_2.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import requests
+from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import CountVectorizer
+
+# grab postings from the web
+texts = []
+
+for i in range(0, 100, 10):  # cycle through the first ten pages of indeed job results
+
+    soup = BeautifulSoup(requests.get('http://www.indeed.com/jobs?q=data+scientist&start=' + str(i)).text, 'html.parser')
+    print("Reading the web message, range index=", i)
+    #print(soup.prettify())
+    texts += [a.text for a in soup.findAll('span', {'class': 'summary'})]
+    #print(texts)
+
+#print(type(texts))
+
+print(texts)
+#print(texts[0])  # first job description
+
+vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
+# make a count vectorizer to get basic counts
+
+matrix = vect.fit_transform(texts)
+# fit and learn the vocabulary of the corpus
+
+print(len(vect.get_feature_names()))  # how many features are there
+
+freqs = [(word, matrix.getcol(idx).sum()) for word, idx in vect.vocabulary_.items()]
+
+# sort from largest to smallest and print the 25 most frequent phrases
+for phrase, times in sorted(freqs, key=lambda x: -x[1])[:25]:
+    print(phrase, times)
diff --git a/P36_data_mean_median.py b/P36_data_mean_median.py
new file mode 100644
index 0000000..386b858
--- /dev/null
+++ b/P36_data_mean_median.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy
+
+original_data = [5,4,3,4,5,3,2,5,3,2,1,4,5,3,4,4,5,4,2,1,4,5,4,3,2,4,4,5,4,3,2,1]
+
+sorted_data = sorted(original_data)
+
+print('original data are:', original_data)
+
+print('sorted data are:', sorted_data)
+
+print('mean of data is:', numpy.mean(original_data))
+
+print('median of data is:', numpy.median(original_data))
diff --git a/P38_data_sd_analysis.py b/P38_data_sd_analysis.py
new file mode 100644
index 0000000..0f693e3
--- /dev/null
+++ b/P38_data_sd_analysis.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy
+
+temperature = [31,32,32,31,28,29,31,38,32,31,30,29,30,31,26]
+
+sorted_temp = sorted(temperature)
+mean_temp = numpy.mean(temperature)
+
+print('original temperatures are:', temperature)
+
+print('sorted temperatures are:', sorted_temp)
+
+print('mean of temperature is:', mean_temp)
+
+print('median of temperature is:', numpy.median(temperature))
+
+print('\n')
+
+squared_diff_list = []
+
+num_items = len(temperature)
+product = 1.
+
+
+for temp in temperature:
+    diff = temp - mean_temp
+    squared_diff_list.append(diff ** 2)  # squared deviation from the mean
+    product *= temp                      # running product for the geometric mean
+
+average_squared_diff = numpy.mean(squared_diff_list)
+
+standard_deviation = numpy.sqrt(average_squared_diff)
+geometric_mean = product ** (1. / num_items)
+
+print('Standard Deviation =', standard_deviation)
+print('\n')
+print('Geometric Mean =', geometric_mean)
+
diff --git a/P66_movie.py b/P66_movie.py
new file mode 100644
index 0000000..5b7915f
--- /dev/null
+++ b/P66_movie.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+user_fever_explain = np.array(['comic', 'love', 'act', 'war'])
+user_fever = np.array([5, 1, 3, 4])
+
+movie1_egen = np.array([3, 2, 3, 1])
+movie2_egen = np.array([5, 5, 1, 3])
+
+# dot product of the user's genre preferences with each movie's genre profile
+relevant_1 = user_fever.dot(movie1_egen)
+relevant_2 = user_fever.dot(movie2_egen)
+
+print('user_fever =', user_fever_explain)
+print('\n')
+print('movie1_egen =', movie1_egen)
+print('movie1 relevant score =', relevant_1)
+print('movie2_egen =', movie2_egen)
+print('movie2 relevant score =', relevant_2)
+print('\n')
+
+
+if relevant_1 > relevant_2:
+    print("movie1 is the better choice!")
+elif relevant_1 < relevant_2:
+    print("movie2 is the better choice!")