Business Analytics lecture notes summary from Hui added
commit 3c1a329 (1 parent: 3548317)
Showing 60 changed files with 3,526 additions and 0 deletions.
@@ -0,0 +1,47 @@
% Document class. Replace 12pt if the font size needs to be adjusted.
\documentclass[12pt]{scrartcl}

% Include the packages, the header, and the formatting.
\input{styles/Packages.tex}
\input{styles/FormatAndHeader.tex}

\graphicspath{{bilder/}}

\def\ojoin{\setbox0=\hbox{$\bowtie$}%
\rule[-.02ex]{.25em}{.4pt}\llap{\rule[\ht0]{.25em}{.4pt}}}
\def\leftouterjoin{\mathbin{\ojoin\mkern-5.8mu\bowtie}}
\def\rightouterjoin{\mathbin{\bowtie\mkern-5.8mu\ojoin}}
\def\fullouterjoin{\mathbin{\ojoin\mkern-5.8mu\bowtie\mkern-5.8mu\ojoin}}
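% The four macros above typeset the relational-algebra outer-join symbols in
% math mode; a minimal usage sketch (R and S are hypothetical relation names):
%   $R \leftouterjoin S$, $R \rightouterjoin S$, $R \fullouterjoin S$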
\usepackage{color,soul}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}} % numbers in circles

% Beginning of the actual document
\begin{document}
\pagenumbering{roman}
\input{styles/title.tex}
\newpage
\setcounter{tocdepth}{2}

\tableofcontents
\newpage
\flushleft

\pagenumbering{arabic}
\input{lectures/formulasheet.tex}
\input{lectures/introduction.tex}
\input{lectures/regression_analysis.tex}
\input{lectures/regression_diagnostics.tex}
\input{lectures/logit.tex}
\input{lectures/naivebayes.tex}
\input{lectures/decisiontree.tex}
\input{lectures/ensemble.tex}
\input{lectures/nn.tex}
\input{lectures/causal.tex}
\input{lectures/clustering.tex}
\input{lectures/assorules.tex}
\input{lectures/dataprep.tex}
\input{lectures/dataevaluate.tex}
\input{lectures/dimreduce.tex}

\end{document}
@@ -0,0 +1,188 @@
\section{Association Rules Discovery}
\begin{itemize}
    \item Goal: discover \textbf{correlations among attributes} or other relationships in large databases.
    \item Use cases: Market Basket Analysis, cross-/up-selling
    \item \textbf{Unsupervised learning}: no dependent variable is defined, no labeled training data.
\end{itemize}

\subsection{Terminology}

\paragraph{Rule} if $A$ and $B$ then $C$ and $D$, denoted as $R: A,B \Rightarrow C,D$. A rule only describes \textbf{correlation}, not causality.

\paragraph{Transaction Database} an instance/observation is a transaction. Each \textbf{attribute} in the database is converted to a \textbf{binary flag 0/1}.
\paragraph{Item} a single element/attribute, e.g.\ Milk, Bread
\paragraph{Itemset} a set of items, e.g.\ \{Milk, Bread, Butter\}
\paragraph{Frequent Itemset} an itemset $I$ that meets the \textbf{minimum support}. $$supp(I) \geq minsupp$$
\paragraph{Support}
\begin{itemize}
    \item support of \textbf{an itemset}: the \textbf{relative frequency} of the transactions that contain the itemset among \textbf{all transactions}
    \item support of \textbf{a rule}: the support of the union of all itemsets it contains.
    $$supp(A,B \Rightarrow C,D) = supp(\{A,B,C,D\})$$
    The \textbf{order and the arrow} of the rule \textbf{do not matter when computing support}.
    $$supp(\text{Milk} \Rightarrow \text{Bread}) = supp(\{\text{Milk, Bread}\}) = supp(\text{Bread} \Rightarrow \text{Milk})$$

    \item support \textbf{estimation}: lower bound + upper bound.
    \begin{itemize}
        \item lower bound: the support of a subset is always at least as high as that of its superset. \textbf{Subset property}: every subset of a frequent itemset is frequent.
        $$supp(\{B,C\}) \geq supp(\{A,B,C,D\})$$
        \item upper bound: use a Venn diagram.
    \end{itemize}
\end{itemize}

\paragraph{Confidence of a Rule} how likely the rule is to hold in the dataset $\rightarrow$ the conditional probability that $Y$ occurs given that $X$ occurs.
$$conf(R: X \Rightarrow Y) = \frac{supp(X \cup Y)}{supp(X)}$$

$$conf(\{\text{Milk, Bread}\} \Rightarrow \{\text{Butter}\}) = \frac{supp(\{\text{Milk, Bread, Butter}\})}{supp(\{\text{Milk, Bread}\})}$$

\paragraph{Strong Rule} an association rule that meets both \textbf{minimum support \& confidence}.

\paragraph{Lift of a Rule} indicates \textbf{by how much (as a ratio)} the \textbf{confidence of a rule} surpasses its \textbf{expected value}.
$$Lift(R: X \Rightarrow Y) = \frac{conf(R)}{expConf(R)} = \dfrac{\frac{supp(X \cup Y)}{supp(X)}}{supp(Y)} = \frac{supp(X \cup Y)}{supp(X)\cdot supp(Y)}$$

\subparagraph{Interpretation of Lifts} (a worked example follows this list)
\begin{itemize}
    \item lift > 1: X has a \textbf{positive} effect on Y. Itemsets X and Y appear \textbf{more frequently than expected}.
    \item lift = 1: X and Y are \textbf{independent}. X has \textbf{no effect} on Y.
    \item lift < 1: X has a \textbf{negative} effect on Y. Itemsets X and Y appear \textbf{less frequently than expected}.
\end{itemize}
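A worked example with \textbf{hypothetical} supports: suppose $supp(\{\text{Milk}\}) = 0.4$, $supp(\{\text{Bread}\}) = 0.5$ and $supp(\{\text{Milk, Bread}\}) = 0.3$. Then
$$conf(\text{Milk} \Rightarrow \text{Bread}) = \frac{0.3}{0.4} = 0.75, \qquad Lift = \frac{0.75}{0.5} = 1.5 > 1,$$
i.e.\ Milk and Bread appear together more frequently than expected under independence.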

\subsection{Apriori Algorithm: Generation of Itemsets and Rules}

\begin{itemize}
    \item Idea: if X is a frequent k-itemset, then all (k-1)-item subsets of X have to be frequent itemsets as well.

    $\rightarrow$ iteratively compute frequent itemsets; generate candidate k-itemsets by merging (k-1)-itemsets.

    \item Process (a minimal code sketch follows this list):
    \begin{enumerate}[label= \protect \circled{\arabic*} ]
        \item Generation of \textbf{itemsets}:
        \begin{itemize}
            \item start with itemsets of \textbf{size 1}.
            \item only keep those that \textbf{meet the minimum support} $\rightarrow$ frequent.
            \item iteratively build larger itemsets from those of the previous size.
        \end{itemize}
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.5\textwidth]{itemset.png}
        \end{figure}
        \item Generation of \textbf{rules} based on the frequent itemsets:
        \begin{itemize}
            \item start with rules with only \textbf{1 item on the right}.
            \item rule $X \Rightarrow Y$ is different from $Y \Rightarrow X$: compute \textbf{both directions}.
            \item only keep rules that \textbf{meet the minimum confidence}.
            \item evaluate rules with \textbf{multiple items on the right} by first checking the corresponding rules with a single item on the right. \textbf{Only expand if the single-item rules meet the minimum confidence.}
            $$X \Rightarrow Y, Z \quad \text{is based on} \quad X \Rightarrow Y \quad \text{and} \quad X \Rightarrow Z$$
        \end{itemize}
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.6\textwidth]{ruleset.png}
        \end{figure}
    \end{enumerate}
\end{itemize}
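A minimal Python sketch of the itemset-generation step (an illustration under hypothetical toy data, not the lecture's implementation):
\begin{verbatim}
from itertools import combinations

def apriori(transactions, minsupp):
    """Return {itemset: support} for all frequent itemsets."""
    n = len(transactions)
    support = lambda items: sum(items <= t for t in transactions) / n
    # 1-itemsets that meet the minimum support
    items = {i for t in transactions for i in t}
    frequent = {frozenset([i]): s for i in items
                if (s := support(frozenset([i]))) >= minsupp}
    result, current, k = dict(frequent), frequent, 2
    while current:
        # merge (k-1)-itemsets into candidate k-itemsets
        candidates = {a | b for a in current for b in current
                      if len(a | b) == k}
        # subset property: all (k-1)-subsets must be frequent
        current = {c: s for c in candidates
                   if all(frozenset(sub) in result
                          for sub in combinations(c, k - 1))
                   and (s := support(c)) >= minsupp}
        result.update(current)
        k += 1
    return result

baskets = [{"Milk", "Bread"}, {"Milk", "Bread", "Butter"},
           {"Bread"}, {"Milk"}]
print(apriori(baskets, minsupp=0.5))
\end{verbatim}
With this toy data, the frequent itemsets at $minsupp = 0.5$ are \{Milk\}, \{Bread\} and \{Milk, Bread\}.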

\section{Recommendation Systems}
\begin{itemize}
    \item Approaches:
    \begin{itemize}
        \item Association Rules: discover \textbf{correlations}
        \begin{itemize}
            \item product association
            \item user association
            \item combination of both
        \end{itemize}
        \item Collaborative Filtering: discover \textbf{similarity}
        \item Singular Value Decomposition
    \end{itemize}
\end{itemize}

\subsection{Collaborative Filtering}
\begin{itemize}
    \item Idea:
    \begin{itemize}
        \item maintain a database of \textbf{users' ratings} on items.
        \item for a \textbf{given active user}, find other \textbf{similar users} whose \textbf{ratings strongly correlate} with the active user's.

        $\rightarrow$ recommend items that are highly rated by similar users but \textbf{not yet rated} by the active user.
    \end{itemize}
\end{itemize}

\subsubsection{Process}
A minimal code sketch of both steps follows the list below.
\begin{enumerate}[label= \protect \circled{\arabic*} ]
    \item define the \textbf{active user} $a$ and the \textbf{other users} $u$.
    \item calculate the \textbf{weighted correlation} $w_{a,u}$ based on the \textbf{number of co-rated items} $m$.
    \begin{itemize}
        \item calculate the averages of the \textbf{co-rated items}, $\bar{r}_a, \bar{r}_u$
        \item calculate the variances $\sigma_{r_{a}}^2, \sigma_{r_{u}}^2$ and standard deviations.
        \item calculate the covariance. \textbf{Mind the minus/plus signs!}
        \item calculate the weighted correlation, where $s_{a,u}$ is a significance weight based on the number of co-rated items $m$.
    \end{itemize}

    $$w_{a,u} = s_{a,u} \cdot c_{a,u}$$
    $$c_{a,u} = \frac{Cov(r_{a}, r_{u})}{\sigma_{r_{a}} \cdot \sigma_{r_{u}}}$$
    $$Cov(r_{a}, r_{u}) = \frac{1}{m-1}\cdot \sum (r_a - \bar{r}_a) (r_u - \bar{r}_u)$$
    \item \textbf{rating prediction} of item $i$ for the active user.
    \begin{itemize}
        \item calculate the \textbf{average rating over all rated items}, $\bar{r}_a$, of the active user $a$.
        \item calculate the \textbf{average rating over all rated items}, $\bar{r}_u$, of each other user $u$.
        \item $r_{u,i}$: other user $u$'s rating of item $i$.
    \end{itemize}
    $$p_{a,i} = \bar{r}_a + \frac{\sum_{u = 1}^k w_{a,u} \cdot (r_{u,i} - \bar{r}_u)}{\sum_{u=1}^k |w_{a,u}|}$$
\end{enumerate}
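A minimal numpy sketch of both steps (the toy ratings matrix, the encoding of ``not rated'' as 0, and the significance weight $s_{a,u} = \min(m, 50)/50$ are assumptions):
\begin{verbatim}
import numpy as np

R = np.array([[5, 3, 0, 1],    # active user a = row 0
              [4, 0, 0, 1],
              [1, 1, 5, 4]], dtype=float)

def predict(R, a, i, k=2):
    """Predict user a's rating of item i from the k most similar users."""
    rated_a = R[a] > 0
    r_bar = np.array([row[row > 0].mean() for row in R])  # per-user mean
    w = np.zeros(len(R))
    for u in range(len(R)):
        if u == a or R[u, i] == 0:
            continue
        co = rated_a & (R[u] > 0)                # co-rated items
        m = int(co.sum())
        if m < 2:
            continue
        c = np.corrcoef(R[a, co], R[u, co])[0, 1]  # Pearson correlation
        s = min(m, 50) / 50                      # significance weight
        w[u] = s * c
    top = np.argsort(-np.abs(w))[:k]             # k most similar users
    num = sum(w[u] * (R[u, i] - r_bar[u]) for u in top)
    den = sum(abs(w[u]) for u in top)
    return r_bar[a] + num / den if den > 0 else r_bar[a]

print(predict(R, a=0, i=2))   # predicted rating of item 2 for user 0
\end{verbatim}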

\subsubsection{Limitations of Collaborative Filtering}
\begin{itemize}
    \item \textbf{Cold Start}: \textbf{enough users and ratings} are needed to generate recommendations.
    \item \textbf{Sparsity}: the user/rating matrix can be sparse even if there are many users

    $\rightarrow$ hard to find \textbf{co-rated} items.
    \item \textbf{First Rater}: for a \textbf{new product}, there must first be consumers who test and rate it.
    \item \textbf{Popularity Bias}: cannot recommend items to users with a \textbf{unique taste}; tends to recommend popular items.
\end{itemize}

$\rightarrow$ Alternative: \textbf{Content-Based Filtering}
\begin{itemize}
    \item idea: based on information about the content of items.
    \item solves:
    \begin{itemize}
        \item combats the popularity bias
        \item combats the first-rater problem
        \item no need for user ratings $\rightarrow$ cold start + sparsity combated.
    \end{itemize}
\end{itemize}

\subsection{Singular Value Decomposition}
\begin{itemize}
    \item Idea: produce a low-dimensional representation of the customer-product space.
    \item Model:
    $$A = U \cdot S \cdot V^T$$
    \begin{itemize}
        \item $A$: the rating matrix, containing the ratings we want to predict.
        \item $U$: maps \textbf{users to concepts}
        \item $S$: strength of the concepts/categories
        \item $V^T$: maps \textbf{venues/products to concepts}
    \end{itemize}
    \item \textbf{Rating Prediction}: the rating of item $i$ by user $u$ (see the sketch after this list).
    \begin{itemize}
        \item calculate/consider the average rating $\bar{r}_u$ of the user.
    \end{itemize}
    $$r_{u,i} = \bar{r}_u + U(user) \cdot S \cdot V^T(item)$$

    \item Interpretation of the values:
    \begin{itemize}
        \item User matrix ($U$):
        \begin{itemize}
            \item positive: higher interest
            \item negative: lower interest
            \item 0: no interest
        \end{itemize}
        \item Product matrix ($V^T$):
        \begin{itemize}
            \item positive: \textbf{positively represented} in the $i$-th latent factor. Users with a preference for the $i$-th latent factor will \textbf{prefer items with positive values over items with negative values}.
            \item negative: \textbf{negatively represented} in the $i$-th latent factor. Users with a preference for the $i$-th latent factor will \textbf{like the item less}.
        \end{itemize}
    \end{itemize}
\end{itemize}
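A minimal numpy sketch of the prediction formula (the mean-centered toy matrix and the per-user averages are hypothetical):
\begin{verbatim}
import numpy as np

# rows: users, columns: items; entries: ratings minus the user's mean
A = np.array([[ 1.0, -1.0,  0.5],
              [ 0.5,  0.0, -0.5],
              [-1.0,  1.0,  0.0]])
r_bar = np.array([3.0, 3.5, 2.5])   # hypothetical per-user averages

U, s, Vt = np.linalg.svd(A, full_matrices=False)
S = np.diag(s)

# predicted rating of item i by user u: r_bar[u] + U(user) . S . V^T(item)
u, i = 0, 2
pred = r_bar[u] + U[u, :] @ S @ Vt[:, i]
print(round(pred, 2))  # 3.5 = 3.0 + A[0, 2], since no concepts are truncated
\end{verbatim}
In practice only the top-$k$ concepts (largest singular values in $S$) are kept, which yields the low-dimensional representation.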
@@ -0,0 +1,117 @@
\section{Causal Inference}
Given a \textbf{treatment}, we want to know whether there is a \textbf{causal relationship} between the \textbf{treatment and the outcome}.

Given a \textbf{control group} and a \textbf{treatment group}, if we could observe both groups with and without the treatment, we could compute different treatment effects (a numeric illustration follows the list):
\begin{itemize}
    \item individual treatment effect: $Y_{1i} - Y_{0i}$
    \item average treatment effect: $E(Y_{1i} - Y_{0i})$
    \item subgroup treatment effect: $E(Y_{1i} - Y_{0i} | X)$
\end{itemize}
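A hypothetical numeric illustration: if individual $i$ would reach $Y_{1i} = 110$ with the treatment and $Y_{0i} = 100$ without it, the individual treatment effect is $110 - 100 = 10$; averaging such differences over all individuals yields the average treatment effect.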
However, $Y_{1i}$ and $Y_{0i}$ cannot both be observed for the same individual.

$\rightarrow$ approximation

\subsection{Data Collection in Causal Inference}
\paragraph{Golden Rule} \textbf{randomized controlled trials}: the treatment is controlled, and individuals are assigned to the treatment randomly.

$\rightarrow$ Sample selection bias is prevented.

\subsubsection{Different Types of Experiments/Data Collections}
\begin{itemize}
    \item \textbf{Randomized Controlled Trials}: treatment/control groups are separated. Each subject is \textbf{randomly assigned} to the \textbf{treatment/control} group.

    $\rightarrow$ minimal selection bias
    \begin{itemize}
        \item Lab Experiments
        \item Field Experiments
    \end{itemize}

    \item \textbf{Quasi-experiments}: \textbf{natural groups} pre-exist; there is no separation into control/treatment groups beforehand. The independent (treatment) variable is \textbf{controlled}, but subjects are \textbf{not randomly assigned}.

    $\rightarrow$ selection bias

    \item \textbf{Observational studies}: what we usually have. The independent variable is \textbf{not controlled}; individuals are \textbf{self-assigned}.

    $\rightarrow$ selection bias
    \begin{itemize}
        \item cross-sectional study
        \item longitudinal study
        \item panel study
        \item case-control study
    \end{itemize}
\end{itemize}

\subsection{Challenges in Quasi-Experiments \& Observational Studies: Confounding Variables \& Identification Strategies}
\subsubsection{Confounding Variables}

In \textbf{quasi-experiments and observational studies}, confounding variables might exist but be \textbf{unobservable, and therefore omitted from the model}. In order to identify precise causal effects, we need to deal with confounding variables.

\paragraph{Confounding Variable} an extraneous variable that is \textbf{unobservable} and \textbf{correlates} with both the \textbf{dependent and independent variables}.

$\rightarrow$ $Cov(\varepsilon, X) \neq 0$ $\rightarrow$ endogeneity

$\rightarrow$ Consequence: biased estimates.

\subsubsection{Combating Confounding Variables by Data Collection: Randomized Controlled Trials}
With randomized controlled trials, confounding variables are \textbf{automatically removed}.

Ways to conduct RCTs:
\begin{itemize}
    \item Lab experiment
    \item Field experiment
\end{itemize}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{labfield.png}
\end{figure}
\subsubsection{Identification Strategy for Quasi-Experiments: Difference-in-Differences}
\begin{itemize}
    \item Idea: observe the \textbf{effect of a treatment} controlled by the researcher between the \textbf{control \& treatment groups}, \textbf{over time}.
    \item Process:
    \begin{itemize}
        \item Assume there is an \textbf{overall trend} affecting both the control \& treatment groups. We want to estimate the treatment effect without omitting confounding variables (e.g.\ time).
        \item treatment effect (a numeric sketch follows below):
        $$\text{Treatment effect} = (Y_{t2} - Y_{t1}) - (Y_{c2} - Y_{c1})$$
    \end{itemize}
\end{itemize}
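A numeric sketch with \textbf{hypothetical} group means: if the treatment group moves from $Y_{t1} = 10$ to $Y_{t2} = 18$ and the control group from $Y_{c1} = 8$ to $Y_{c2} = 12$, then
$$\text{Treatment effect} = (18 - 10) - (12 - 8) = 4,$$
i.e.\ the common trend of $4$ is differenced away.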

\subsubsection{Identification Strategy for Panel Studies: Fixed-Effects Models}
\begin{itemize}
    \item Idea: a \textbf{fixed influence} is omitted as a confounding variable in modeling the treatment effect on the outcome.

    $\rightarrow$ model the \textbf{fixed effect} to soak up \textbf{individual effects on the model}.
    \item Fixed-Effects Model: the fixed effect is modeled as an \textbf{additional intercept $\lambda_i$} for \textbf{each individual} (see the sketch below).
\end{itemize}
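A minimal sketch of such a model (the notation is assumed, not from the lecture):
$$y_{it} = \lambda_i + \beta \cdot x_{it} + \varepsilon_{it}$$
where $x_{it}$ is the treatment variable and the individual intercept $\lambda_i$ soaks up all time-constant characteristics of individual $i$.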

\subsubsection{Identification Strategy for Observational Studies: Propensity Score Matching}
\begin{itemize}
    \item Idea: in cross-sectional data, find a \textbf{data section} where the control \& treatment groups have \textbf{maximum similarity} in their covariate distributions.

    $\rightarrow$ resembles a \textbf{randomized experiment}

    \item Process (a minimal code sketch follows this list):
    \begin{itemize}
        \item estimate the \textbf{propensity score} by logistic regression for each individual in the \textbf{treatment group}.
        \item \textbf{match} the \textbf{control group} to the treatment group: find subjects with \textbf{similar propensity scores}.
        \item evaluate the quality of the matching
        \item evaluate the \textbf{treatment effect} based on the \textbf{treatment group and the matched control group}.
    \end{itemize}
\end{itemize}
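A minimal Python sketch of the process (the simulated data, scikit-learn's LogisticRegression, and 1-nearest-neighbour matching are assumptions, not the lecture's procedure):
\begin{verbatim}
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))                        # covariates
treat = rng.binomial(1, 1 / (1 + np.exp(-X[:, 0])))  # self-selected treatment
y = 2.0 * treat + X[:, 0] + rng.normal(size=200)     # outcome, true effect = 2

# 1) propensity score: P(treated | X) via logistic regression
ps = LogisticRegression().fit(X, treat).predict_proba(X)[:, 1]

# 2) match each treated subject to the control with the closest score
treated = np.where(treat == 1)[0]
controls = np.where(treat == 0)[0]
matches = [controls[np.argmin(np.abs(ps[controls] - ps[t]))] for t in treated]

# 3)/4) treatment effect on the matched sample
att = (y[treated] - y[matches]).mean()
print(f"estimated treatment effect on the treated: {att:.2f}")
\end{verbatim}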

\subsubsection{Dealing with Confounding Variables: Instrumental Variables}
\paragraph{Instrument} an attribute that \textbf{has a causal effect} on the \textbf{treatment variable}, but \textbf{no direct causal effect} on the \textbf{outcome}.
\begin{itemize}
    \item Modeling Process:
    \begin{itemize}
        \item the instrument is (as good as) \textbf{randomly assigned} with respect to the treatment variable
        \item model the relationship between the instrument and the treatment variable.
        \item model the relationship between the predicted treatment variable and the outcome.
    \end{itemize}
    \item Estimation: 2-stage least squares (see the sketch below).
\end{itemize}
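A minimal Python sketch of 2-stage least squares on simulated data (the data-generating process and the plain-numpy estimation are assumptions):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
n = 1000
z = rng.normal(size=n)                  # instrument
u = rng.normal(size=n)                  # unobserved confounder
x = 0.8 * z + u + rng.normal(size=n)    # treatment, confounded by u
y = 2.0 * x + u + rng.normal(size=n)    # outcome, true effect = 2

def ols(X, y):
    """Least-squares coefficients with an intercept column."""
    X1 = np.column_stack([np.ones(len(X)), X])
    return np.linalg.lstsq(X1, y, rcond=None)[0]

# Stage 1: regress the treatment on the instrument, keep fitted values
a = ols(z, x)
x_hat = a[0] + a[1] * z
# Stage 2: regress the outcome on the predicted treatment
b = ols(x_hat, y)
print(f"2SLS estimate of the treatment effect: {b[1]:.2f}")  # close to 2
\end{verbatim}
Naive OLS of $y$ on $x$ would be biased upward here, because the confounder $u$ drives both $x$ and $y$; the instrument $z$ isolates the exogenous variation in $x$.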