Business Analytics lecture notes summary from Hui added
commit 3c1a329 (1 parent: 3548317)
Showing 60 changed files with 3,526 additions and 0 deletions.
@@ -0,0 +1,47 @@
% Document class. Replace 12pt if the font size needs to be adjusted.
\documentclass[12pt]{scrartcl}

% Include the packages, the header, and the formatting.
\input{styles/Packages.tex}
\input{styles/FormatAndHeader.tex}

\graphicspath{{bilder/}}

\def\ojoin{\setbox0=\hbox{$\bowtie$}%
\rule[-.02ex]{.25em}{.4pt}\llap{\rule[\ht0]{.25em}{.4pt}}}
\def\leftouterjoin{\mathbin{\ojoin\mkern-5.8mu\bowtie}}
\def\rightouterjoin{\mathbin{\bowtie\mkern-5.8mu\ojoin}}
\def\fullouterjoin{\mathbin{\ojoin\mkern-5.8mu\bowtie\mkern-5.8mu\ojoin}}
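% The four macros above typeset the relational-algebra outer-join symbols in
% math mode; a minimal usage sketch (R and S are hypothetical relation names):
%   $R \leftouterjoin S$, $R \rightouterjoin S$, $R \fullouterjoin S$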
\usepackage{color,soul}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}} % numbers in circles

% Beginning of the actual document
\begin{document}
\pagenumbering{roman}
\input{styles/title.tex}
\newpage
\setcounter{tocdepth}{2}

\tableofcontents
\newpage
\flushleft

\pagenumbering{arabic}
\input{lectures/formulasheet.tex}
\input{lectures/introduction.tex}
\input{lectures/regression_analysis.tex}
\input{lectures/regression_diagnostics.tex}
\input{lectures/logit.tex}
\input{lectures/naivebayes.tex}
\input{lectures/decisiontree.tex}
\input{lectures/ensemble.tex}
\input{lectures/nn.tex}
\input{lectures/causal.tex}
\input{lectures/clustering.tex}
\input{lectures/assorules.tex}
\input{lectures/dataprep.tex}
\input{lectures/dataevaluate.tex}
\input{lectures/dimreduce.tex}

\end{document}
@@ -0,0 +1,188 @@
\section{Association Rules Discovery}
\begin{itemize}
    \item Goal: discover \textbf{correlations among attributes} or other relationships in large databases.
    \item Use cases: Market Basket Analysis, cross-/up-selling
    \item \textbf{Unsupervised learning}: no dependent variable is defined, no labeled training data.
\end{itemize}

\subsection{Terminology}

\paragraph{Rule} if $A$ and $B$ then $C$ and $D$, denoted as $R: A,B \Rightarrow C,D$. A rule only describes \textbf{correlation}, not causality.

\paragraph{Transaction Database} an instance/observation is a transaction. Each \textbf{attribute} in the database is converted to a \textbf{binary flag 0/1}.
\paragraph{Item} a single element/attribute, e.g.\ Milk, Bread
\paragraph{Itemset} a set of items, e.g.\ \{Milk, Bread, Butter\}
\paragraph{Frequent Itemset} an itemset $I$ that meets the \textbf{minimum support}. $$supp(I) \geq minsupp$$
\paragraph{Support}
\begin{itemize}
    \item support of \textbf{an itemset}: the \textbf{relative frequency} of the transactions that contain the itemset among \textbf{all transactions}
    \item support of \textbf{a rule}: the support of the union of all itemsets it contains.
    $$supp(A,B \Rightarrow C,D) = supp(\{A,B,C,D\})$$
    The \textbf{order and the arrow} of the rule \textbf{do not matter when computing support}.
    $$supp(\text{Milk} \Rightarrow \text{Bread}) = supp(\{\text{Milk, Bread}\}) = supp(\text{Bread} \Rightarrow \text{Milk})$$

    \item support \textbf{estimation}: lower bound + upper bound.
    \begin{itemize}
        \item lower bound: the support of a subset is always at least as high as that of its superset. \textbf{Subset property}: every subset of a frequent itemset is frequent.
        $$supp(\{B,C\}) \geq supp(\{A,B,C,D\})$$
        \item upper bound: use a Venn diagram.
    \end{itemize}
\end{itemize}

\paragraph{Confidence of a Rule} how likely the rule is to hold in the dataset $\rightarrow$ the conditional probability that $Y$ occurs given that $X$ occurs.
$$conf(R: X \Rightarrow Y) = \frac{supp(X \cup Y)}{supp(X)}$$

$$conf(\{\text{Milk, Bread}\} \Rightarrow \{\text{Butter}\}) = \frac{supp(\{\text{Milk, Bread, Butter}\})}{supp(\{\text{Milk, Bread}\})}$$

\paragraph{Strong Rule} an association rule that meets both \textbf{minimum support \& confidence}.

\paragraph{Lift of a Rule} indicates \textbf{by how much (as a ratio)} the \textbf{confidence of a rule} surpasses its \textbf{expected value}.
$$Lift(R: X \Rightarrow Y) = \frac{conf(R)}{expConf(R)} = \dfrac{\frac{supp(X \cup Y)}{supp(X)}}{supp(Y)} = \frac{supp(X \cup Y)}{supp(X)\cdot supp(Y)}$$

\subparagraph{Interpretation of Lifts} (a worked example follows this list)
\begin{itemize}
    \item lift > 1: X has a \textbf{positive} effect on Y. Itemsets X and Y appear \textbf{more frequently than expected}.
    \item lift = 1: X and Y are \textbf{independent}. X has \textbf{no effect} on Y.
    \item lift < 1: X has a \textbf{negative} effect on Y. Itemsets X and Y appear \textbf{less frequently than expected}.
\end{itemize}
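A worked example with \textbf{hypothetical} supports: suppose $supp(\{\text{Milk}\}) = 0.4$, $supp(\{\text{Bread}\}) = 0.5$ and $supp(\{\text{Milk, Bread}\}) = 0.3$. Then
$$conf(\text{Milk} \Rightarrow \text{Bread}) = \frac{0.3}{0.4} = 0.75, \qquad Lift = \frac{0.75}{0.5} = 1.5 > 1,$$
i.e.\ Milk and Bread appear together more frequently than expected under independence.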

\subsection{Apriori Algorithm: Generation of Itemsets and Rules}

\begin{itemize}
    \item Idea: if X is a frequent k-itemset, then all (k-1)-item subsets of X have to be frequent itemsets as well.

    $\rightarrow$ iteratively compute frequent itemsets; generate candidate k-itemsets by merging (k-1)-itemsets.

    \item Process (a minimal code sketch follows this list):
    \begin{enumerate}[label= \protect \circled{\arabic*} ]
        \item Generation of \textbf{itemsets}:
        \begin{itemize}
            \item start with itemsets of \textbf{size 1}.
            \item only keep those that \textbf{meet the minimum support} $\rightarrow$ frequent.
            \item iteratively build larger itemsets from those of the previous size.
        \end{itemize}
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.5\textwidth]{itemset.png}
        \end{figure}
        \item Generation of \textbf{rules} based on the frequent itemsets:
        \begin{itemize}
            \item start with rules with only \textbf{1 item on the right}.
            \item rule $X \Rightarrow Y$ is different from $Y \Rightarrow X$: compute \textbf{both directions}.
            \item only keep rules that \textbf{meet the minimum confidence}.
            \item evaluate rules with \textbf{multiple items on the right} by first checking the corresponding rules with a single item on the right. \textbf{Only expand if the single-item rules meet the minimum confidence.}
            $$X \Rightarrow Y, Z \quad \text{is based on} \quad X \Rightarrow Y \quad \text{and} \quad X \Rightarrow Z$$
        \end{itemize}
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.6\textwidth]{ruleset.png}
        \end{figure}
    \end{enumerate}
\end{itemize}
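A minimal Python sketch of the itemset-generation step (an illustration under hypothetical toy data, not the lecture's implementation):
\begin{verbatim}
from itertools import combinations

def apriori(transactions, minsupp):
    """Return {itemset: support} for all frequent itemsets."""
    n = len(transactions)
    support = lambda items: sum(items <= t for t in transactions) / n
    # 1-itemsets that meet the minimum support
    items = {i for t in transactions for i in t}
    frequent = {frozenset([i]): s for i in items
                if (s := support(frozenset([i]))) >= minsupp}
    result, current, k = dict(frequent), frequent, 2
    while current:
        # merge (k-1)-itemsets into candidate k-itemsets
        candidates = {a | b for a in current for b in current
                      if len(a | b) == k}
        # subset property: all (k-1)-subsets must be frequent
        current = {c: s for c in candidates
                   if all(frozenset(sub) in result
                          for sub in combinations(c, k - 1))
                   and (s := support(c)) >= minsupp}
        result.update(current)
        k += 1
    return result

baskets = [{"Milk", "Bread"}, {"Milk", "Bread", "Butter"},
           {"Bread"}, {"Milk"}]
print(apriori(baskets, minsupp=0.5))
\end{verbatim}
With this toy data, the frequent itemsets at $minsupp = 0.5$ are \{Milk\}, \{Bread\} and \{Milk, Bread\}.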

\section{Recommendation Systems}
\begin{itemize}
    \item Approaches:
    \begin{itemize}
        \item Association Rules: discover \textbf{correlations}
        \begin{itemize}
            \item product association
            \item user association
            \item combination of both
        \end{itemize}
        \item Collaborative Filtering: discover \textbf{similarity}
        \item Singular Value Decomposition
    \end{itemize}
\end{itemize}

\subsection{Collaborative Filtering}
\begin{itemize}
    \item Idea:
    \begin{itemize}
        \item maintain a database of \textbf{users' ratings} on items.
        \item for a \textbf{given active user}, find other \textbf{similar users} whose \textbf{ratings strongly correlate} with the active user's.

        $\rightarrow$ recommend items that are highly rated by similar users but \textbf{not yet rated} by the active user.
    \end{itemize}
\end{itemize}

\subsubsection{Process}
A minimal code sketch of both steps follows the list below.
\begin{enumerate}[label= \protect \circled{\arabic*} ]
    \item define the \textbf{active user} $a$ and the \textbf{other users} $u$.
    \item calculate the \textbf{weighted correlation} $w_{a,u}$ based on the \textbf{number of co-rated items} $m$.
    \begin{itemize}
        \item calculate the averages of the \textbf{co-rated items}, $\bar{r}_a, \bar{r}_u$
        \item calculate the variances $\sigma_{r_{a}}^2, \sigma_{r_{u}}^2$ and standard deviations.
        \item calculate the covariance. \textbf{Mind the minus/plus signs!}
        \item calculate the weighted correlation, where $s_{a,u}$ is a significance weight based on the number of co-rated items $m$.
    \end{itemize}

    $$w_{a,u} = s_{a,u} \cdot c_{a,u}$$
    $$c_{a,u} = \frac{Cov(r_{a}, r_{u})}{\sigma_{r_{a}} \cdot \sigma_{r_{u}}}$$
    $$Cov(r_{a}, r_{u}) = \frac{1}{m-1}\cdot \sum (r_a - \bar{r}_a) (r_u - \bar{r}_u)$$
    \item \textbf{rating prediction} of item $i$ for the active user.
    \begin{itemize}
        \item calculate the \textbf{average rating over all rated items}, $\bar{r}_a$, of the active user $a$.
        \item calculate the \textbf{average rating over all rated items}, $\bar{r}_u$, of each other user $u$.
        \item $r_{u,i}$: other user $u$'s rating of item $i$.
    \end{itemize}
    $$p_{a,i} = \bar{r}_a + \frac{\sum_{u = 1}^k w_{a,u} \cdot (r_{u,i} - \bar{r}_u)}{\sum_{u=1}^k |w_{a,u}|}$$
\end{enumerate}
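A minimal numpy sketch of both steps (the toy ratings matrix, the encoding of ``not rated'' as 0, and the significance weight $s_{a,u} = \min(m, 50)/50$ are assumptions):
\begin{verbatim}
import numpy as np

R = np.array([[5, 3, 0, 1],    # active user a = row 0
              [4, 0, 0, 1],
              [1, 1, 5, 4]], dtype=float)

def predict(R, a, i, k=2):
    """Predict user a's rating of item i from the k most similar users."""
    rated_a = R[a] > 0
    r_bar = np.array([row[row > 0].mean() for row in R])  # per-user mean
    w = np.zeros(len(R))
    for u in range(len(R)):
        if u == a or R[u, i] == 0:
            continue
        co = rated_a & (R[u] > 0)                # co-rated items
        m = int(co.sum())
        if m < 2:
            continue
        c = np.corrcoef(R[a, co], R[u, co])[0, 1]  # Pearson correlation
        s = min(m, 50) / 50                      # significance weight
        w[u] = s * c
    top = np.argsort(-np.abs(w))[:k]             # k most similar users
    num = sum(w[u] * (R[u, i] - r_bar[u]) for u in top)
    den = sum(abs(w[u]) for u in top)
    return r_bar[a] + num / den if den > 0 else r_bar[a]

print(predict(R, a=0, i=2))   # predicted rating of item 2 for user 0
\end{verbatim}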

\subsubsection{Limitations of Collaborative Filtering}
\begin{itemize}
    \item \textbf{Cold Start}: \textbf{enough users and ratings} are needed to generate recommendations.
    \item \textbf{Sparsity}: the user/rating matrix can be sparse even if there are many users

    $\rightarrow$ hard to find \textbf{co-rated} items.
    \item \textbf{First Rater}: for a \textbf{new product}, there must first be consumers who test and rate it.
    \item \textbf{Popularity Bias}: cannot recommend items to users with a \textbf{unique taste}; tends to recommend popular items.
\end{itemize}

$\rightarrow$ Alternative: \textbf{Content-Based Filtering}
\begin{itemize}
    \item idea: based on information about the content of items.
    \item solves:
    \begin{itemize}
        \item combats the popularity bias
        \item combats the first-rater problem
        \item no need for user ratings $\rightarrow$ cold start + sparsity combated.
    \end{itemize}
\end{itemize}

\subsection{Singular Value Decomposition}
\begin{itemize}
    \item Idea: produce a low-dimensional representation of the customer-product space.
    \item Model:
    $$A = U \cdot S \cdot V^T$$
    \begin{itemize}
        \item $A$: the rating matrix, containing the ratings we want to predict.
        \item $U$: maps \textbf{users to concepts}
        \item $S$: strength of the concepts/categories
        \item $V^T$: maps \textbf{venues/products to concepts}
    \end{itemize}
    \item \textbf{Rating Prediction}: the rating of item $i$ by user $u$ (see the sketch after this list).
    \begin{itemize}
        \item calculate/consider the average rating $\bar{r}_u$ of the user.
    \end{itemize}
    $$r_{u,i} = \bar{r}_u + U(user) \cdot S \cdot V^T(item)$$

    \item Interpretation of the values:
    \begin{itemize}
        \item User matrix ($U$):
        \begin{itemize}
            \item positive: higher interest
            \item negative: lower interest
            \item 0: no interest
        \end{itemize}
        \item Product matrix ($V^T$):
        \begin{itemize}
            \item positive: \textbf{positively represented} in the $i$-th latent factor. Users with a preference for the $i$-th latent factor will \textbf{prefer items with positive values over items with negative values}.
            \item negative: \textbf{negatively represented} in the $i$-th latent factor. Users with a preference for the $i$-th latent factor will \textbf{like the item less}.
        \end{itemize}
    \end{itemize}
\end{itemize}
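A minimal numpy sketch of the prediction formula (the mean-centered toy matrix and the per-user averages are hypothetical):
\begin{verbatim}
import numpy as np

# rows: users, columns: items; entries: ratings minus the user's mean
A = np.array([[ 1.0, -1.0,  0.5],
              [ 0.5,  0.0, -0.5],
              [-1.0,  1.0,  0.0]])
r_bar = np.array([3.0, 3.5, 2.5])   # hypothetical per-user averages

U, s, Vt = np.linalg.svd(A, full_matrices=False)
S = np.diag(s)

# predicted rating of item i by user u: r_bar[u] + U(user) . S . V^T(item)
u, i = 0, 2
pred = r_bar[u] + U[u, :] @ S @ Vt[:, i]
print(round(pred, 2))  # 3.5 = 3.0 + A[0, 2], since no concepts are truncated
\end{verbatim}
In practice only the top-$k$ concepts (largest singular values in $S$) are kept, which yields the low-dimensional representation.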
@@ -0,0 +1,117 @@
\section{Causal Inference}
Given a \textbf{treatment}, we want to know whether there is a \textbf{causal relationship} between the \textbf{treatment and the outcome}.

Given a \textbf{control group} and a \textbf{treatment group}, if we could observe both groups with and without the treatment, we could compute different treatment effects (a numeric illustration follows the list):
\begin{itemize}
    \item individual treatment effect: $Y_{1i} - Y_{0i}$
    \item average treatment effect: $E(Y_{1i} - Y_{0i})$
    \item subgroup treatment effect: $E(Y_{1i} - Y_{0i} | X)$
\end{itemize}
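A hypothetical numeric illustration: if individual $i$ would reach $Y_{1i} = 110$ with the treatment and $Y_{0i} = 100$ without it, the individual treatment effect is $110 - 100 = 10$; averaging such differences over all individuals yields the average treatment effect.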
However, $Y_{1i}$ and $Y_{0i}$ cannot both be observed for the same individual.

$\rightarrow$ approximation

\subsection{Data Collection in Causal Inference}
\paragraph{Golden Rule} \textbf{randomized controlled trials}: the treatment is controlled, and individuals are assigned to the treatment randomly.

$\rightarrow$ Sample selection bias is prevented.

\subsubsection{Different Types of Experiments/Data Collections}
\begin{itemize}
    \item \textbf{Randomized Controlled Trials}: treatment/control groups are separated. Each subject is \textbf{randomly assigned} to the \textbf{treatment/control} group.

    $\rightarrow$ minimal selection bias
    \begin{itemize}
        \item Lab Experiments
        \item Field Experiments
    \end{itemize}

    \item \textbf{Quasi-experiments}: \textbf{natural groups} pre-exist; there is no separation into control/treatment groups beforehand. The independent (treatment) variable is \textbf{controlled}, but subjects are \textbf{not randomly assigned}.

    $\rightarrow$ selection bias

    \item \textbf{Observational studies}: what we usually have. The independent variable is \textbf{not controlled}; individuals are \textbf{self-assigned}.

    $\rightarrow$ selection bias
    \begin{itemize}
        \item cross-sectional study
        \item longitudinal study
        \item panel study
        \item case-control study
    \end{itemize}
\end{itemize}

\subsection{Challenges in Quasi-Experiments \& Observational Studies: Confounding Variables \& Identification Strategies}
\subsubsection{Confounding Variables}

In \textbf{quasi-experiments and observational studies}, confounding variables might exist but be \textbf{unobservable, and therefore omitted from the model}. In order to identify precise causal effects, we need to deal with confounding variables.

\paragraph{Confounding Variable} an extraneous variable that is \textbf{unobservable} and \textbf{correlates} with both the \textbf{dependent and independent variables}.

$\rightarrow$ $Cov(\varepsilon, X) \neq 0$ $\rightarrow$ endogeneity

$\rightarrow$ Consequence: biased estimates.

\subsubsection{Combating Confounding Variables by Data Collection: Randomized Controlled Trials}
With randomized controlled trials, confounding variables are \textbf{automatically removed}.

Ways to conduct RCTs:
\begin{itemize}
    \item Lab experiment
    \item Field experiment
\end{itemize}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{labfield.png}
\end{figure}
\subsubsection{Identification Strategy for Quasi-Experiments: Difference-in-Differences}
\begin{itemize}
    \item Idea: observe the \textbf{effect of a treatment} controlled by the researcher between the \textbf{control \& treatment groups}, \textbf{over time}.
    \item Process:
    \begin{itemize}
        \item Assume there is an \textbf{overall trend} affecting both the control \& treatment groups. We want to estimate the treatment effect without omitting confounding variables (e.g.\ time).
        \item treatment effect (a numeric sketch follows below):
        $$\text{Treatment effect} = (Y_{t2} - Y_{t1}) - (Y_{c2} - Y_{c1})$$
    \end{itemize}
\end{itemize}
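A numeric sketch with \textbf{hypothetical} group means: if the treatment group moves from $Y_{t1} = 10$ to $Y_{t2} = 18$ and the control group from $Y_{c1} = 8$ to $Y_{c2} = 12$, then
$$\text{Treatment effect} = (18 - 10) - (12 - 8) = 4,$$
i.e.\ the common trend of $4$ is differenced away.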

\subsubsection{Identification Strategy for Panel Studies: Fixed-Effects Models}
\begin{itemize}
    \item Idea: a \textbf{fixed influence} is omitted as a confounding variable in modeling the treatment effect on the outcome.

    $\rightarrow$ model the \textbf{fixed effect} to soak up \textbf{individual effects on the model}.
    \item Fixed-Effects Model: the fixed effect is modeled as an \textbf{additional intercept $\lambda_i$} for \textbf{each individual} (see the sketch below).
\end{itemize}
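A minimal sketch of such a model (the notation is assumed, not from the lecture):
$$y_{it} = \lambda_i + \beta \cdot x_{it} + \varepsilon_{it}$$
where $x_{it}$ is the treatment variable and the individual intercept $\lambda_i$ soaks up all time-constant characteristics of individual $i$.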

\subsubsection{Identification Strategy for Observational Studies: Propensity Score Matching}
\begin{itemize}
    \item Idea: in cross-sectional data, find a \textbf{data section} where the control \& treatment groups have \textbf{maximum similarity} in their covariate distributions.

    $\rightarrow$ resembles a \textbf{randomized experiment}

    \item Process (a minimal code sketch follows this list):
    \begin{itemize}
        \item estimate the \textbf{propensity score} by logistic regression for each individual in the \textbf{treatment group}.
        \item \textbf{match} the \textbf{control group} to the treatment group: find subjects with \textbf{similar propensity scores}.
        \item evaluate the quality of the matching
        \item evaluate the \textbf{treatment effect} based on the \textbf{treatment group and the matched control group}.
    \end{itemize}
\end{itemize}
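A minimal Python sketch of the process (the simulated data, scikit-learn's LogisticRegression, and 1-nearest-neighbour matching are assumptions, not the lecture's procedure):
\begin{verbatim}
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))                        # covariates
treat = rng.binomial(1, 1 / (1 + np.exp(-X[:, 0])))  # self-selected treatment
y = 2.0 * treat + X[:, 0] + rng.normal(size=200)     # outcome, true effect = 2

# 1) propensity score: P(treated | X) via logistic regression
ps = LogisticRegression().fit(X, treat).predict_proba(X)[:, 1]

# 2) match each treated subject to the control with the closest score
treated = np.where(treat == 1)[0]
controls = np.where(treat == 0)[0]
matches = [controls[np.argmin(np.abs(ps[controls] - ps[t]))] for t in treated]

# 3)/4) treatment effect on the matched sample
att = (y[treated] - y[matches]).mean()
print(f"estimated treatment effect on the treated: {att:.2f}")
\end{verbatim}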

\subsubsection{Dealing with Confounding Variables: Instrumental Variables}
\paragraph{Instrument} an attribute that \textbf{has a causal effect} on the \textbf{treatment variable}, but \textbf{no direct causal effect} on the \textbf{outcome}.
\begin{itemize}
    \item Modeling Process:
    \begin{itemize}
        \item the instrument is (as good as) \textbf{randomly assigned} with respect to the treatment variable
        \item model the relationship between the instrument and the treatment variable.
        \item model the relationship between the predicted treatment variable and the outcome.
    \end{itemize}
    \item Estimation: 2-stage least squares (see the sketch below).
\end{itemize}
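A minimal Python sketch of 2-stage least squares on simulated data (the data-generating process and the plain-numpy estimation are assumptions):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
n = 1000
z = rng.normal(size=n)                  # instrument
u = rng.normal(size=n)                  # unobserved confounder
x = 0.8 * z + u + rng.normal(size=n)    # treatment, confounded by u
y = 2.0 * x + u + rng.normal(size=n)    # outcome, true effect = 2

def ols(X, y):
    """Least-squares coefficients with an intercept column."""
    X1 = np.column_stack([np.ones(len(X)), X])
    return np.linalg.lstsq(X1, y, rcond=None)[0]

# Stage 1: regress the treatment on the instrument, keep fitted values
a = ols(z, x)
x_hat = a[0] + a[1] * z
# Stage 2: regress the outcome on the predicted treatment
b = ols(x_hat, y)
print(f"2SLS estimate of the treatment effect: {b[1]:.2f}")  # close to 2
\end{verbatim}
Naive OLS of $y$ on $x$ would be biased upward here, because the confounder $u$ drives both $x$ and $y$; the instrument $z$ isolates the exogenous variation in $x$.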