Commit 0204b1c

Add GPU support, refactor code, add D2, CHOCO-SGD algorithms
1 parent: a151148


45 files changed (+1140 −634 lines)

Diff for: .gitignore (+6 −2)

@@ -1,7 +1,11 @@
 __pycache__
 figs
 data
-*.data
-*.npz
+build
+problems/MNIST
+*npz
+*pt
+*egg-info
+*data
 .DS_Store
 *egg-info

Diff for: README.md (+45 −13)

@@ -7,7 +7,7 @@ This repository contains a set of optimization algorithms and objective functions
 2. "Communication-Efficient Distributed Optimization in Networks with Gradient Tracking and Variance Reduction" [[PDF](https://arxiv.org/abs/1909.05844v2)]. (code is in the previous version of this repo [[link](https://github.com/liboyue/Network-Distributed-Algorithm/tree/08abe14f2a2d5929fc401ff99961ca3bae40ff60)])
 
 Due to the random data generation procedure,
-resulting graphs may be slightly different from those appeared in the paper,
+results may differ slightly from those reported in the papers,
 but conclusions remain the same.
 
 If you find this code useful, please cite our papers:
@@ -32,44 +32,76 @@ If you find this code useful, please cite our papers:
 }
 ```
 
-## Implemented objective functions
+
+## 1. Features
+- Easy to use: comes with several popular objective functions with optional regularization and compression, essential optimization algorithms, and utilities to run experiments and plot results
+- Extensibility: easy to implement your own objective functions / optimization algorithms / datasets
+- Correctness: numerically verified gradient implementations
+- Performance: runs on both CPU and GPU
+- Data preprocessing: shuffling, normalizing, splitting
+
+
+## 2. Installation and usage
+### 2.1 Installation
+
+`pip install git+https://github.com/liboyue/Network-Distributed-Algorithm.git`
+
+If you have Nvidia GPUs, please also install `cupy`.
+
+### 2.2 Implementing your own objective function
+### 2.3 Implementing your own optimizer
+
+
+## 3. Objective functions
 The gradient implementations of all objective functions are checked numerically.
 
-### Linear regression
+### 3.1 Linear regression
 Linear regression with random generated data.
 The objective function is
 <img src="https://render.githubusercontent.com/render/math?math=f(w) = \frac{1}{N} \sum_i (y_i - x_i^\top w)^2">
 
-### Logistic regression
-Logistic regression with $l$-2 or nonconvex regularization with random generated data or the Gisette dataset or datasets from `libsvmtools`.
+### 3.2 Logistic regression
+Logistic regression with l-2 or nonconvex regularization on randomly generated data, the Gisette dataset, or datasets from `libsvmtools`.
 The objective function is
-<img src="https://render.githubusercontent.com/render/math?math=f(w) = - \frac{1}{N} * \Big(\sum_i y_i \log \frac{1}{1 + exp(w^T x_i)} + (1 - y_i) \log \frac{exp(w^T x_i)}{1 + exp(w^T x_i)} \Big) + \frac{\lambda}{2} \| w \|_2^2 + \alpha \sum_j \frac{w_j^2}{1 + w_j^2}2">
+<img src="https://render.githubusercontent.com/render/math?math=f(w) = - \frac{1}{N} * \Big(\sum_i y_i \log \frac{1}{1 %2B exp(w^T x_i)} %2B (1 - y_i) \log \frac{exp(w^T x_i)}{1 %2B exp(w^T x_i)} \Big) %2B \frac{\lambda}{2} \| w \|_2^2 %2B \alpha \sum_j \frac{w_j^2}{1 %2B w_j^2}">
 
-
-### One-hidden-layer fully-connected neural netowrk
+### 3.3 One-hidden-layer fully-connected neural network
 One-hidden-layer fully-connected neural network with softmax loss on the MNIST dataset.
 
 
-## Implemented optimization algorithms
+## 4. Datasets
+- MNIST
+- Gisette
+- LibSVM data
+- Randomly generated data
+
 
-### Centralized optimization algorithms
+## 5. Optimization algorithms
+
+### 5.1 Centralized optimization algorithms
 - Gradient descent
 - Stochastic gradient descent
 - Nesterov's accelerated gradient descent
 - SVRG
 - SARAH
 
-### Distributed optimization algorithms (i.e. with parameter server)
+### 5.2 Distributed optimization algorithms (i.e. with a parameter server)
 - ADMM
 - DANE
 
-
-### Decentralized optimization algorithms
+### 5.3 Decentralized optimization algorithms
 - Decentralized gradient descent
 - Decentralized stochastic gradient descent
 - Decentralized gradient descent with gradient tracking
 - EXTRA
 - NIDS
+- D2
+- CHOCO-SGD
 - Network-DANE/SARAH/SVRG
 - GT-SARAH
 - DESTRESS
+
+
+## 6. Change log
+
+- Mar-03-2022: Add GPU support, refactor code
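
To make the logistic-regression objective in the README concrete, here is a standalone NumPy sketch of that loss and its gradient, together with the kind of finite-difference check the "checked numerically" claim refers to. This is an illustration only, not the package's `nda.problems.LogisticRegression` implementation; the function names, regularization constants, and data below are made up for the example.

```python
import numpy as np

def logistic_loss(w, X, y, lam=1e-3, alpha=1e-2):
    """f(w) = -1/N * sum_i [ y_i log s(-x_i^T w) + (1 - y_i) log s(x_i^T w) ]
              + lam/2 ||w||^2 + alpha * sum_j w_j^2 / (1 + w_j^2),  s = sigmoid."""
    z = X @ w
    log_sig = lambda t: -np.logaddexp(0.0, -t)   # numerically stable log-sigmoid
    data = -np.mean(y * log_sig(-z) + (1 - y) * log_sig(z))
    return data + 0.5 * lam * (w @ w) + alpha * np.sum(w**2 / (1 + w**2))

def logistic_grad(w, X, y, lam=1e-3, alpha=1e-2):
    N = X.shape[0]
    sig = 1.0 / (1.0 + np.exp(-(X @ w)))
    # d/dw of the data term simplifies to (1/N) X^T (sigmoid(Xw) - (1 - y))
    return X.T @ (sig - (1 - y)) / N + lam * w + 2 * alpha * w / (1 + w**2)**2

# Finite-difference gradient check on random data
rng = np.random.default_rng(0)
N, dim = 100, 5
X = rng.standard_normal((N, dim))
y = rng.integers(0, 2, size=N).astype(float)
w = rng.standard_normal(dim)

eps = 1e-6
num = np.array([(logistic_loss(w + eps * e, X, y) - logistic_loss(w - eps * e, X, y)) / (2 * eps)
                for e in np.eye(dim)])
print('max abs difference:', np.max(np.abs(num - logistic_grad(w, X, y))))  # expect ~1e-8 or smaller
```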

Diff for: experiments/convex/linear_regression.py (+3 −3)

@@ -18,9 +18,9 @@
 
 kappa = 10
 mu = 5e-10
-n_iters = 30
+n_iters = 10
 
-p = LinearRegression(n_agent, m, dim, noise_variance=1, kappa=kappa, graph_type='er', graph_params=0.3)
+p = LinearRegression(n_agent=n_agent, m=m, dim=dim, noise_variance=1, kappa=kappa, graph_type='er', graph_params=0.3)
 W, alpha = generate_mixing_matrix(p)
 
 log.info('m = %d, n = %d, alpha = %.4f' % (m, n_agent, alpha))
@@ -52,6 +52,6 @@
 
 exps = centralized + distributed
 
-res = run_exp(exps, kappa=kappa, max_iter=n_iters, name='linear_regression', n_process=5, save=True)
+res = run_exp(exps, kappa=kappa, max_iter=n_iters, name='linear_regression', n_cpu_processes=4, save=True)
 
 plt.show()
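
Aside from lowering `n_iters`, the substantive changes here are the switch to keyword arguments when constructing the problem and the rename of `run_exp`'s `n_process` argument to `n_cpu_processes`. For orientation, below is a minimal end-to-end sketch assembled only from calls that appear in the experiment diffs of this commit; treat the exact signatures as assumptions rather than documented API, and the step size and iteration counts as placeholders.

```python
# Sketch of the experiment workflow after this commit, assembled from the calls
# visible in the diffs above. Signatures and constants are illustrative only.
import numpy as np
import matplotlib.pyplot as plt

from nda.problems import LinearRegression
from nda.optimizers import *                      # GD, EXTRA, ... as in the scripts above
from nda.optimizers.utils import generate_mixing_matrix
from nda.experiment_utils import run_exp

if __name__ == '__main__':
    n_agent, m, dim, kappa, n_iters = 20, 1000, 40, 10, 10

    # Problem on a random Erdos-Renyi communication graph (keyword arguments,
    # as introduced by this commit)
    p = LinearRegression(n_agent=n_agent, m=m, dim=dim, noise_variance=1,
                         kappa=kappa, graph_type='er', graph_params=0.3)
    W, alpha = generate_mixing_matrix(p)          # mixing matrix for decentralized methods

    x_0 = np.random.rand(dim, n_agent)            # one column of iterates per agent
    eta = 0.01                                    # placeholder; the scripts derive it from p.L and p.sigma

    exps = [
        GD(p, n_iters=n_iters, eta=eta, x_0=x_0.mean(axis=1)),      # centralized baseline
        EXTRA(p, n_iters=n_iters * 20, eta=eta / 2, x_0=x_0, W=W),  # decentralized method
    ]

    # n_cpu_processes replaces the old n_process argument in this commit
    res = run_exp(exps, kappa=kappa, max_iter=n_iters, name='linear_regression',
                  n_cpu_processes=4, save=True)
    plt.show()
```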

Diff for: experiments/convex/logistic_regression.py (+37 −16)

@@ -3,52 +3,73 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-from nda import log
 from nda.problems import LogisticRegression
 from nda.optimizers import *
 from nda.optimizers.utils import generate_mixing_matrix
-from nda.experiment_utils import run_exp
 
+from nda.experiment_utils import run_exp
 
 if __name__ == '__main__':
-
 n_agent = 20
 m = 1000
 dim = 40
 
-kappa = 10
+
+kappa = 10000
+mu = 5e-3
+
+kappa = 100
 mu = 5e-8
-n_iters = 30
 
-p = LogisticRegression(n_agent, m, dim, noise_ratio=0.05, kappa=kappa, graph_type='er', graph_params=0.3)
-W, alpha = generate_mixing_matrix(p)
-log.info('m = %d, n = %d, alpha = %.4f' % (m, n_agent, alpha))
+n_iters = 10
+
+p = LogisticRegression(n_agent=n_agent, m=m, dim=dim, noise_ratio=0.05, graph_type='er', kappa=kappa, graph_params=0.3)
+print(p.n_edges)
+
 
 x_0 = np.random.rand(dim, n_agent)
 x_0_mean = x_0.mean(axis=1)
+W, alpha = generate_mixing_matrix(p)
+print('alpha = ' + str(alpha))
+
 
-eta = 2 / (p.L + p.sigma)
+eta = 2/(p.L + p.sigma)
 n_inner_iters = int(m * 0.05)
 batch_size = int(m / 10)
+batch_size = 10
 n_dgd_iters = n_iters * 20
-n_sarah_iters = n_iters * 20
+n_svrg_iters = n_iters * 20
 n_dsgd_iters = int(n_iters * m / batch_size)
 
-centralized = [
+
+single_machine = [
 GD(p, n_iters=n_iters, eta=eta, x_0=x_0_mean),
 SGD(p, n_iters=n_dsgd_iters, eta=eta*3, batch_size=batch_size, x_0=x_0_mean, diminishing_step_size=True),
 NAG(p, n_iters=n_iters, x_0=x_0_mean),
-SARAH(p, n_iters=n_sarah_iters, n_inner_iters=n_inner_iters, eta=eta / 20, x_0=x_0_mean)
+SVRG(p, n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, x_0=x_0_mean),
+SARAH(p, n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, x_0=x_0_mean),
 ]
 
+
 distributed = [
-DGD_tracking(p, n_iters=n_dgd_iters, eta=eta / 10, x_0=x_0, W=W),
-DANE(p, n_iters=n_iters, mu=mu, x_0=x_0_mean),
+DGD_tracking(p, n_iters=n_dgd_iters, eta=eta/10, x_0=x_0, W=W),
+DSGD(p, n_iters=n_dsgd_iters, eta=eta*2, batch_size=batch_size, x_0=x_0, W=W, diminishing_step_size=True),
+EXTRA(p, n_iters=n_dgd_iters, eta=eta/2, x_0=x_0, W=W),
+NIDS(p, n_iters=n_dgd_iters, eta=eta, x_0=x_0, W=W),
+
+ADMM(p, n_iters=n_iters, rho=1, x_0=x_0_mean),
+DANE(p, n_iters=n_iters, mu=mu, x_0=x_0_mean)
+]
+
+network = [
+NetworkSVRG(p, n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, mu=mu, x_0=x_0, W=W, batch_size=batch_size),
+NetworkSARAH(p, n_iters=n_svrg_iters, n_inner_iters=n_inner_iters, eta=eta/20, mu=mu, x_0=x_0, W=W, batch_size=batch_size),
 NetworkDANE(p, n_iters=n_iters, mu=mu, x_0=x_0, W=W),
 ]
 
-exps = centralized + distributed
+exps = single_machine + distributed + network
+
+res = run_exp(exps, kappa=kappa, max_iter=n_iters, name='logistic_regression', n_cpu_processes=4, save=True)
 
-res = run_exp(exps, kappa=kappa, max_iter=n_iters, name='logistic_regression', n_process=1, save=True)
 
 plt.show()
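
The commit message adds D2 and CHOCO-SGD, and the updated README lists them among the decentralized algorithms, but this experiment script does not exercise them yet. If their constructors follow the pattern of the other decentralized optimizers above, wiring them in might look roughly like the following; the class names and argument lists are assumptions (CHOCO-SGD typically also needs compression-related parameters whose names are not shown in this commit), so check `nda.optimizers` before copying this.

```python
# Hypothetical extension of the script above (reuses its p, eta, x_0, W, etc.).
# Class names and argument lists are assumed to mirror the other decentralized
# optimizers in this file; they are NOT taken from the package.
decentralized_extra = [
    D2(p, n_iters=n_dgd_iters, eta=eta / 2, x_0=x_0, W=W),
    # CHOCO-SGD also relies on communication compression (compression operator,
    # consensus step size); those arguments are omitted because their names are
    # not visible in this commit.
    CHOCO_SGD(p, n_iters=n_dsgd_iters, eta=eta / 2, batch_size=batch_size,
              x_0=x_0, W=W),
]

exps = single_machine + distributed + network + decentralized_extra
```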

Diff for: experiments/non_convex/gisette_classification.py (−35)

This file was deleted.

0 commit comments
