Merge pull request #292 from vmarkovtsev/master

vmarkovtsev · web-flow · commit 3a8653cc54b6 · 2019-06-21T12:42:15.000+02:00
Added 3D overwrites visual
diff --git a/README.md b/README.md
@@ -39,7 +39,7 @@ Table of Contents
       * [Project burndown](#project-burndown)
       * [Files](#files)
       * [People](#people)
-      * [Churn matrix](#churn-matrix)
+      * [Overwrites matrix](#overwrites-matrix)
       * [Code ownership](#code-ownership)
       * [Couples](#couples)
       * [Structural hotness](#structural-hotness)
@@ -214,14 +214,14 @@ If `--people-dict` is specified, it should point to a text file with the custom
 format is: every line is a single developer, it contains all the matching emails and names separated
 by `|`. The case is ignored.
 
-#### Churn matrix
+#### Overwrites matrix
 
-![Wireshark top 20 churn matrix](doc/wireshark_churn_matrix.png)
-<p align="center">Wireshark top 20 devs - churn matrix</p>
+![Wireshark top 20 overwrites matrix](doc/wireshark_overwrites_matrix.png)
+<p align="center">Wireshark top 20 devs - overwrites matrix</p>
 
 ```
 hercules --burndown --burndown-people [--people-dict=/path/to/identities]
-labours -m churn-matrix
+labours -m overwrites-matrix
 ```
 
 Beside the burndown information, `--burndown-people` collects the added and deleted line statistics per
diff --git a/doc/wireshark_overwrites_matrix.png b/doc/wireshark_overwrites_matrix.png
diff --git a/python/labours/labours.py b/python/labours/labours.py
@@ -58,13 +58,13 @@ def parse_args():
     parser.add_argument("--size", help="Axes' size in inches, for example \"12,9\"")
     parser.add_argument("--relative", action="store_true",
                         help="Occupy 100%% height for every measurement.")
-    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
+    parser.add_argument("--tmpdir", help="Temporary directory for intermediate files.")
     parser.add_argument("-m", "--mode",
                         choices=["burndown-project", "burndown-file", "burndown-person",
-                                 "churn-matrix", "ownership", "couples-files", "couples-people",
-                                 "couples-shotness", "shotness", "sentiment", "devs",
-                                 "devs-efforts", "old-vs-new", "all", "run-times", "languages",
-                                 "devs-parallel"],
+                                 "overwrites-matrix", "ownership", "couples-files",
+                                 "couples-people", "couples-shotness", "shotness", "sentiment",
+                                 "devs", "devs-efforts", "old-vs-new", "all", "run-times",
+                                 "languages", "devs-parallel"],
                         help="What to plot.")
     parser.add_argument(
         "--resample", default="year",
@@ -82,7 +82,7 @@ def parse_args():
     parser.add_argument("--disable-projector", action="store_true",
                         help="Do not run Tensorflow Projector on couples.")
     parser.add_argument("--max-people", default=20, type=int,
-                        help="Maximum number of developers in churn matrix and people plots.")
+                        help="Maximum number of developers in overwrites matrix and people plots.")
     args = parser.parse_args()
     return args
 
@@ -716,18 +716,19 @@ def load_ownership(header, sequence, contents, max_people):
     return sequence, people, date_range_sampling, last
 
 
-def load_churn_matrix(people, matrix, max_people):
+def load_overwrites_matrix(people, matrix, max_people, normalize=True):
     matrix = matrix.astype(float)
     if matrix.shape[0] > max_people:
         order = numpy.argsort(-matrix[:, 0])
         matrix = matrix[order[:max_people]][:, [0, 1] + list(2 + order[:max_people])]
         people = [people[i] for i in order[:max_people]]
         print("Warning: truncated people to most productive %d" % max_people)
-    zeros = matrix[:, 0] == 0
-    matrix[zeros, :] = 1
-    matrix /= matrix[:, 0][:, None]
+    if normalize:
+        zeros = matrix[:, 0] == 0
+        matrix[zeros, :] = 1
+        matrix /= matrix[:, 0][:, None]
+        matrix[zeros, :] = 0
     matrix = -matrix[:, 1:]
-    matrix[zeros, :] = 0
     for i, name in enumerate(people):
         if len(name) > 40:
             people[i] = name[:37] + "..."
@@ -907,11 +908,11 @@ def plot_many_burndown(args, target, header, parts):
     sys.stdout.write(stdout.getvalue())
 
 
-def plot_churn_matrix(args, repo, people, matrix):
+def plot_overwrites_matrix(args, repo, people, matrix):
     if args.output and args.output.endswith(".json"):
         data = locals().copy()
         del data["args"]
-        data["type"] = "churn_matrix"
+        data["type"] = "overwrites_matrix"
         if args.mode == "all":
             output = get_plot_path(args.output, "matrix")
         else:
@@ -1410,24 +1411,9 @@ def order_commits(chosen_people, days, people):
     series = list(devseries.values())
     for i, s in enumerate(series):
         arr = numpy.array(s).transpose().astype(numpy.float32)
-        commits = arr[1]
-        if len(commits) < 7:
-            commits /= commits.max()
-        else:
-            # 4 is sizeof(float32)
-            windows = numpy.lib.stride_tricks.as_strided(commits, [len(commits) - 6, 7], [4, 4])
-            commits = numpy.concatenate((
-                [windows[0, 0] / windows[0].max(),
-                 windows[0, 1] / windows[0].max(),
-                 windows[0, 2] / windows[0].max()],
-                windows[:, 3] / windows.max(axis=1),
-                [windows[-1, 4] / windows[-1].max(),
-                 windows[-1, 5] / windows[-1].max(),
-                 windows[-1, 6] / windows[-1].max()]
-            ))
-        arr[1] = commits * 7  # 7 is a pure heuristic here and is not related to the window size
+        arr[1] /= arr[1].sum()
         series[i] = arr.transpose()
-    # calculate the distance matrix using dynamic time warping metric
+    # calculate the distance matrix using dynamic time warping
     dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
     for x, serx in enumerate(series):
         dists[x, x] = 0
@@ -1450,8 +1436,7 @@ def hdbscan_cluster_routed_series(dists, route):
     try:
         from hdbscan import HDBSCAN
     except ImportError as e:
-        print("Cannot import ortools: %s\nInstall it from "
-              "https://developers.google.com/optimization/install/python/" % e)
+        print("Cannot import hdbscan: %s" % e)
         sys.exit(1)
 
     opt_dist_chain = numpy.cumsum(numpy.array(
@@ -1799,12 +1784,22 @@ def people_burndown():
         except KeyError:
             print("people: " + burndown_people_warning)
 
-    def churn_matrix():
+    def overwrites_matrix():
         try:
-            plot_churn_matrix(args, name, *load_churn_matrix(
+
+            plot_overwrites_matrix(args, name, *load_overwrites_matrix(
                 *reader.get_people_interaction(), max_people=args.max_people))
+            people, matrix = load_overwrites_matrix(
+                *reader.get_people_interaction(), max_people=1000000, normalize=False)
+            from scipy.sparse import csr_matrix
+            matrix = matrix[:, 1:]
+            matrix = numpy.triu(matrix) + numpy.tril(matrix).T
+            matrix = matrix + matrix.T
+            matrix = csr_matrix(matrix)
+            write_embeddings("overwrites", args.output, not args.disable_projector,
+                             *train_embeddings(people, matrix, tmpdir=args.tmpdir))
         except KeyError:
-            print("churn_matrix: " + burndown_people_warning)
+            print("overwrites_matrix: " + burndown_people_warning)
 
     def ownership_burndown():
         try:
@@ -1822,23 +1817,23 @@ def couples_files():
         try:
             write_embeddings("files", args.output, not args.disable_projector,
                              *train_embeddings(*reader.get_files_coocc(),
-                                               tmpdir=args.couples_tmp_dir))
+                                               tmpdir=args.tmpdir))
         except KeyError:
             print(couples_warning)
 
     def couples_people():
         try:
             write_embeddings("people", args.output, not args.disable_projector,
                              *train_embeddings(*reader.get_people_coocc(),
-                                               tmpdir=args.couples_tmp_dir))
+                                               tmpdir=args.tmpdir))
         except KeyError:
             print(couples_warning)
 
     def couples_shotness():
         try:
             write_embeddings("shotness", args.output, not args.disable_projector,
                              *train_embeddings(*reader.get_shotness_coocc(),
-                                               tmpdir=args.couples_tmp_dir))
+                                               tmpdir=args.tmpdir))
         except KeyError:
             print(shotness_warning)
 
@@ -1916,7 +1911,7 @@ def devs_parallel():
         "burndown-project": project_burndown,
         "burndown-file": files_burndown,
         "burndown-person": people_burndown,
-        "churn-matrix": churn_matrix,
+        "overwrites-matrix": overwrites_matrix,
         "ownership": ownership_burndown,
         "couples-files": couples_files,
         "couples-people": couples_people,
@@ -1936,7 +1931,7 @@ def devs_parallel():
         project_burndown()
         files_burndown()
         people_burndown()
-        churn_matrix()
+        overwrites_matrix()
         ownership_burndown()
         couples_files()
         couples_people()
diff --git a/python/setup.py b/python/setup.py
@@ -15,7 +15,7 @@
     description="Python companion for github.com/src-d/hercules to visualize the results.",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="10.1.0",
+    version="10.2.0",
     license="Apache-2.0",
     author="source{d}",
     author_email="machine-learning@sourced.tech",