histogrammar
diff --git a/‎.travis.yml‎
Lines changed: 2 additions & 3 deletions b/‎.travis.yml‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎MANIFEST.in‎
Lines changed: 3 additions & 0 deletions b/‎MANIFEST.in‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎NOTICE‎
Lines changed: 38 additions & 0 deletions b/‎NOTICE‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 0 additions & 65 deletions b/‎README.md‎
Lines changed: 0 additions & 65 deletions
diff --git a/‎README.rst‎
Lines changed: 146 additions & 0 deletions b/‎README.rst‎
Lines changed: 146 additions & 0 deletions
diff --git a/‎histogrammar/__init__.py‎
Lines changed: 32 additions & 27 deletions b/‎histogrammar/__init__.py‎
Lines changed: 32 additions & 27 deletions
@@ -4,11 +4,10 @@ os:
   - linux
 
 python:
-  - 2.7
-  - 3.4
-  - 3.5
   - 3.6
   - 3.7
+  - 3.8
+  - 3.9
 
 addons:
   apt:
 
@@ -0,0 +1,3 @@
+include requirements.txt
+include LICENSE
+include NOTICE
@@ -0,0 +1,38 @@
+################################################################################################
+#
+# NOTICE: pass-through licensing of bundled components
+#
+# Histogrammar gathers together a toolkit of pre-existing third-party
+# open-source software components. These software components are governed by their own licenses,
+# which Histogrammar does not modify or supersede; please consult the originating
+# authors. These components altogether have a mixture of the following licenses: Apache 2.0, MIT.
+#
+# Although we have examined the licenses to verify acceptance of commercial and non-commercial
+# use, please see and consult the original licenses or authors.
+#
+# Here is the full list of license dependencies:
+#
+# numpy: https://github.com/numpy/numpy/blob/master/LICENSE.txt
+# tqdm: https://github.com/tqdm/tqdm/blob/master/LICENCE
+# matplotlib: https://github.com/matplotlib/matplotlib/blob/master/LICENSE/LICENSE
+# joblib: https://github.com/joblib/joblib/blob/master/LICENSE.txt
+# root: https://root.cern.ch/license
+# popmon: https://github.com/ing-bank/popmon/blob/master/LICENSE
+#
+# There are several functions/classes where code or techniques have been reproduced and/or modified
+# from existing open-source packages. We list these here:
+#
+# Package: popmon
+# histogrammar file: histogrammar/dfinterface/spark_histogrammar.py
+#    Class: SparkHistogrammar
+#    Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/spark_histogrammar.py
+# histogrammar file: histogrammar/dfinterface/pandas_histogrammar.py
+#    Class: PandasHistogrammar
+#    Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/pandas_histogrammar.py
+# histogrammar file: histogrammar/dfinterface/histogram_filler_base.py
+#    Class: HistogramFillerBase
+#    Reference: https://github.com/ing-bank/popmon/blob/master/popmon/hist/filling/histogram_filler_base.py
+# License: MIT
+#    For details see: https://github.com/ing-bank/popmon/blob/master/LICENSE
+#
+################################################################################################
@@ -0,0 +1,146 @@
+==================================
+histogrammar Python implementation
+==================================
+
+histogrammar is a Python package for creating histograms. histogrammar has multiple histogram types,
+supports numeric and categorical features, and works with Numpy arrays and Pandas and Spark dataframes.
+Once a histogram is filled, it's easy to plot it, store it in JSON format (and retrieve it), or convert
+it to Numpy arrays for further analysis.
+
+At its core histogrammar is a suite of data aggregation primitives designed for use in parallel processing.
+In the simplest case, you can use this to compute histograms, but the generality of the primitives
+allows much more.
+
+Several common histogram types can be plotted in Matplotlib, Bokeh and PyROOT with a single method call.
+If Numpy or Pandas is available, histograms and other aggregators can be filled from arrays ten to a hundred times
+more quickly via Numpy commands, rather than Python for loops. If PyROOT is available, histograms and other
+aggregators can be filled from ROOT TTrees hundreds of times more quickly by JIT-compiling a specialized C++ filler.
+Histograms and other aggregators may also be converted into CUDA code for inclusion in a GPU workflow. And if
+PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling the CUDA code.
+This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.
+
+Latest Python release: v1.0.20 (Feb 2021).
+
+Announcements
+=============
+
+Spark 3.0
+---------
+
+With Spark 3.0, based on Scala 2.12, make sure to pick up the correct histogrammar jar file:
+
+.. code-block:: python
+
+  spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar-sparksql_2.12:1.0.11").getOrCreate()
+
+For Spark 2.X compiled against Scala 2.11, in the string above simply replace "2.12" with "2.11".
+
+February, 2021
+
+Example notebooks
+=================
+
+.. list-table::
+   :widths: 80 20
+   :header-rows: 1
+
+   * - Tutorial
+     - Colab link
+   * - `Basic tutorial <https://nbviewer.jupyter.org/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_basic.ipynb>`_
+     - |notebook_basic_colab|
+   * - `Detailed example (featuring configuration, Apache Spark and more) <https://nbviewer.jupyter.org/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb>`_
+     - |notebook_advanced_colab|
+
+Documentation
+=============
+
+See `histogrammar-docs <https://histogrammar.github.io/histogrammar-docs/>`_ for a complete introduction to `histogrammar`.
+(A bit old but still good.) There you can also find documentation about the Scala implementation of `histogrammar`.
+
+Check it out
+============
+
+The `histogrammar` library requires Python 3.6+ and is pip friendly. To get started, simply do:
+
+.. code-block:: bash
+
+  $ pip install histogrammar
+
+or check out the code from our GitHub repository:
+
+.. code-block:: bash
+
+  $ git clone https://github.com/histogrammar/histogrammar-python
+  $ pip install -e histogrammar-python
+
+where in this example the code is installed in edit mode (option -e).
+
+You can now use the package in Python with:
+
+.. code-block:: python
+
+  import histogrammar
+
+**Congratulations, you are now ready to use the histogrammar library!**
+
+Quick run
+=========
+
+As a quick example, you can do:
+
+.. code-block:: python
+
+  import pandas as pd
+  import histogrammar as hg
+  from histogrammar import resources
+
+  # open synthetic data
+  df = pd.read_csv(resources.data('test.csv.gz'), parse_dates=['date'])
+  df.head()
+
+  # create a histogram, tell it to look for column 'age'
+  # fill the histogram with column 'age' and plot it
+  hist = hg.Histogram(num=100, low=0, high=100, quantity='age')
+  hist.fill.numpy(df)
+  hist.plot.matplotlib()
+
+  # generate histograms of all features in the dataframe using automatic binning
+  # (importing histogrammar automatically adds this functionality to a pandas or spark dataframe)
+  hists = df.hg_make_histograms()
+  print(hists.keys())
+
+  # multi-dimensional histograms are also supported. e.g. features longitude vs latitude
+  hists = df.hg_make_histograms(features=['longitude:latitude'])
+  ll = hists['longitude:latitude']
+  ll.plot.matplotlib()
+
+  # store histogram and retrieve it again
+  ll.toJsonFile('longitude_latitude.json')
+  ll2 = hg.Factory().fromJsonFile('longitude_latitude.json')
+
+
+These examples also work with Spark dataframes. For more examples please see the notebooks and tutorials.
+
+
+Project contributors
+====================
+
+This package was originally authored by DIANA-HEP and is now maintained by volunteers.
+
+Contact and support
+===================
+
+* Issues & Ideas & Support: https://github.com/histogrammar/histogrammar-python/issues
+
+Please note that `histogrammar` is supported only on a best-effort basis.
+
+License
+=======
+`histogrammar` is completely free, open-source and licensed under the `Apache-2.0 license <https://en.wikipedia.org/wiki/Apache_License>`_.
+
+.. |notebook_basic_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
+    :alt: Open in Colab
+    :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_basic.ipynb
+.. |notebook_advanced_colab| image:: https://colab.research.google.com/assets/colab-badge.svg
+    :alt: Open in Colab
+    :target: https://colab.research.google.com/github/histogrammar/histogrammar-python/blob/master/histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
@@ -1,42 +1,47 @@
+# flake8: noqa
+
 #!/usr/bin/env python
 
 # Copyright 2016 DIANA-HEP
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from histogrammar.defs import *
+from histogrammar.defs import Factory, Container
+
+from histogrammar.primitives.average import Average
+from histogrammar.primitives.bag import Bag
+from histogrammar.primitives.bin import Bin
+from histogrammar.primitives.categorize import Categorize
+from histogrammar.primitives.centrallybin import CentrallyBin
+from histogrammar.primitives.collection import Collection, Branch, Index, Label, UntypedLabel
+from histogrammar.primitives.count import Count
+from histogrammar.primitives.deviate import Deviate
+from histogrammar.primitives.fraction import Fraction
+from histogrammar.primitives.irregularlybin import IrregularlyBin
+from histogrammar.primitives.minmax import Minimize, Maximize
+from histogrammar.primitives.select import Select
+from histogrammar.primitives.sparselybin import SparselyBin
+from histogrammar.primitives.stack import Stack
+from histogrammar.primitives.sum import Sum
 
-from histogrammar.primitives.average import *
-from histogrammar.primitives.bag import *
-from histogrammar.primitives.bin import *
-from histogrammar.primitives.categorize import *
-from histogrammar.primitives.centrallybin import *
-from histogrammar.primitives.collection import *
-from histogrammar.primitives.count import *
-from histogrammar.primitives.deviate import *
-from histogrammar.primitives.fraction import *
-from histogrammar.primitives.irregularlybin import *
-from histogrammar.primitives.minmax import *
-from histogrammar.primitives.select import *
-from histogrammar.primitives.sparselybin import *
-from histogrammar.primitives.stack import *
-from histogrammar.primitives.sum import *
+from histogrammar.convenience import Histogram
+from histogrammar.convenience import SparselyHistogram
+from histogrammar.convenience import Profile
+from histogrammar.convenience import SparselyProfile
+from histogrammar.convenience import ProfileErr
+from histogrammar.convenience import SparselyProfileErr
+from histogrammar.convenience import TwoDimensionallyHistogram
+from histogrammar.convenience import TwoDimensionallySparselyHistogram
 
-from histogrammar.specialized import Histogram
-from histogrammar.specialized import SparselyHistogram
-from histogrammar.specialized import Profile
-from histogrammar.specialized import SparselyProfile
-from histogrammar.specialized import ProfileErr
-from histogrammar.specialized import SparselyProfileErr
-from histogrammar.specialized import TwoDimensionallyHistogram
-from histogrammar.specialized import TwoDimensionallySparselyHistogram
+# handy monkey patch functions for pandas and spark dataframes
+import histogrammar.dfinterface
-Original file line number
+Diff line change
   - linux
 python:
 -  - 2.7
 -  - 3.4
 -  - 3.5
   - 3.6
   - 3.7
 +  - 3.8
 +  - 3.9
 addons:
   apt:
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+include requirements.txt`
	`2`	`+include LICENSE`
	`3`	`+include NOTICE`