diff --git a/Moments Cube Creation.ipynb b/Moments Cube Creation.ipynb new file mode 100644 index 000000000..22fec53e0 --- /dev/null +++ b/Moments Cube Creation.ipynb @@ -0,0 +1,5415 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def moment(n):\n", + " def moment_(x):\n", + " return np.sum(np.power(x, n))\n", + " moment_.__name__ = 'm%s' % n\n", + " return moment_\n", + "\n", + "def log_moment(n):\n", + " def log_moment_(x):\n", + " return np.sum(np.power(np.log(x), n))\n", + " log_moment_.__name__ = 'lm%s' % n\n", + " return log_moment_\n", + "\n", + "def log_min():\n", + " def log_min_(x):\n", + " return np.min(np.log(x))\n", + " log_min_.__name__ = 'lmin'\n", + " return log_min_\n", + "\n", + "def log_max():\n", + " def log_max_(x):\n", + " return np.max(np.log(x))\n", + " log_max_.__name__ = 'lmax'\n", + " return log_max_\n", + "\n", + "def outliers(t, name):\n", + " def outliers_(x):\n", + " return np.sum(x >= t)\n", + " outliers_.__name__ = 'outliers%s' % name\n", + " return outliers_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Milan" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# column_names = [\"Grid\", \"Time\", \"Country\", \"SMSin\", \"SMSout\", \"Callin\", \"Callout\", \"Internet\"]\n", + "# milan_data = pd.DataFrame(columns=column_names)\n", + "# for i in range(1, 6):\n", + "# milan_data_day = pd.read_csv('~/Downloads/sms-call-internet-mi-2013-11-0' + str(i) + '.txt',\n", + "# sep='\\t', header=None,\n", + "# names=column_names)\n", + "# milan_data = milan_data.append(milan_data_day)\n", + "# milan_data.to_csv('~/msketch/javamsketch/src/test/resources/milan_5day.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "milan_data = pd.read_csv('~/msketch/javamsketch/src/test/resources/milan_5day.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 239, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAD8CAYAAACPWyg8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFDVJREFUeJzt3W+MXFd5x/HvQyAQpUAcEraWHcmRsFoZLAJZJa7gxZaU\nxEmqOlQQBUXEoSmuRCJRYak4baW0BKSARFOoIJJLrDhVIVi0USziYNzgEeoLkz9A8z/KNhjFKycW\ncXDYIIKWPn0xx2ayO7s7Z727d3bn+5FGvnPm3HvOHI/3t+fcO9eRmUiSVON1TXdAkrT0GB6SpGqG\nhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqq9vpdKEXEQ+CXwW2AiM4cj4kzgW8Aa4CBw\nZWa+FBEBfBm4DPgVcG1m/qgcZzPw9+Wwn8vMnaX8fOAO4DRgD/CpzMzp2pipr2eddVauWbOml7e1\nJL3yyiucfvrpTXej7zguUzkmUzkm3b3yyis89dRTP8/Ms3veKTNnfdD+wX3WpLIvAtvK9jbgC2X7\nMuA+IIANwA9L+ZnAs+XPFWV7RXntgVI3yr6XztTGTI/zzz8/l7P9+/c33YW+5LhM5ZhM5Zh0t3//\n/gQeyh7y4PjjZJatNgE7y/ZO4IqO8jtLnw4AZ0TESuASYF9mHs327GEfsLG89pbMPJCZCdw56Vjd\n2pAkNajX8EjgexHxcERsKWVDmXm4bD8PDJXtVcBzHfseKmUzlR/qUj5TG5KkBvV0zgN4f2aORcTb\ngX0R8VTni5mZEbGgt+edqY0SaFsAhoaGaLVaC9mVRo2Pjy/r9zdXjstUjslUjkl34+Pj1fv0FB6Z\nOVb+PBIRdwMXAC9ExMrMPFyWno6U6mPAOR27ry5lY8DIpPJWKV/dpT4ztDG5f9uB7QDDw8M5MjLS\nrdqy0Gq1WM7vb64cl6kck6kck+7mEqizLltFxOkR8ebj28DFwGPAbmBzqbYZuKds7wauibYNwLGy\n9LQXuDgiVkTEinKcveW1lyNiQ7lS65pJx+rWhiSpQb3MPIaAu9s/13k98I3M/G5EPAjsiojrgJ8B\nV5b6e2hfcTVK+1LdjwNk5tGIuBl4sNT7bGYeLduf5HeX6t5XHgC3TNOGJKlBs4ZHZj4LvLtL+YvA\nRV3KE7h+mmPtAHZ0KX8IeFevbUiSmuU3zCVJ1QwPSVK1Xi/VlaRFtWbbvSe2D95yeYM9UTfOPCRJ\n1Zx5SOp7zkL6j+EhqW90hoT6m8tWkqRqhockqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKq\nGR6SpGp+w1xSo/xW+dLkzEOSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVfNSXUlLiv+rYH8wPCQt\nOr/bsfS5bCVJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqr1HB4R\ncUpE/DgivlOenxsRP4yI0Yj4VkScWsrfWJ6PltfXdBzjxlL+dERc0lG+sZSNRsS2jvKubUhaetZs\nu/fEQ0tfzczjU8CTHc+/ANyame8AXgKuK+XXAS+V8ltLPSJiHXAV8E5gI/C1EkinAF8FLgXWAR8t\ndWdqQ5LUoJ7CIyJWA5cDXy/PA/gA8O1SZSdwRdneVJ5TXr+o1N8E3JWZr2bmT4FR4ILyGM3MZzPz\nN8BdwKZZ2pAkNajXu+r+M/A3wJvL87cBv8jMifL8ELCqbK8CngPIzImIOFbqrwIOdByzc5/nJpVf\nOEsbrxERW4AtAENDQ7RarR7f1tIzPj6+rN/fXDkuU/XbmGxdPzF7pUq176/fxqRfjI+PV+8za3hE\nxJ8CRzLz4YgYmUO/Flxmbge2AwwPD+fIyEizHVpArVaL5fz+5spxmarfxuTaBTjXcfDqkar6/TYm\n/WIugdrLzON9wJ9FxGXAm4C3AF8GzoiI15eZwWpgrNQfA84BDkXE64G3Ai92lB/XuU+38hdnaEOS\n1KBZz3lk5o2ZuToz19A+4f39zLwa2A98uFTbDNxTtneX55TXv5+ZWcqvKldjnQusBR4AHgTWliur\nTi1t7C77TNeGJKlBJ/M9j88An46IUdrnJ24v5bcDbyvlnwa2AWTm48Au4Angu8D1mfnbMqu4AdhL\n+2quXaXuTG1IkhpU9d/QZmYLaJXtZ2lfKTW5zq+Bj0yz/+eBz3cp3wPs6VLetQ1JUrP8hrkkqVrV\nzEOS+knnt9UP3nJ5gz0ZPIaHtID84ablymUrSVI1w0OSVM1lK2keuDylQWN4SFow3n59+XLZSpJU\nzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjVvjCjN\nkTf90yBz5iFJqmZ4SJKqGR6SpGqGhySpmuEhSarm1VaS5pVXoQ0Gw0PSsjA5tA7ecnlDPRkMhoc0\nz/zNW4PAcx6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqdqs4RERb4qIByLifyLi8Yj4x1J+bkT8\nMCJGI+JbEXFqKX9jeT5aXl/TcawbS/nTEXFJR/nGUjYaEds6yru2IUlqVi8zj1eBD2Tmu4HzgI0R\nsQH4AnBrZr4DeAm4rtS/DniplN9a6hER64CrgHcCG4GvRcQpEXEK8FXgUmAd8NFSlxnakBqxZtu9\nJx7SIJs1PLJtvDx9Q3kk8AHg26V8J3BF2d5UnlNevygiopTflZmvZuZPgVHggvIYzcxnM/M3wF3A\nprLPdG1IkhrU0zfMy+zgYeAdtGcJ/wv8IjMnSpVDwKqyvQp4DiAzJyLiGPC2Un6g47Cd+zw3qfzC\nss90bUzu3xZgC8DQ0BCtVquXt7UkjY+PL+v3N1eLNS5b10/MXmkai/331tRn5WTGaD51e+/+++lu\nfHx89kqT9BQemflb4LyIOAO4G/jD6pYWUGZuB7YDDA8P58jISLMdWkCtVovl/P7marHG5dqTWK46\nePXI/HWkB019Vk5mjOZTt/H23093cwnUqqutMvMXwH7gj4AzIuJ4+KwGxsr2GHAOQHn9rcCLneWT\n9pmu/MUZ2pAkNaiXq63OLjMOIuI04IPAk7RD5MOl2mbgnrK9uzynvP79zMxSflW5GutcYC3wAPAg\nsLZcWXUq7ZPqu8s+07UhSWpQL8tWK4Gd5bzH64BdmfmdiHgCuCsiPgf8GLi91L8d+LeIGAWO0g4D\nMvPxiNgFPAFMANeX5TAi4gZgL3AKsCMzHy/H+sw0bUiSGjRreGTmI8B7upQ/S/tKqcnlvwY+Ms2x\nPg98vkv5HmBPr21Ikprl/+chzcLvdEhTGR7SIukMIf+XOy113ttKklTNmYekk+bS3uBx5iFJqubM\nQ2qA5z+01BkekpYlA3phuWwlSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqp5qa7UsKV6SanfKh9s\nzjwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1b08iqWfekkTH\nOfOQJFUzPCRJ1QwPSVI1z3lIWvaOn6vZun6CkWa7smw485AkVTM8JEnVDA9JUjXPeWigLdX/AlZq\n2qwzj4g4JyL2R8QTEfF4RHyqlJ8ZEfsi4pny54pSHhHxlYgYjYhHIuK9HcfaXOo/ExGbO8rPj4hH\nyz5fiYiYqQ1JUrN6WbaaALZm5jpgA3B9RKwDtgH3Z+Za4P7yHOBSYG15bAFug3YQADcBFwIXADd1\nhMFtwCc69ttYyqdrQ5LUoFnDIzMPZ+aPyvYvgSeBVcAmYGepthO4omxvAu7MtgPAGRGxErgE2JeZ\nRzPzJWAfsLG89pbMPJCZCdw56Vjd2pAkNajqhHlErAHeA/wQGMrMw+Wl54Ghsr0KeK5jt0OlbKby\nQ13KmaENSVKDej5hHhG/B/wH8NeZ+XI5LQFAZmZE5AL0r6c2ImIL7SUyhoaGaLVaC9mVRo2Pjy/r\n9zdXcx2XresnTmxPt39nnYU2n3+38/VZeXTs2IntretP+nCNGjptfsd4uRgfH6/ep6fwiIg30A6O\nf8/M/yzFL0TEysw8XJaejpTyMeCcjt1Xl7IxeM2XO1cDrVK+ukv9mdp4jczcDmwHGB4ezpGRkW7V\nloVWq8Vyfn9zNddxubbzaquru+9/7SLeSXa6PszFfH1WFvP9L7St6ye40n8/U8wlUHu52iqA24En\nM/OfOl7aDRy/YmozcE9H+TXlqqsNwLGy9LQXuDgiVpQT5RcDe8trL0fEhtLWNZOO1a0NSVKDepl5\nvA/4GPBoRPyklP0tcAuwKyKuA34GXFle2wNcBowCvwI+DpCZRyPiZuDBUu+zmXm0bH8SuAM4Dbiv\nPJihDUlSg2YNj8z8byCmefmiLvUTuH6aY+0AdnQpfwh4V5fyF7u1IS1XfmlRS4W3J5EkVTM8JEnV\nDA9JUjXDQ5JUzbvqSoUnq6XeOfOQJFVz5qGBs2YZfWNa9Zxhzg/DQ9IUBqxm47KVJKma4SFJqmZ4\nSJKqec5D6sI1f2lmzjwkSdUMD0lSNcNDklTNcx5Sn/LLbOpnzjwkSdWceWggePWUNL+ceUiSqjnz\nkDSwPK80d848JEnVnHlIS4C/IavfGB6SAC8qUB2XrSRJ1QwPSVI1w0OSVM1zHtISM/nchCfQ1QRn\nHpKkaoaHJKma4SFJquY5D0nCL2LWMjykAeYXAzVXLltJkqo589Cy5W/V0sKZdeYRETsi4khEPNZR\ndmZE7IuIZ8qfK0p5RMRXImI0Ih6JiPd27LO51H8mIjZ3lJ8fEY+Wfb4SETFTG5Kk5vWybHUHsHFS\n2Tbg/sxcC9xfngNcCqwtjy3AbdAOAuAm4ELgAuCmjjC4DfhEx34bZ2lDUoc12+498ZAWy6zhkZk/\nAI5OKt4E7CzbO4ErOsrvzLYDwBkRsRK4BNiXmUcz8yVgH7CxvPaWzDyQmQncOelY3dqQJDVsrifM\nhzLzcNl+Hhgq26uA5zrqHSplM5Uf6lI+UxuSpIad9AnzzMyIyPnozFzbiIgttJfJGBoaotVqLWR3\nGjU+Pr6s399cdRuXresnmulMgzrHoJfPyqCN0dBpvb3nQfs3Nj4+Xr3PXMPjhYhYmZmHy9LTkVI+\nBpzTUW91KRsDRiaVt0r56i71Z2pjiszcDmwHGB4ezpGRkemqLnmtVovl/P7m6vi4vHbdf/AuJjx4\n9ciJ7V4+K9cO2HmSresn+NKjs38uOsdxEMwlLOe6bLUbOH7F1Gbgno7ya8pVVxuAY2XpaS9wcUSs\nKCfKLwb2ltdejogN5SqrayYdq1sbkqSGzRrBEfFN2rOGsyLiEO2rpm4BdkXEdcDPgCtL9T3AZcAo\n8Cvg4wCZeTQibgYeLPU+m5nHT8J/kvYVXacB95UHM7QhaRqdM687Np4+ax1prmYNj8z86DQvXdSl\nbgLXT3OcHcCOLuUPAe/qUv5itzYk9ebRsWMnlqW8V1Md73M1O29PIkmqZnhoyVuz7V4eHTvmcoy0\niAwPSVK1wbuWURpAzso035x5SJKqGR6SpGqGhySpmuc8tCS5hi81y5mHJKma4SFJqmZ4SJKqec5D\nkmbgfa66Mzy0ZHiSXOofhof6lmEh9S/PeUiSqhkekqRqhockqZrhIUmq5glzNc4T49LSY3hIUo/8\nzsfvGB5qhLMNaWnznIckqZrhIUmq5rKVFo1LVdLy4cxDklTN8JAkVXPZSgvKpSotV4N+2a7hoXln\nYEjLn8tWkqRqzjw0Z84wpMFleKiKgSFNNYjnPwwPzcrAkDSZ4aETBvG3J0lzY3hI0jwalF/C+j48\nImIj8GXgFODrmXlLw11aVqZbknKpStJM+jo8IuIU4KvAB4FDwIMRsTszn2i2Z0uPYSAtvuU8C+nr\n8AAuAEYz81mAiLgL2AQMZHis2XYvW9dPcO22e1/zQTQYpP633IKk38NjFfBcx/NDwIUN9WVenewP\nfANDWrqWQ5BEZjbdh2lFxIeBjZn5l+X5x4ALM/OGSfW2AFvK0z8Anl7Uji6us4CfN92JPuS4TOWY\nTOWYdHcWcHpmnt3rDv0+8xgDzul4vrqUvUZmbge2L1anmhQRD2XmcNP96DeOy1SOyVSOSXdlXNbU\n7NPv97Z6EFgbEedGxKnAVcDuhvskSQOvr2cemTkRETcAe2lfqrsjMx9vuFuSNPD6OjwAMnMPsKfp\nfvSRgViemwPHZSrHZCrHpLvqcenrE+aSpP7U7+c8JEl9yPBYIiLiIxHxeET8X0QMT3rtxogYjYin\nI+KSpvrYpIj4h4gYi4iflMdlTfepKRGxsXwWRiNiW9P96RcRcTAiHi2fj4ea7k8TImJHRByJiMc6\nys6MiH0R8Uz5c0UvxzI8lo7HgD8HftBZGBHraF+F9k5gI/C1cluXQXRrZp5XHgN5nqzjlj6XAuuA\nj5bPiNr+uHw+BvVy3Tto/5zotA24PzPXAveX57MyPJaIzHwyM7t9+XETcFdmvpqZPwVGad/WRYPp\nxC19MvM3wPFb+khk5g+Ao5OKNwE7y/ZO4IpejmV4LH3dbuGyqqG+NO2GiHikTM17mnovQ34eppfA\n9yLi4XJXCrUNZebhsv08MNTLTn1/qe4giYj/An6/y0t/l5n3LHZ/+s1M4wPcBtxM+wfEzcCXgL9Y\nvN5pCXh/Zo5FxNuBfRHxVPlNXEVmZkT0dAmu4dFHMvNP5rBbT7dwWQ56HZ+I+FfgOwvcnX41MJ+H\nWpk5Vv48EhF3017iMzzghYhYmZmHI2IlcKSXnVy2Wvp2A1dFxBsj4lxgLfBAw31adOVDf9yHaF9g\nMIi8pU8XEXF6RLz5+DZwMYP7GZlsN7C5bG8GelrlcOaxRETEh4B/Ac4G7o2In2TmJZn5eETsov1/\nnEwA12fmb5vsa0O+GBHn0V62Ogj8VbPdaYa39JnWEHB3RED75943MvO7zXZp8UXEN4ER4KyIOATc\nBNwC7IqI64CfAVf2dCy/YS5JquWylSSpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqoZHpKk\nav8PWhx2pwP674UAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "np.log(milan_data[\"Internet\"]).hist(bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "metric = \"Internet\"\n", + "# milan_data_mod = milan_data.set_index(pd.DatetimeIndex(milan_data['Time']))\n", + "milan_data_mod = milan_data[[\"Grid\", \"Country\", \"Time\", metric]]\n", + "milan_data_mod = milan_data_mod[np.isfinite(milan_data_mod[metric])]\n", + "milan_data_mod[\"Time\"] = pd.to_datetime(milan_data_mod[\"Time\"], unit=\"ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moments\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/site-packages/pandas/core/indexing.py:537: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " self.obj[item] = s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Log moments\n", + "Min/max\n", + "Group By\n", + " Grid Country Time min max lmin lmax \\\n", + "0 1 32 2013-11-01 12:00:00 0.001787 0.001787 -6.327044 -6.327044 \n", + "1 1 32 2013-11-02 00:00:00 0.026137 0.026137 -3.644387 -3.644387 \n", + "2 1 32 2013-11-02 18:00:00 0.026137 0.026137 -3.644387 -3.644387 \n", + "3 1 32 2013-11-03 06:00:00 0.001787 0.001787 -6.327044 -6.327044 \n", + "4 1 32 2013-11-03 12:00:00 0.001787 0.001787 -6.327044 -6.327044 \n", + "\n", + " m0 m1 m2 m3 m4 m5 lm0 \\\n", + "0 1 0.001787 0.000003 5.709522e-09 1.020469e-11 1.823894e-14 1 \n", + "1 1 0.026137 0.000683 1.785617e-05 4.667143e-07 1.219871e-08 1 \n", + "2 1 0.026137 0.000683 1.785617e-05 4.667143e-07 1.219871e-08 1 \n", + "3 1 0.001787 0.000003 5.709522e-09 1.020469e-11 1.823894e-14 1 \n", + "4 2 0.003575 0.000006 1.141904e-08 2.040937e-11 3.647788e-14 2 \n", + "\n", + " lm1 lm2 lm3 lm4 lm5 \n", + "0 -6.327044 40.031480 -253.280915 1602.519371 -10139.209808 \n", + "1 -3.644387 13.281557 -48.403137 176.399768 -642.869039 \n", + "2 -3.644387 13.281557 -48.403137 176.399768 -642.869039 \n", + "3 -6.327044 40.031480 -253.280915 1602.519371 -10139.209808 \n", + "4 -12.654087 80.062960 -506.561829 3205.038742 -20278.419617 \n", + "CPU times: user 28 s, sys: 27.3 s, total: 55.3 s\n", + "Wall time: 1min 31s\n" + ] + } + ], + "source": [ + "%%time\n", + "milan_cube = milan_data_mod.iloc[:12000000,:]\n", + "\n", + "print('Moments')\n", + "milan_cube.loc[:,'m0'] = 1\n", + "milan_cube.loc[:,'m1'] = milan_cube[metric]\n", + "milan_cube.loc[:,'m2'] = milan_cube['m1'] ** 2\n", + "milan_cube.loc[:,'m3'] = milan_cube['m1'] * milan_cube['m2']\n", + "milan_cube.loc[:,'m4'] = milan_cube['m2'] ** 2\n", + "milan_cube.loc[:,'m5'] = milan_cube['m2'] * milan_cube['m3']\n", + "\n", + "print('Log moments')\n", + "milan_cube.loc[:,'lm0'] = 1\n", + "milan_cube.loc[:,'lm1'] = np.log(milan_cube[metric])\n", + "milan_cube.loc[:,'lm2'] = milan_cube['lm1'] ** 2\n", + "milan_cube.loc[:,'lm3'] = milan_cube['lm1'] * milan_cube['lm2']\n", + "milan_cube.loc[:,'lm4'] = milan_cube['lm2'] ** 2\n", + "milan_cube.loc[:,'lm5'] = milan_cube['lm2'] * milan_cube['lm3']\n", + "\n", + "print('Min/max')\n", + "milan_cube.loc[:,'min'] = milan_cube['m1']\n", + "milan_cube.loc[:,'max'] = milan_cube['m1']\n", + "milan_cube.loc[:,'lmin'] = milan_cube['lm1']\n", + "milan_cube.loc[:,'lmax'] = milan_cube['lm1']\n", + "\n", + "print('Group By')\n", + "milan_cube = milan_cube.groupby([\"Grid\", \"Country\"] + [pd.Grouper(key=\"Time\", freq='6H')]).agg({\n", + " 'min': 'min',\n", + " 'max': 'max',\n", + " 'lmin': 'min',\n", + " 'lmax': 'max',\n", + " 'm0': 'sum',\n", + " 'm1': 'sum',\n", + " 'm2': 'sum',\n", + " 'm3': 'sum',\n", + " 'm4': 'sum',\n", + " 'm5': 'sum',\n", + " 'lm0': 'sum',\n", + " 'lm1': 'sum',\n", + " 'lm2': 'sum',\n", + " 'lm3': 'sum',\n", + " 'lm4': 'sum',\n", + " 'lm5': 'sum'\n", + "}).reset_index(col_level=0)\n", + "milan_cube.columns = milan_cube.columns.get_level_values(0)\n", + "print(milan_cube.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12000000" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "milan_cube['m0'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# # Old cube creation\n", + "# milan_cube = milan_data_mod.groupby([\"Grid\", \"Country\"] + [pd.Grouper(key=\"Time\", freq='H')]).agg({metric: [\n", + "# 'min',\n", + "# 'max',\n", + "# log_min(),\n", + "# log_max(),\n", + "# moment(0),\n", + "# moment(1),\n", + "# moment(2),\n", + "# moment(3),\n", + "# moment(4),\n", + "# moment(5),\n", + "# log_moment(0),\n", + "# log_moment(1),\n", + "# log_moment(2),\n", + "# log_moment(3),\n", + "# log_moment(4),\n", + "# log_moment(5)\n", + "# ]}).reset_index(col_level=1)\n", + "# milan_cube.columns = milan_cube.columns.get_level_values(1)\n", + "# milan_cube" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Grid 10000\n", + "Country 133\n", + "Time 21\n", + "min 257642\n", + "max 369555\n", + "lmin 257556\n", + "lmax 369348\n", + "m0 36\n", + "m1 716164\n", + "m2 750658\n", + "m3 758154\n", + "m4 759665\n", + "m5 759261\n", + "lm0 36\n", + "lm1 756516\n", + "lm2 757599\n", + "lm3 757900\n", + "lm4 758072\n", + "lm5 758419\n", + "dtype: int64" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "milan_cube.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "milan_cube.to_csv('~/msketch/javamsketch/src/test/resources/milan_12M_6H_cubed.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2min 47s, sys: 37.5 s, total: 3min 24s\n", + "Wall time: 4min 43s\n" + ] + } + ], + "source": [ + "%%time\n", + "milan_grouped = milan_data_mod.iloc[:12000000,:]\n", + "milan_grouped = milan_grouped.groupby([\"Grid\", \"Country\"] + [pd.Grouper(key=\"Time\", freq='6H')])\n", + "milan_grouped = milan_grouped[metric].apply(list).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Grid 1548982\n", + "Country 1548982\n", + "Time 1548982\n", + "Internet 1548982\n", + "dtype: int64" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "milan_grouped.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "milan_grouped.to_csv('~/msketch/javamsketch/src/test/resources/milan_grouped_12M_6H.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# t1 = milan_data[metric].quantile(0.99)\n", + "# t5 = milan_data[metric].quantile(0.95)\n", + "# t10 = milan_data[metric].quantile(0.90)\n", + "# milan_oracle = milan_data.groupby([\"Grid\", \"Country\"]).agg({metric: [\n", + "# 'count',\n", + "# outliers(t1, \"1\"),\n", + "# outliers(t5, \"5\"),\n", + "# outliers(t10, \"10\")\n", + "# ]}).reset_index(col_level=1)\n", + "# milan_oracle.columns = milan_oracle.columns.get_level_values(1)\n", + "# milan_oracle" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "milan_oracle.to_csv('lib/src/test/resources/milan_oracle_cubed.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sample" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "sample = pd.read_csv('~/msketch/javamsketch/src/test/resources/sample.csv')\n", + "sample = sample.groupby([\"location\", \"version\"])\n", + "sample = sample[\"usage\"].apply(list).reset_index()\n", + "sample.to_csv('~/msketch/javamsketch/src/test/resources/sample_grouped.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
usagelatencylocationversion
68646.40787AUSv3
71046.49976UKv2
56346.54429AUSv4
46346.61684AUSv4
8446.69209AUSv3
78246.82514RUSv4
2346.88836AUSv3
65447.17317CANv2
31247.86680RUSv4
68947.89642AUSv3
33049.22464RUSv4
33150.04735RUSv4
\n", + "
" + ], + "text/plain": [ + " usage latency location version\n", + "686 46.40 787 AUS v3\n", + "710 46.49 976 UK v2\n", + "563 46.54 429 AUS v4\n", + "463 46.61 684 AUS v4\n", + "84 46.69 209 AUS v3\n", + "782 46.82 514 RUS v4\n", + "23 46.88 836 AUS v3\n", + "654 47.17 317 CAN v2\n", + "312 47.86 680 RUS v4\n", + "689 47.89 642 AUS v3\n", + "330 49.22 464 RUS v4\n", + "331 50.04 735 RUS v4" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample = pd.read_csv('~/msketch/javamsketch/src/test/resources/sample.csv')\n", + "cutoff = sample[\"usage\"].quantile(0.99)\n", + "sample[sample[\"usage\"] >= 46.4].sort_values([\"usage\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
usagelatency
locationversion
AUSv3150150
v45050
CANv15050
v2150150
v32020
RUSv4200200
UKv2100100
v3100100
USAv1200200
\n", + "
" + ], + "text/plain": [ + " usage latency\n", + "location version \n", + "AUS v3 150 150\n", + " v4 50 50\n", + "CAN v1 50 50\n", + " v2 150 150\n", + " v3 20 20\n", + "RUS v4 200 200\n", + "UK v2 100 100\n", + " v3 100 100\n", + "USA v1 200 200" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample = pd.read_csv('~/msketch/javamsketch/src/test/resources/sample.csv')\n", + "sample.groupby([\"location\", \"version\"]).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Wikipedia" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "wiki_data = pd.read_json('~/Downloads/wikipedia-2015-09-12', lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "metric = \"delta\"\n", + "attributes = [\"channel\", \"countryName\", \"namespace\"]\n", + "wiki_data = wiki_data[attributes + [metric]]\n", + "wiki_data = wiki_data[(np.isfinite(wiki_data[metric])) & (wiki_data[metric] != 0)]\n", + "wiki_data[metric] = np.log(np.absolute(wiki_data[metric]))" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFf1JREFUeJzt3X+w3XWd3/Hna4msLK4GVnuHJkzDjBkdFqriHYi13bnK\nigEdwx+ug0MlWGr+EK3bYUZDOx2m/uiw03ZdaV07GckStlSWYXXICBoz0TNOZ4oCakFAyy2iJA3g\nGn5spKvN7rt/nE/wmO+9ueee3JtzLnk+Zs7c7/f9/Xy+9/PhhPO63x/nnFQVkiQN+o1xD0CSNHkM\nB0lSh+EgSeowHCRJHYaDJKnDcJAkdRgOkqQOw0GS1GE4SJI6Vo17AKN65StfWevWrRup789//nNO\nPfXUpR3QceYcJoNzmAzOYTj33XffX1XVq4Zpu2LDYd26ddx7770j9e31eszMzCztgI4z5zAZnMNk\ncA7DSfLjYdt6WkmS1GE4SJI6DAdJUofhIEnqMBwkSR2GgySpw3CQJHUYDpKkDsNBktSxYt8hraW3\nbuudLyw/dv07xjgSSePmkYMkqcNwkCR1GA6SpI6hwiHJ6iS3J/lBkoeTvCnJ6Ul2J3mk/TyttU2S\nG5LMJrk/yXkD+9nc2j+SZPNA/Y1JHmh9bkiSpZ+qJGlYwx45fAb4alW9Fngd8DCwFdhTVeuBPW0d\n4GJgfXtsAT4HkOR04DrgAuB84LrDgdLafGCg38Zjm5Yk6VgsGA5JXgH8HnAjQFX9sqqeATYBO1qz\nHcClbXkTcHP13Q2sTnIG8HZgd1UdqKqngd3Axrbt5VV1d1UVcPPAviRJYzDMkcNZwE+BP0vy3SSf\nT3IqMFVV+1ubJ4CptrwGeHyg/95WO1p97xx1SdKYDPM+h1XAecCHq+pbST7Dr04hAVBVlaSWY4CD\nkmyhf6qKqakper3eSPs5ePDgyH0nxXLM4ZpzD72wfDz++/g8TAbnMBkmbQ7DhMNeYG9Vfaut304/\nHJ5MckZV7W+nhp5q2/cBZw70X9tq+4CZI+q9Vl87R/uOqtoGbAOYnp6uUb9Sz68UnNuVg2+Cu3xp\n9z0Xn4fJ4Bwmw6TNYcHTSlX1BPB4kte00oXAQ8BO4PAdR5uBO9ryTuCKdtfSBuDZdvppF3BRktPa\nheiLgF1t23NJNrS7lK4Y2JckaQyG/fiMDwO3JDkZeBR4P/1guS3JVcCPgfe0tncBlwCzwPOtLVV1\nIMkngHtau49X1YG2/EHgJuAU4CvtIUkak6HCoaq+B0zPsenCOdoWcPU8+9kObJ+jfi9wzjBjkSQt\nP98hLUnqMBwkSR2GgySpw3CQJHUYDpKkDsNBktRhOEiSOgwHSVKH4SBJ6jAcJEkdhoMkqcNwkCR1\nGA6SpA7DQZLUYThIkjoMB0lSh+EgSeowHCRJHYaDJKnDcJAkdRgOkqQOw0GS1GE4SJI6DAdJUsdQ\n4ZDksSQPJPlekntb7fQku5M80n6e1upJckOS2ST3JzlvYD+bW/tHkmweqL+x7X+29c1ST1SSNLzF\nHDm8papeX1XTbX0rsKeq1gN72jrAxcD69tgCfA76YQJcB1wAnA9cdzhQWpsPDPTbOPKMJEnH7FhO\nK20CdrTlHcClA/Wbq+9uYHWSM4C3A7ur6kBVPQ3sBja2bS+vqrurqoCbB/YlSRqDYcOhgK8luS/J\nllabqqr9bfkJYKotrwEeH+i7t9WOVt87R12SNCarhmz3j6tqX5K/B+xO8oPBjVVVSWrph/frWjBt\nAZiamqLX6420n4MHD47cd1IsxxyuOffQC8vH47+Pz8NkcA6TYdLmMFQ4VNW+9vOpJF+if83gySRn\nVNX+dmroqdZ8H3DmQPe1rbYPmDmi3mv1tXO0n2sc24BtANPT0zUzMzNXswX1ej1G7TsplmMOV269\n84Xlxy5f2n3PxedhMjiHyTBpc1jwtFKSU5P89uFl4CLg+8BO4PAdR5uBO9ryTuCKdtfSBuDZdvpp\nF3BRktPaheiLgF1t23NJNrS7lK4Y2JckaQyGOXKYAr7U7i5dBfy3qvpqknuA25JcBfwYeE9rfxdw\nCTALPA+8H6CqDiT5BHBPa/fxqjrQlj8I3AScAnylPSRJY7JgOFTVo8Dr5qj/DLhwjnoBV8+zr+3A\n9jnq9wLnDDFeSdJx4DukJUkdhoMkqcNwkCR1GA6SpA7DQZLUYThIkjoMB0lSh+EgSeowHCRJHYaD\nJKnDcJAkdQz7fQ7SUa0b/Ljv698xxpFIWgoeOUiSOgwHSVKH4SBJ6jAcJEkdhoMkqcNwkCR1GA6S\npA7DQZLUYThIkjoMB0lSh+EgSeowHCRJHUOHQ5KTknw3yZfb+llJvpVkNslfJDm51X+zrc+27esG\n9nFtq/8wydsH6htbbTbJ1qWbniRpFIs5cvgI8PDA+h8Bn66qVwNPA1e1+lXA063+6daOJGcDlwG/\nC2wE/rQFzknAZ4GLgbOB97a2kqQxGSockqwF3gF8vq0HeCtwe2uyA7i0LW9q67TtF7b2m4Bbq+oX\nVfUjYBY4vz1mq+rRqvolcGtrK0kak2GPHP4E+Cjwd239d4BnqupQW98LrGnLa4DHAdr2Z1v7F+pH\n9JmvLkkakwW/7CfJO4Gnquq+JDPLP6SjjmULsAVgamqKXq830n4OHjw4ct9JsRxzuObcQy8sL3bf\no/T1eZgMzmEyTNochvkmuDcD70pyCfBS4OXAZ4DVSVa1o4O1wL7Wfh9wJrA3ySrgFcDPBuqHDfaZ\nr/5rqmobsA1genq6ZmZmhhh+V6/XY9S+k2I55nDl4Le5Xb64fY/S1+dhMjiHyTBpc1jwtFJVXVtV\na6tqHf0Lyl+vqsuBbwDvbs02A3e05Z1tnbb961VVrX5Zu5vpLGA98G3gHmB9u/vp5PY7di7J7CRJ\nIzmW75D+GHBrkk8C3wVubPUbgT9PMgscoP9iT1U9mOQ24CHgEHB1Vf0tQJIPAbuAk4DtVfXgMYxL\nknSMFhUOVdUDem35Ufp3Gh3Z5m+AP5in/6eAT81Rvwu4azFjkSQtH98hLUnqMBwkSR2GgySpw3CQ\nJHUYDpKkDsNBktRhOEiSOgwHSVKH4SBJ6jAcJEkdhoMkqcNwkCR1GA6SpA7DQZLUYThIkjoMB0lS\nh+EgSeowHCRJHYaDJKnDcJAkdRgOkqQOw0GS1GE4SJI6DAdJUofhIEnqWDAckrw0ybeT/M8kDyb5\nt61+VpJvJZlN8hdJTm7132zrs237uoF9XdvqP0zy9oH6xlabTbJ16acpSVqMYY4cfgG8tapeB7we\n2JhkA/BHwKer6tXA08BVrf1VwNOt/unWjiRnA5cBvwtsBP40yUlJTgI+C1wMnA28t7WVJI3JguFQ\nfQfb6kvao4C3Are3+g7g0ra8qa3Ttl+YJK1+a1X9oqp+BMwC57fHbFU9WlW/BG5tbSVJY7JqmEbt\nr/v7gFfT/yv/fwPPVNWh1mQvsKYtrwEeB6iqQ0meBX6n1e8e2O1gn8ePqF8wzzi2AFsApqam6PV6\nwwy/4+DBgyP3nRTLMYdrzj30wvJi9z1KX5+HyeAcJsOkzWGocKiqvwVen2Q18CXgtcs6qvnHsQ3Y\nBjA9PV0zMzMj7afX6zFq30mxHHO4cuudLyw/dvni9j1KX5+HyeAcJsOkzWFRdytV1TPAN4A3AauT\nHA6XtcC+trwPOBOgbX8F8LPB+hF95qtLksZkmLuVXtWOGEhyCvA24GH6IfHu1mwzcEdb3tnWadu/\nXlXV6pe1u5nOAtYD3wbuAda3u59Opn/ReudSTE6SNJphTiudAexo1x1+A7itqr6c5CHg1iSfBL4L\n3Nja3wj8eZJZ4AD9F3uq6sEktwEPAYeAq9vpKpJ8CNgFnARsr6oHl2yGkqRFWzAcqup+4A1z1B+l\nf6fRkfW/Af5gnn19CvjUHPW7gLuGGK8k6TjwHdKSpA7DQZLUYThIkjoMB0lSh+EgSeowHCRJHYaD\nJKnDcJAkdRgOkqQOw0GS1GE4SJI6DAdJUofhIEnqMBwkSR2GgySpw3CQJHUYDpKkDsNBktRhOEiS\nOgwHSVKH4SBJ6jAcJEkdhoMkqcNwkCR1LBgOSc5M8o0kDyV5MMlHWv30JLuTPNJ+ntbqSXJDktkk\n9yc5b2Bfm1v7R5JsHqi/MckDrc8NSbIck5UkDWeYI4dDwDVVdTawAbg6ydnAVmBPVa0H9rR1gIuB\n9e2xBfgc9MMEuA64ADgfuO5woLQ2Hxjot/HYpyZJGtWC4VBV+6vqO235r4GHgTXAJmBHa7YDuLQt\nbwJurr67gdVJzgDeDuyuqgNV9TSwG9jYtr28qu6uqgJuHtiXJGkM0n89HrJxsg74JnAO8JOqWt3q\nAZ6uqtVJvgxcX1X/vW3bA3wMmAFeWlWfbPV/A/xfoNfa/36r/xPgY1X1zjl+/xb6RyNMTU298dZb\nb138jIGDBw/yspe9bKS+k2I55vDAvmdfWD53zSuWva/Pw2RwDpPheMzhLW95y31VNT1M21XD7jTJ\ny4C/BP6wqp4bvCxQVZVk+JQZUVVtA7YBTE9P18zMzEj76fV6jNp3UizHHK7ceucLy49dvrh9j9LX\n52EyOIfJMGlzGOpupSQvoR8Mt1TVF1v5yXZKiPbzqVbfB5w50H1tqx2tvnaOuiRpTIa5WynAjcDD\nVfXHA5t2AofvONoM3DFQv6LdtbQBeLaq9gO7gIuSnNYuRF8E7Grbnkuyof2uKwb2JUkag2FOK70Z\neB/wQJLvtdq/Aq4HbktyFfBj4D1t213AJcAs8DzwfoCqOpDkE8A9rd3Hq+pAW/4gcBNwCvCV9pAk\njcmC4dAuLM/3voML52hfwNXz7Gs7sH2O+r30L3JLkiaA75CWJHUYDpKkDsNBktRhOEiSOgwHSVKH\n4SBJ6jAcJEkdJ2Q4PLDvWdZtvZN1A58HJEn6lRMyHCRJR2c4SJI6DAdJUofhIEnqMBwkSR2GgySp\nw3CQJHUYDpKkDsNBktRhOEiSOgwHSVKH4SBJ6jAcJEkdhoMkqcNwkCR1GA6SpI4FwyHJ9iRPJfn+\nQO30JLuTPNJ+ntbqSXJDktkk9yc5b6DP5tb+kSSbB+pvTPJA63NDkiz1JHV8Hf4iJb9MSVq5hjly\nuAnYeERtK7CnqtYDe9o6wMXA+vbYAnwO+mECXAdcAJwPXHc4UFqbDwz0O/J3SZKOs1ULNaiqbyZZ\nd0R5EzDTlncAPeBjrX5zVRVwd5LVSc5obXdX1QGAJLuBjUl6wMur6u5Wvxm4FPjKsUxqOQ3+NfzY\n9e8Y40gkafmMes1hqqr2t+UngKm2vAZ4fKDd3lY7Wn3vHHVJ0hgteOSwkKqqJLUUg1lIki30T1cx\nNTVFr9cbaT9Tp8A15x4CWPQ+Dvcbpe9SOnjw4JL//mOZ22DfQUfbz3LM4XhzDpPBOSy9UcPhySRn\nVNX+dtroqVbfB5w50G5tq+3jV6ehDtd7rb52jvZzqqptwDaA6enpmpmZma/pUf2nW+7gPz7Qn/pj\nly9uH1cOnlZaZN+l1Ov1GHX+8zmWuV05z8Xno+1nOeZwvDmHyeAclt6op5V2AofvONoM3DFQv6Ld\ntbQBeLadftoFXJTktHYh+iJgV9v2XJIN7S6lKwb2JUkakwWPHJJ8gf5f/a9Mspf+XUfXA7cluQr4\nMfCe1vwu4BJgFngeeD9AVR1I8gngntbu44cvTgMfpH9H1Cn0L0RP7MXolcAL5pKWwjB3K713nk0X\nztG2gKvn2c92YPsc9XuBcxYahyaP72OQXrx8h7QkqcNwkCR1HPOtrBqN1wYkTTLDQceNgSitHJ5W\nWoHWbb2TB/Y9uyI+3G6+D+EbnIOkyeORg+bkX/nSic1wOAEt9oXfoJBOPIbDMlhJL6ae1pE0F8NB\ni2KYSCcGw2GCLeURyCS/qK+kIy3pROHdSpKkDo8cNFHmO4pYrqMoj1SkuRkOmljznQobJkCO3CZp\ncQwHrWhHu5YyyddZpElnOEwAT3OMzzAB4nOiE5Hh8CLgX8jLa77/voaGXsy8W0kakZ8PpRczjxxW\niFFegHzROn48NagXG48cJEkdHjlIS8yjCL0YGA7SMvJuKK1UhoM0Zou9NmSY6HgwHKQV5nh8xIhk\nOEgrmHekablMTDgk2Qh8BjgJ+HxVXT/mIUkr1tGOIpbqCMMjlRe3iQiHJCcBnwXeBuwF7kmys6oe\nGu/IpJVvoc+fuubcQ1y59U5f4PVrJiIcgPOB2ap6FCDJrcAmwHCQjhNPUWnQpITDGuDxgfW9wAVj\nGoukRVqOYPFIZrxSVeMeA0neDWysqn/e1t8HXFBVHzqi3RZgS1t9DfDDEX/lK4G/GrHvpHAOk8E5\nTAbnMJx/UFWvGqbhpBw57APOHFhf22q/pqq2AduO9Zclubeqpo91P+PkHCaDc5gMzmHpTcpnK90D\nrE9yVpKTgcuAnWMekySdsCbiyKGqDiX5ELCL/q2s26vqwTEPS5JOWBMRDgBVdRdw13H6dcd8amoC\nOIfJ4Bwmg3NYYhNxQVqSNFkm5ZqDJGmCnFDhkGRjkh8mmU2yddzjWawkZyb5RpKHkjyY5CPjHtOo\nkpyU5LtJvjzusYwqyeoktyf5QZKHk7xp3GNarCT/sv1b+n6SLyR56bjHtJAk25M8leT7A7XTk+xO\n8kj7edo4x7iQeebw79u/pfuTfCnJ6nGO8YQJh4GP6LgYOBt4b5KzxzuqRTsEXFNVZwMbgKtX4BwO\n+wjw8LgHcYw+A3y1ql4LvI4VNp8ka4B/AUxX1Tn0bwa5bLyjGspNwMYjaluBPVW1HtjT1ifZTXTn\nsBs4p6r+IfC/gGuP96AGnTDhwMBHdFTVL4HDH9GxYlTV/qr6Tlv+a/ovRmvGO6rFS7IWeAfw+XGP\nZVRJXgH8HnAjQFX9sqqeGe+oRrIKOCXJKuC3gP8z5vEsqKq+CRw4orwJ2NGWdwCXHtdBLdJcc6iq\nr1XVobZ6N/33e43NiRQOc31Ex4p7YT0syTrgDcC3xjuSkfwJ8FHg78Y9kGNwFvBT4M/a6bHPJzl1\n3INajKraB/wH4CfAfuDZqvraeEc1sqmq2t+WnwCmxjmYJfDPgK+McwAnUji8aCR5GfCXwB9W1XPj\nHs9iJHkn8FRV3TfusRyjVcB5wOeq6g3Az5n8Uxm/pp2X30Q/6P4+cGqSfzreUR276t+CuWJvw0zy\nr+mfQr5lnOM4kcJhqI/omHRJXkI/GG6pqi+OezwjeDPwriSP0T+199Yk/3W8QxrJXmBvVR0+crud\nflisJL8P/KiqflpV/w/4IvCPxjymUT2Z5AyA9vOpMY9nJEmuBN4JXF5jfp/BiRQOK/4jOpKE/jnu\nh6vqj8c9nlFU1bVVtbaq1tF/Dr5eVSvur9WqegJ4PMlrWulCVt5HzP8E2JDkt9q/rQtZYRfVB+wE\nNrflzcAdYxzLSNoXnn0UeFdVPT/u8Zww4dAu9Bz+iI6HgdtW4Ed0vBl4H/2/tr/XHpeMe1AnsA8D\ntyS5H3g98O/GPJ5FaUc9twPfAR6g/3owUe/SnUuSLwD/A3hNkr1JrgKuB96W5BH6R0QT/U2S88zh\nPwO/Dexu/2//l7GO0XdIS5KOdMIcOUiShmc4SJI6DAdJUofhIEnqMBwkSR2GgySpw3CQJHUYDpKk\njv8PKv7S972dhRMAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wiki_data[\"delta\"].hist(bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
channelcountryNamenamespaceminmaxm0m1m2m3m4m5m6m7m8
0#ar.wikipediaAlgeriaMain0.0000006.66695718.057.644747234.8847851108.0079785842.0839793.325478e+041.988641e+051.226715e+067.717880e+06
1#ar.wikipediaAlgeriaنقاش5.2678585.2678581.05.26785827.750330146.184800770.0807924.056676e+032.137000e+041.125741e+055.930244e+05
2#ar.wikipediaAustraliaMain5.5645205.5645201.05.56452030.963887172.299183958.7623215.335052e+032.968701e+041.651940e+059.192252e+05
3#ar.wikipediaBahrainMain3.7841903.7841901.03.78419014.32009154.189941205.0650127.760049e+022.936550e+031.111246e+044.205166e+04
4#ar.wikipediaBelgiumMain5.4595865.4595861.05.45958629.807074162.734269888.4616604.850632e+032.648244e+041.445832e+057.893641e+05
5#ar.wikipediaDenmarkويكيبيديا6.0306856.0306851.06.03068536.369165219.3309861322.7161427.976885e+034.810608e+042.901126e+051.749578e+06
6#ar.wikipediaEgyptMain0.6931478.66406028.0124.659483680.8941254142.01872026951.8032941.839678e+051.302402e+069.492862e+067.087854e+07
7#ar.wikipediaHashemite Kingdom of JordanMain0.6931475.1647868.019.51721159.503557216.944287908.0317064.171502e+032.025181e+041.013195e+055.148902e+05
8#ar.wikipediaHashemite Kingdom of Jordanنقاش المستخدم4.1271344.1271341.04.12713417.03323870.298463290.1312051.197410e+034.941874e+032.039578e+048.417612e+04
9#ar.wikipediaIranMain0.6931470.6931471.00.6931470.4804530.3330250.2308351.600027e-011.109054e-017.687378e-025.328484e-02
10#ar.wikipediaIraqMain0.0000006.52062119.061.367666287.3481811524.9356218645.0854035.101748e+043.091354e+051.907096e+061.191112e+07
11#ar.wikipediaIraqنقاش5.9269266.2785212.012.20544774.548283455.7019922787.9312461.707021e+041.046042e+056.415184e+053.937453e+06
12#ar.wikipediaIsraelMain0.0000005.4424184.013.75749564.504332308.8495821507.5222677.485986e+033.773624e+041.926923e+059.947230e+05
13#ar.wikipediaKuwaitMain0.6931475.4722719.027.385355112.181294525.6303622608.2142371.329461e+046.875724e+043.588217e+051.884294e+06
14#ar.wikipediaKuwaitنقاش المستخدم6.2728776.2728771.06.27287739.348986246.8313491548.3426949.712563e+036.092571e+043.821795e+052.397365e+06
15#ar.wikipediaLebanonMain5.2364425.9964522.011.23289463.377762359.2019902044.8115141.169018e+046.710735e+043.867371e+052.237002e+06
16#ar.wikipediaLibyaMain4.4886367.4725013.019.227964128.792904891.4256476312.4042064.538466e+043.295322e+052.407739e+061.766220e+07
17#ar.wikipediaMoroccoMain0.0000007.75405313.045.491987239.3203051502.90043110204.6948097.174515e+045.133805e+053.712545e+062.704703e+07
18#ar.wikipediaMoroccoبوابة5.5214615.5214611.05.52146130.486531168.330188929.4285525.131803e+032.833505e+041.564509e+058.638374e+05
19#ar.wikipediaMoroccoقالب4.3820274.3820271.04.38202719.20215784.144365368.7228501.615753e+037.080274e+033.102595e+041.359565e+05
20#ar.wikipediaMoroccoنقاش6.6669576.6669571.06.66695744.448313296.3349811975.6525171.317159e+048.781442e+045.854550e+053.903203e+06
21#ar.wikipediaMoroccoنقاش المستخدم5.6454475.6454471.05.64544731.871071179.9264371015.7651465.734448e+033.237352e+041.827630e+051.031779e+06
22#ar.wikipediaMoroccoويكيبيديا8.7119378.7119371.08.71193775.897851661.2173165760.4837815.018497e+044.372083e+053.808932e+063.318317e+07
23#ar.wikipediaOmanMain1.3862943.8066622.05.19295716.41249157.825323213.6731488.044423e+023.049848e+031.159256e+044.410515e+04
24#ar.wikipediaPalestineMain0.0000003.4657364.06.93147224.02265183.256163288.5438731.000017e+033.465794e+031.201153e+044.162878e+04
25#ar.wikipediaQatarMain2.1972254.8040213.09.48615234.081175136.821624594.0579052.704686e+031.264015e+045.988422e+042.856838e+05
26#ar.wikipediaSaudi ArabiaMain0.0000008.31139897.0308.7907381375.6009107038.33958439448.0363222.366367e+051.497962e+069.907004e+066.793387e+07
27#ar.wikipediaSaudi Arabiaمستخدم3.3672963.3672961.03.36729611.33868138.180694128.5656924.329187e+021.457765e+034.908727e+031.652914e+04
28#ar.wikipediaSaudi Arabiaويكيبيديا3.6109183.6109181.03.61091813.03872847.081777170.0084326.138865e+022.216694e+038.004299e+032.890287e+04
29#ar.wikipediaSudanMain4.4773374.9767342.09.45407144.814424213.0182721015.3117824.852247e+032.324979e+041.116849e+055.378129e+05
.............................................
994#zh.wikipediaChinaMain1.0986126.68959922.080.418326342.5176851597.8340837929.1124354.128033e+042.237338e+051.255615e+067.263910e+06
995#zh.wikipediaChinaWikipedia1.6094386.0450054.019.648868111.067428656.4989703929.6372042.360291e+041.419026e+058.533656e+055.132418e+06
996#zh.wikipediaCzech RepublicMain1.0986124.8598123.09.56934337.863454163.185710729.2645303.326287e+031.539243e+047.202931e+043.400451e+05
997#zh.wikipediaFinlandMain3.3322053.3322051.03.33220511.10358736.999422123.2896424.108263e+021.368957e+034.561646e+031.520034e+04
998#zh.wikipediaFranceMain3.6635625.1239642.08.78752639.676691183.700876869.4669854.192038e+032.051605e+041.015925e+055.076205e+05
999#zh.wikipediaGermanyMain3.1354943.1354941.03.1354949.83132430.82605996.6549313.030610e+029.502459e+022.979491e+039.342176e+03
1000#zh.wikipediaHong KongMain0.0000009.431322440.01541.1997426864.10377534431.390925189571.3545391.133382e+067.309490e+065.048145e+073.699213e+08
1001#zh.wikipediaHong KongTemplate2.0794423.5835192.05.66296017.16568555.009811183.6045396.298276e+022.198520e+037.756832e+032.754389e+04
1002#zh.wikipediaIsraelMain6.2653016.2653011.06.26530139.253999245.9381291540.8764609.654055e+036.048556e+043.789603e+052.374300e+06
1003#zh.wikipediaItalyMain5.2470245.2470241.05.24702427.531262144.457192757.9703663.977089e+032.086788e+041.094943e+055.745191e+05
1004#zh.wikipediaJapanMain0.0000008.80522512.051.071233279.2365251743.36715412017.8736048.883793e+046.890841e+055.526233e+064.537615e+07
1005#zh.wikipediaJapanTalk3.8066623.8066621.03.80666214.49067955.161125209.9797877.993222e+023.042750e+031.158272e+044.409151e+04
1006#zh.wikipediaMacaoMain1.3862946.29526612.048.543764228.3448491169.9633356287.6314183.476625e+041.958443e+051.118023e+066.448414e+06
1007#zh.wikipediaMacaoWikipedia5.3706385.3706381.05.37063828.843753154.909356831.9620774.468167e+032.399691e+041.288787e+056.921609e+05
1008#zh.wikipediaMalaysiaMain0.0000006.10924845.0127.018209459.1996561821.1409377705.1503443.435583e+041.604540e+057.817490e+053.958450e+06
1009#zh.wikipediaNew ZealandMain2.4849073.9512442.06.43615021.78708877.031814281.8724261.057839e+034.040852e+031.562117e+046.086522e+04
1010#zh.wikipediaPortugalMain2.4849075.1873862.07.67229233.083733154.930922762.2204213.850892e+031.972002e+041.016591e+055.257640e+05
1011#zh.wikipediaRepublic of KoreaMain3.0910423.0910421.03.0910429.55454329.53349991.2893012.821791e+028.722276e+022.696093e+038.333736e+03
1012#zh.wikipediaSingaporeMain1.3862944.98360712.032.615067106.962952399.0854201608.9950056.796709e+032.960535e+041.318929e+055.982321e+05
1013#zh.wikipediaTaiwanFile0.6931475.3981633.07.18992230.827563158.962321850.8365184.585605e+032.474621e+041.335760e+057.210561e+05
1014#zh.wikipediaTaiwanMain0.0000008.348775659.02209.9134549248.88059143555.882096222371.0875421.205020e+066.842703e+064.039094e+072.465605e+08
1015#zh.wikipediaTaiwanTalk2.5649496.9847168.033.299343158.796546848.4365674912.1874782.994611e+041.885966e+051.213369e+067.924473e+06
1016#zh.wikipediaTaiwanTemplate2.3025854.1271342.06.42971922.33513682.506535318.2413281.262136e+035.090911e+032.073895e+048.496629e+04
1017#zh.wikipediaTaiwanUser3.5553483.5553481.03.55534812.64050044.941377159.7822365.680815e+022.019727e+037.180834e+032.553036e+04
1018#zh.wikipediaTaiwanUser talk6.0282796.0282791.06.02827936.340142219.0684971320.6059157.960980e+034.799101e+042.893032e+051.744000e+06
1019#zh.wikipediaTaiwanWikipedia5.8435447.0596187.045.332490294.7324101923.80860712607.0441898.294237e+045.478200e+053.632255e+062.417480e+07
1020#zh.wikipediaUnited KingdomMain2.5649492.7725892.05.33753814.26621338.188290102.3765682.748609e+027.390245e+021.989885e+035.365475e+03
1021#zh.wikipediaUnited StatesMain0.0000007.19893139.0129.906500595.1102903138.03526117972.9644911.084415e+056.770740e+054.328078e+062.814110e+07
1022#zh.wikipediaUnited StatesTemplate2.9957322.9957321.02.9957328.97441226.88493580.5400682.412765e+027.227997e+022.165315e+036.486703e+03
1023#zh.wikipediaVietnamMain2.0794425.2832045.018.00348873.970915332.1140261572.0943107.672361e+033.813046e+041.916971e+059.712955e+05
\n", + "

1024 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " channel countryName namespace min \\\n", + "0 #ar.wikipedia Algeria Main 0.000000 \n", + "1 #ar.wikipedia Algeria نقاش 5.267858 \n", + "2 #ar.wikipedia Australia Main 5.564520 \n", + "3 #ar.wikipedia Bahrain Main 3.784190 \n", + "4 #ar.wikipedia Belgium Main 5.459586 \n", + "5 #ar.wikipedia Denmark ويكيبيديا 6.030685 \n", + "6 #ar.wikipedia Egypt Main 0.693147 \n", + "7 #ar.wikipedia Hashemite Kingdom of Jordan Main 0.693147 \n", + "8 #ar.wikipedia Hashemite Kingdom of Jordan نقاش المستخدم 4.127134 \n", + "9 #ar.wikipedia Iran Main 0.693147 \n", + "10 #ar.wikipedia Iraq Main 0.000000 \n", + "11 #ar.wikipedia Iraq نقاش 5.926926 \n", + "12 #ar.wikipedia Israel Main 0.000000 \n", + "13 #ar.wikipedia Kuwait Main 0.693147 \n", + "14 #ar.wikipedia Kuwait نقاش المستخدم 6.272877 \n", + "15 #ar.wikipedia Lebanon Main 5.236442 \n", + "16 #ar.wikipedia Libya Main 4.488636 \n", + "17 #ar.wikipedia Morocco Main 0.000000 \n", + "18 #ar.wikipedia Morocco بوابة 5.521461 \n", + "19 #ar.wikipedia Morocco قالب 4.382027 \n", + "20 #ar.wikipedia Morocco نقاش 6.666957 \n", + "21 #ar.wikipedia Morocco نقاش المستخدم 5.645447 \n", + "22 #ar.wikipedia Morocco ويكيبيديا 8.711937 \n", + "23 #ar.wikipedia Oman Main 1.386294 \n", + "24 #ar.wikipedia Palestine Main 0.000000 \n", + "25 #ar.wikipedia Qatar Main 2.197225 \n", + "26 #ar.wikipedia Saudi Arabia Main 0.000000 \n", + "27 #ar.wikipedia Saudi Arabia مستخدم 3.367296 \n", + "28 #ar.wikipedia Saudi Arabia ويكيبيديا 3.610918 \n", + "29 #ar.wikipedia Sudan Main 4.477337 \n", + "... ... ... ... ... \n", + "994 #zh.wikipedia China Main 1.098612 \n", + "995 #zh.wikipedia China Wikipedia 1.609438 \n", + "996 #zh.wikipedia Czech Republic Main 1.098612 \n", + "997 #zh.wikipedia Finland Main 3.332205 \n", + "998 #zh.wikipedia France Main 3.663562 \n", + "999 #zh.wikipedia Germany Main 3.135494 \n", + "1000 #zh.wikipedia Hong Kong Main 0.000000 \n", + "1001 #zh.wikipedia Hong Kong Template 2.079442 \n", + "1002 #zh.wikipedia Israel Main 6.265301 \n", + "1003 #zh.wikipedia Italy Main 5.247024 \n", + "1004 #zh.wikipedia Japan Main 0.000000 \n", + "1005 #zh.wikipedia Japan Talk 3.806662 \n", + "1006 #zh.wikipedia Macao Main 1.386294 \n", + "1007 #zh.wikipedia Macao Wikipedia 5.370638 \n", + "1008 #zh.wikipedia Malaysia Main 0.000000 \n", + "1009 #zh.wikipedia New Zealand Main 2.484907 \n", + "1010 #zh.wikipedia Portugal Main 2.484907 \n", + "1011 #zh.wikipedia Republic of Korea Main 3.091042 \n", + "1012 #zh.wikipedia Singapore Main 1.386294 \n", + "1013 #zh.wikipedia Taiwan File 0.693147 \n", + "1014 #zh.wikipedia Taiwan Main 0.000000 \n", + "1015 #zh.wikipedia Taiwan Talk 2.564949 \n", + "1016 #zh.wikipedia Taiwan Template 2.302585 \n", + "1017 #zh.wikipedia Taiwan User 3.555348 \n", + "1018 #zh.wikipedia Taiwan User talk 6.028279 \n", + "1019 #zh.wikipedia Taiwan Wikipedia 5.843544 \n", + "1020 #zh.wikipedia United Kingdom Main 2.564949 \n", + "1021 #zh.wikipedia United States Main 0.000000 \n", + "1022 #zh.wikipedia United States Template 2.995732 \n", + "1023 #zh.wikipedia Vietnam Main 2.079442 \n", + "\n", + " max m0 m1 m2 m3 m4 \\\n", + "0 6.666957 18.0 57.644747 234.884785 1108.007978 5842.083979 \n", + "1 5.267858 1.0 5.267858 27.750330 146.184800 770.080792 \n", + "2 5.564520 1.0 5.564520 30.963887 172.299183 958.762321 \n", + "3 3.784190 1.0 3.784190 14.320091 54.189941 205.065012 \n", + "4 5.459586 1.0 5.459586 29.807074 162.734269 888.461660 \n", + "5 6.030685 1.0 6.030685 36.369165 219.330986 1322.716142 \n", + "6 8.664060 28.0 124.659483 680.894125 4142.018720 26951.803294 \n", + "7 5.164786 8.0 19.517211 59.503557 216.944287 908.031706 \n", + "8 4.127134 1.0 4.127134 17.033238 70.298463 290.131205 \n", + "9 0.693147 1.0 0.693147 0.480453 0.333025 0.230835 \n", + "10 6.520621 19.0 61.367666 287.348181 1524.935621 8645.085403 \n", + "11 6.278521 2.0 12.205447 74.548283 455.701992 2787.931246 \n", + "12 5.442418 4.0 13.757495 64.504332 308.849582 1507.522267 \n", + "13 5.472271 9.0 27.385355 112.181294 525.630362 2608.214237 \n", + "14 6.272877 1.0 6.272877 39.348986 246.831349 1548.342694 \n", + "15 5.996452 2.0 11.232894 63.377762 359.201990 2044.811514 \n", + "16 7.472501 3.0 19.227964 128.792904 891.425647 6312.404206 \n", + "17 7.754053 13.0 45.491987 239.320305 1502.900431 10204.694809 \n", + "18 5.521461 1.0 5.521461 30.486531 168.330188 929.428552 \n", + "19 4.382027 1.0 4.382027 19.202157 84.144365 368.722850 \n", + "20 6.666957 1.0 6.666957 44.448313 296.334981 1975.652517 \n", + "21 5.645447 1.0 5.645447 31.871071 179.926437 1015.765146 \n", + "22 8.711937 1.0 8.711937 75.897851 661.217316 5760.483781 \n", + "23 3.806662 2.0 5.192957 16.412491 57.825323 213.673148 \n", + "24 3.465736 4.0 6.931472 24.022651 83.256163 288.543873 \n", + "25 4.804021 3.0 9.486152 34.081175 136.821624 594.057905 \n", + "26 8.311398 97.0 308.790738 1375.600910 7038.339584 39448.036322 \n", + "27 3.367296 1.0 3.367296 11.338681 38.180694 128.565692 \n", + "28 3.610918 1.0 3.610918 13.038728 47.081777 170.008432 \n", + "29 4.976734 2.0 9.454071 44.814424 213.018272 1015.311782 \n", + "... ... ... ... ... ... ... \n", + "994 6.689599 22.0 80.418326 342.517685 1597.834083 7929.112435 \n", + "995 6.045005 4.0 19.648868 111.067428 656.498970 3929.637204 \n", + "996 4.859812 3.0 9.569343 37.863454 163.185710 729.264530 \n", + "997 3.332205 1.0 3.332205 11.103587 36.999422 123.289642 \n", + "998 5.123964 2.0 8.787526 39.676691 183.700876 869.466985 \n", + "999 3.135494 1.0 3.135494 9.831324 30.826059 96.654931 \n", + "1000 9.431322 440.0 1541.199742 6864.103775 34431.390925 189571.354539 \n", + "1001 3.583519 2.0 5.662960 17.165685 55.009811 183.604539 \n", + "1002 6.265301 1.0 6.265301 39.253999 245.938129 1540.876460 \n", + "1003 5.247024 1.0 5.247024 27.531262 144.457192 757.970366 \n", + "1004 8.805225 12.0 51.071233 279.236525 1743.367154 12017.873604 \n", + "1005 3.806662 1.0 3.806662 14.490679 55.161125 209.979787 \n", + "1006 6.295266 12.0 48.543764 228.344849 1169.963335 6287.631418 \n", + "1007 5.370638 1.0 5.370638 28.843753 154.909356 831.962077 \n", + "1008 6.109248 45.0 127.018209 459.199656 1821.140937 7705.150344 \n", + "1009 3.951244 2.0 6.436150 21.787088 77.031814 281.872426 \n", + "1010 5.187386 2.0 7.672292 33.083733 154.930922 762.220421 \n", + "1011 3.091042 1.0 3.091042 9.554543 29.533499 91.289301 \n", + "1012 4.983607 12.0 32.615067 106.962952 399.085420 1608.995005 \n", + "1013 5.398163 3.0 7.189922 30.827563 158.962321 850.836518 \n", + "1014 8.348775 659.0 2209.913454 9248.880591 43555.882096 222371.087542 \n", + "1015 6.984716 8.0 33.299343 158.796546 848.436567 4912.187478 \n", + "1016 4.127134 2.0 6.429719 22.335136 82.506535 318.241328 \n", + "1017 3.555348 1.0 3.555348 12.640500 44.941377 159.782236 \n", + "1018 6.028279 1.0 6.028279 36.340142 219.068497 1320.605915 \n", + "1019 7.059618 7.0 45.332490 294.732410 1923.808607 12607.044189 \n", + "1020 2.772589 2.0 5.337538 14.266213 38.188290 102.376568 \n", + "1021 7.198931 39.0 129.906500 595.110290 3138.035261 17972.964491 \n", + "1022 2.995732 1.0 2.995732 8.974412 26.884935 80.540068 \n", + "1023 5.283204 5.0 18.003488 73.970915 332.114026 1572.094310 \n", + "\n", + " m5 m6 m7 m8 \n", + "0 3.325478e+04 1.988641e+05 1.226715e+06 7.717880e+06 \n", + "1 4.056676e+03 2.137000e+04 1.125741e+05 5.930244e+05 \n", + "2 5.335052e+03 2.968701e+04 1.651940e+05 9.192252e+05 \n", + "3 7.760049e+02 2.936550e+03 1.111246e+04 4.205166e+04 \n", + "4 4.850632e+03 2.648244e+04 1.445832e+05 7.893641e+05 \n", + "5 7.976885e+03 4.810608e+04 2.901126e+05 1.749578e+06 \n", + "6 1.839678e+05 1.302402e+06 9.492862e+06 7.087854e+07 \n", + "7 4.171502e+03 2.025181e+04 1.013195e+05 5.148902e+05 \n", + "8 1.197410e+03 4.941874e+03 2.039578e+04 8.417612e+04 \n", + "9 1.600027e-01 1.109054e-01 7.687378e-02 5.328484e-02 \n", + "10 5.101748e+04 3.091354e+05 1.907096e+06 1.191112e+07 \n", + "11 1.707021e+04 1.046042e+05 6.415184e+05 3.937453e+06 \n", + "12 7.485986e+03 3.773624e+04 1.926923e+05 9.947230e+05 \n", + "13 1.329461e+04 6.875724e+04 3.588217e+05 1.884294e+06 \n", + "14 9.712563e+03 6.092571e+04 3.821795e+05 2.397365e+06 \n", + "15 1.169018e+04 6.710735e+04 3.867371e+05 2.237002e+06 \n", + "16 4.538466e+04 3.295322e+05 2.407739e+06 1.766220e+07 \n", + "17 7.174515e+04 5.133805e+05 3.712545e+06 2.704703e+07 \n", + "18 5.131803e+03 2.833505e+04 1.564509e+05 8.638374e+05 \n", + "19 1.615753e+03 7.080274e+03 3.102595e+04 1.359565e+05 \n", + "20 1.317159e+04 8.781442e+04 5.854550e+05 3.903203e+06 \n", + "21 5.734448e+03 3.237352e+04 1.827630e+05 1.031779e+06 \n", + "22 5.018497e+04 4.372083e+05 3.808932e+06 3.318317e+07 \n", + "23 8.044423e+02 3.049848e+03 1.159256e+04 4.410515e+04 \n", + "24 1.000017e+03 3.465794e+03 1.201153e+04 4.162878e+04 \n", + "25 2.704686e+03 1.264015e+04 5.988422e+04 2.856838e+05 \n", + "26 2.366367e+05 1.497962e+06 9.907004e+06 6.793387e+07 \n", + "27 4.329187e+02 1.457765e+03 4.908727e+03 1.652914e+04 \n", + "28 6.138865e+02 2.216694e+03 8.004299e+03 2.890287e+04 \n", + "29 4.852247e+03 2.324979e+04 1.116849e+05 5.378129e+05 \n", + "... ... ... ... ... \n", + "994 4.128033e+04 2.237338e+05 1.255615e+06 7.263910e+06 \n", + "995 2.360291e+04 1.419026e+05 8.533656e+05 5.132418e+06 \n", + "996 3.326287e+03 1.539243e+04 7.202931e+04 3.400451e+05 \n", + "997 4.108263e+02 1.368957e+03 4.561646e+03 1.520034e+04 \n", + "998 4.192038e+03 2.051605e+04 1.015925e+05 5.076205e+05 \n", + "999 3.030610e+02 9.502459e+02 2.979491e+03 9.342176e+03 \n", + "1000 1.133382e+06 7.309490e+06 5.048145e+07 3.699213e+08 \n", + "1001 6.298276e+02 2.198520e+03 7.756832e+03 2.754389e+04 \n", + "1002 9.654055e+03 6.048556e+04 3.789603e+05 2.374300e+06 \n", + "1003 3.977089e+03 2.086788e+04 1.094943e+05 5.745191e+05 \n", + "1004 8.883793e+04 6.890841e+05 5.526233e+06 4.537615e+07 \n", + "1005 7.993222e+02 3.042750e+03 1.158272e+04 4.409151e+04 \n", + "1006 3.476625e+04 1.958443e+05 1.118023e+06 6.448414e+06 \n", + "1007 4.468167e+03 2.399691e+04 1.288787e+05 6.921609e+05 \n", + "1008 3.435583e+04 1.604540e+05 7.817490e+05 3.958450e+06 \n", + "1009 1.057839e+03 4.040852e+03 1.562117e+04 6.086522e+04 \n", + "1010 3.850892e+03 1.972002e+04 1.016591e+05 5.257640e+05 \n", + "1011 2.821791e+02 8.722276e+02 2.696093e+03 8.333736e+03 \n", + "1012 6.796709e+03 2.960535e+04 1.318929e+05 5.982321e+05 \n", + "1013 4.585605e+03 2.474621e+04 1.335760e+05 7.210561e+05 \n", + "1014 1.205020e+06 6.842703e+06 4.039094e+07 2.465605e+08 \n", + "1015 2.994611e+04 1.885966e+05 1.213369e+06 7.924473e+06 \n", + "1016 1.262136e+03 5.090911e+03 2.073895e+04 8.496629e+04 \n", + "1017 5.680815e+02 2.019727e+03 7.180834e+03 2.553036e+04 \n", + "1018 7.960980e+03 4.799101e+04 2.893032e+05 1.744000e+06 \n", + "1019 8.294237e+04 5.478200e+05 3.632255e+06 2.417480e+07 \n", + "1020 2.748609e+02 7.390245e+02 1.989885e+03 5.365475e+03 \n", + "1021 1.084415e+05 6.770740e+05 4.328078e+06 2.814110e+07 \n", + "1022 2.412765e+02 7.227997e+02 2.165315e+03 6.486703e+03 \n", + "1023 7.672361e+03 3.813046e+04 1.916971e+05 9.712955e+05 \n", + "\n", + "[1024 rows x 14 columns]" + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wiki_cube = wiki_data.groupby(attributes).agg({metric: [\n", + " 'min',\n", + " 'max',\n", + " moment(0),\n", + " moment(1),\n", + " moment(2),\n", + " moment(3),\n", + " moment(4),\n", + " moment(5),\n", + " moment(6),\n", + " moment(7),\n", + " moment(8)\n", + "]}).reset_index(col_level=1)\n", + "wiki_cube.columns = wiki_cube.columns.get_level_values(1)\n", + "wiki_cube" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "wiki_cube.to_csv('lib/src/test/resources/wiki_moments_cubed.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
channelcountryNamenamespacecountoutliers1outliers5outliers10
0#ar.wikipediaAlgeriaMain180.00.02.0
1#ar.wikipediaAlgeriaنقاش10.00.00.0
2#ar.wikipediaAustraliaMain10.00.00.0
3#ar.wikipediaBahrainMain10.00.00.0
4#ar.wikipediaBelgiumMain10.00.00.0
5#ar.wikipediaDenmarkويكيبيديا10.00.01.0
6#ar.wikipediaEgyptMain281.04.08.0
7#ar.wikipediaHashemite Kingdom of JordanMain80.00.00.0
8#ar.wikipediaHashemite Kingdom of Jordanنقاش المستخدم10.00.00.0
9#ar.wikipediaIranMain10.00.00.0
10#ar.wikipediaIraqMain190.00.04.0
11#ar.wikipediaIraqنقاش20.00.01.0
12#ar.wikipediaIsraelMain40.00.00.0
13#ar.wikipediaKuwaitMain90.00.00.0
14#ar.wikipediaKuwaitنقاش المستخدم10.00.01.0
15#ar.wikipediaLebanonMain20.00.00.0
16#ar.wikipediaLibyaMain30.02.02.0
17#ar.wikipediaMoroccoMain130.02.04.0
18#ar.wikipediaMoroccoبوابة10.00.00.0
19#ar.wikipediaMoroccoقالب10.00.00.0
20#ar.wikipediaMoroccoنقاش10.00.01.0
21#ar.wikipediaMoroccoنقاش المستخدم10.00.00.0
22#ar.wikipediaMoroccoويكيبيديا11.01.01.0
23#ar.wikipediaOmanMain20.00.00.0
24#ar.wikipediaPalestineMain40.00.00.0
25#ar.wikipediaQatarMain30.00.00.0
26#ar.wikipediaSaudi ArabiaMain970.03.08.0
27#ar.wikipediaSaudi Arabiaمستخدم10.00.00.0
28#ar.wikipediaSaudi Arabiaويكيبيديا10.00.00.0
29#ar.wikipediaSudanMain20.00.00.0
........................
994#zh.wikipediaChinaMain220.00.01.0
995#zh.wikipediaChinaWikipedia40.00.02.0
996#zh.wikipediaCzech RepublicMain30.00.00.0
997#zh.wikipediaFinlandMain10.00.00.0
998#zh.wikipediaFranceMain20.00.00.0
999#zh.wikipediaGermanyMain10.00.00.0
1000#zh.wikipediaHong KongMain4402.014.022.0
1001#zh.wikipediaHong KongTemplate20.00.00.0
1002#zh.wikipediaIsraelMain10.00.01.0
1003#zh.wikipediaItalyMain10.00.00.0
1004#zh.wikipediaJapanMain121.02.03.0
1005#zh.wikipediaJapanTalk10.00.00.0
1006#zh.wikipediaMacaoMain120.00.01.0
1007#zh.wikipediaMacaoWikipedia10.00.00.0
1008#zh.wikipediaMalaysiaMain450.00.01.0
1009#zh.wikipediaNew ZealandMain20.00.00.0
1010#zh.wikipediaPortugalMain20.00.00.0
1011#zh.wikipediaRepublic of KoreaMain10.00.00.0
1012#zh.wikipediaSingaporeMain120.00.00.0
1013#zh.wikipediaTaiwanFile30.00.00.0
1014#zh.wikipediaTaiwanMain6590.06.039.0
1015#zh.wikipediaTaiwanTalk80.01.01.0
1016#zh.wikipediaTaiwanTemplate20.00.00.0
1017#zh.wikipediaTaiwanUser10.00.00.0
1018#zh.wikipediaTaiwanUser talk10.00.01.0
1019#zh.wikipediaTaiwanWikipedia70.02.06.0
1020#zh.wikipediaUnited KingdomMain20.00.00.0
1021#zh.wikipediaUnited StatesMain390.01.05.0
1022#zh.wikipediaUnited StatesTemplate10.00.00.0
1023#zh.wikipediaVietnamMain50.00.00.0
\n", + "

1024 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " channel countryName namespace count \\\n", + "0 #ar.wikipedia Algeria Main 18 \n", + "1 #ar.wikipedia Algeria نقاش 1 \n", + "2 #ar.wikipedia Australia Main 1 \n", + "3 #ar.wikipedia Bahrain Main 1 \n", + "4 #ar.wikipedia Belgium Main 1 \n", + "5 #ar.wikipedia Denmark ويكيبيديا 1 \n", + "6 #ar.wikipedia Egypt Main 28 \n", + "7 #ar.wikipedia Hashemite Kingdom of Jordan Main 8 \n", + "8 #ar.wikipedia Hashemite Kingdom of Jordan نقاش المستخدم 1 \n", + "9 #ar.wikipedia Iran Main 1 \n", + "10 #ar.wikipedia Iraq Main 19 \n", + "11 #ar.wikipedia Iraq نقاش 2 \n", + "12 #ar.wikipedia Israel Main 4 \n", + "13 #ar.wikipedia Kuwait Main 9 \n", + "14 #ar.wikipedia Kuwait نقاش المستخدم 1 \n", + "15 #ar.wikipedia Lebanon Main 2 \n", + "16 #ar.wikipedia Libya Main 3 \n", + "17 #ar.wikipedia Morocco Main 13 \n", + "18 #ar.wikipedia Morocco بوابة 1 \n", + "19 #ar.wikipedia Morocco قالب 1 \n", + "20 #ar.wikipedia Morocco نقاش 1 \n", + "21 #ar.wikipedia Morocco نقاش المستخدم 1 \n", + "22 #ar.wikipedia Morocco ويكيبيديا 1 \n", + "23 #ar.wikipedia Oman Main 2 \n", + "24 #ar.wikipedia Palestine Main 4 \n", + "25 #ar.wikipedia Qatar Main 3 \n", + "26 #ar.wikipedia Saudi Arabia Main 97 \n", + "27 #ar.wikipedia Saudi Arabia مستخدم 1 \n", + "28 #ar.wikipedia Saudi Arabia ويكيبيديا 1 \n", + "29 #ar.wikipedia Sudan Main 2 \n", + "... ... ... ... ... \n", + "994 #zh.wikipedia China Main 22 \n", + "995 #zh.wikipedia China Wikipedia 4 \n", + "996 #zh.wikipedia Czech Republic Main 3 \n", + "997 #zh.wikipedia Finland Main 1 \n", + "998 #zh.wikipedia France Main 2 \n", + "999 #zh.wikipedia Germany Main 1 \n", + "1000 #zh.wikipedia Hong Kong Main 440 \n", + "1001 #zh.wikipedia Hong Kong Template 2 \n", + "1002 #zh.wikipedia Israel Main 1 \n", + "1003 #zh.wikipedia Italy Main 1 \n", + "1004 #zh.wikipedia Japan Main 12 \n", + "1005 #zh.wikipedia Japan Talk 1 \n", + "1006 #zh.wikipedia Macao Main 12 \n", + "1007 #zh.wikipedia Macao Wikipedia 1 \n", + "1008 #zh.wikipedia Malaysia Main 45 \n", + "1009 #zh.wikipedia New Zealand Main 2 \n", + "1010 #zh.wikipedia Portugal Main 2 \n", + "1011 #zh.wikipedia Republic of Korea Main 1 \n", + "1012 #zh.wikipedia Singapore Main 12 \n", + "1013 #zh.wikipedia Taiwan File 3 \n", + "1014 #zh.wikipedia Taiwan Main 659 \n", + "1015 #zh.wikipedia Taiwan Talk 8 \n", + "1016 #zh.wikipedia Taiwan Template 2 \n", + "1017 #zh.wikipedia Taiwan User 1 \n", + "1018 #zh.wikipedia Taiwan User talk 1 \n", + "1019 #zh.wikipedia Taiwan Wikipedia 7 \n", + "1020 #zh.wikipedia United Kingdom Main 2 \n", + "1021 #zh.wikipedia United States Main 39 \n", + "1022 #zh.wikipedia United States Template 1 \n", + "1023 #zh.wikipedia Vietnam Main 5 \n", + "\n", + " outliers1 outliers5 outliers10 \n", + "0 0.0 0.0 2.0 \n", + "1 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 \n", + "5 0.0 0.0 1.0 \n", + "6 1.0 4.0 8.0 \n", + "7 0.0 0.0 0.0 \n", + "8 0.0 0.0 0.0 \n", + "9 0.0 0.0 0.0 \n", + "10 0.0 0.0 4.0 \n", + "11 0.0 0.0 1.0 \n", + "12 0.0 0.0 0.0 \n", + "13 0.0 0.0 0.0 \n", + "14 0.0 0.0 1.0 \n", + "15 0.0 0.0 0.0 \n", + "16 0.0 2.0 2.0 \n", + "17 0.0 2.0 4.0 \n", + "18 0.0 0.0 0.0 \n", + "19 0.0 0.0 0.0 \n", + "20 0.0 0.0 1.0 \n", + "21 0.0 0.0 0.0 \n", + "22 1.0 1.0 1.0 \n", + "23 0.0 0.0 0.0 \n", + "24 0.0 0.0 0.0 \n", + "25 0.0 0.0 0.0 \n", + "26 0.0 3.0 8.0 \n", + "27 0.0 0.0 0.0 \n", + "28 0.0 0.0 0.0 \n", + "29 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "994 0.0 0.0 1.0 \n", + "995 0.0 0.0 2.0 \n", + "996 0.0 0.0 0.0 \n", + "997 0.0 0.0 0.0 \n", + "998 0.0 0.0 0.0 \n", + "999 0.0 0.0 0.0 \n", + "1000 2.0 14.0 22.0 \n", + "1001 0.0 0.0 0.0 \n", + "1002 0.0 0.0 1.0 \n", + "1003 0.0 0.0 0.0 \n", + "1004 1.0 2.0 3.0 \n", + "1005 0.0 0.0 0.0 \n", + "1006 0.0 0.0 1.0 \n", + "1007 0.0 0.0 0.0 \n", + "1008 0.0 0.0 1.0 \n", + "1009 0.0 0.0 0.0 \n", + "1010 0.0 0.0 0.0 \n", + "1011 0.0 0.0 0.0 \n", + "1012 0.0 0.0 0.0 \n", + "1013 0.0 0.0 0.0 \n", + "1014 0.0 6.0 39.0 \n", + "1015 0.0 1.0 1.0 \n", + "1016 0.0 0.0 0.0 \n", + "1017 0.0 0.0 0.0 \n", + "1018 0.0 0.0 1.0 \n", + "1019 0.0 2.0 6.0 \n", + "1020 0.0 0.0 0.0 \n", + "1021 0.0 1.0 5.0 \n", + "1022 0.0 0.0 0.0 \n", + "1023 0.0 0.0 0.0 \n", + "\n", + "[1024 rows x 7 columns]" + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t1 = wiki_data[metric].quantile(0.99)\n", + "t5 = wiki_data[metric].quantile(0.95)\n", + "t10 = wiki_data[metric].quantile(0.90)\n", + "wiki_oracle = wiki_data.groupby(attributes).agg({metric: [\n", + " 'count',\n", + " outliers(t1, \"1\"),\n", + " outliers(t5, \"5\"),\n", + " outliers(t10, \"10\")\n", + "]}).reset_index(col_level=1)\n", + "wiki_oracle.columns = wiki_oracle.columns.get_level_values(1)\n", + "wiki_oracle" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "wiki_oracle.to_csv('lib/src/test/resources/wiki_oracle_cubed.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Big Wiki" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "big_wiki_data = pd.read_csv('~/Downloads/wiki-10M.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "comment 2672206\n", + "isNew 2\n", + "isUnpatrolled 2\n", + "commentLength 317\n", + "deltaBucket 1765\n", + "regionName 2290\n", + "namespace 1005\n", + "isMinor 2\n", + "channel 53\n", + "added 27858\n", + "isRobot 2\n", + "deleted 501\n", + "countryIsoCode 219\n", + "__time 10213634\n", + "user 531887\n", + "delta 28358\n", + "regionIsoCode 1131\n", + "count 1\n", + "countryName 220\n", + "metroCode 209\n", + "cityName 26859\n", + "flags 11\n", + "diffUrl 10209896\n", + "isAnonymous 2\n", + "page 5448626\n", + "dtype: int64" + ] + }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "big_wiki_data.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "metric = \"added\"\n", + "attributes = [\"channel\", \"namespace\"]\n", + "big_wiki_data = big_wiki_data.set_index(pd.DatetimeIndex(big_wiki_data['__time']))\n", + "big_wiki_data = big_wiki_data[attributes + [metric]]\n", + "big_wiki_data = big_wiki_data[(np.isfinite(big_wiki_data[metric])) & (big_wiki_data[metric] > 0)]" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
channelnamespaceadded
__time
2018-01-08 00:00:01.258#ru.wikipediaMediaWiki6.0
2018-01-08 00:00:01.460#pl.wikipediaWikipedysta1.0
2018-01-08 00:00:02.143#en.wikipediaTalk23.0
2018-01-08 00:00:02.179#en.wikipediaMain9.0
2018-01-08 00:00:02.253#ceb.wikipediaMain8.0
\n", + "
" + ], + "text/plain": [ + " channel namespace added\n", + "__time \n", + "2018-01-08 00:00:01.258 #ru.wikipedia MediaWiki 6.0\n", + "2018-01-08 00:00:01.460 #pl.wikipedia Wikipedysta 1.0\n", + "2018-01-08 00:00:02.143 #en.wikipedia Talk 23.0\n", + "2018-01-08 00:00:02.179 #en.wikipedia Main 9.0\n", + "2018-01-08 00:00:02.253 #ceb.wikipedia Main 8.0" + ] + }, + "execution_count": 216, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "big_wiki_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
channelnamespace__timecountoutliers1
0#ar.wikipedia162018-01-15 05:00:0010.0
1#ar.wikipediaMain2018-01-08 00:00:001490.0
2#ar.wikipediaMain2018-01-08 01:00:001850.0
3#ar.wikipediaMain2018-01-08 02:00:001050.0
4#ar.wikipediaMain2018-01-08 03:00:00640.0
5#ar.wikipediaMain2018-01-08 04:00:00790.0
6#ar.wikipediaMain2018-01-08 05:00:00630.0
7#ar.wikipediaMain2018-01-08 06:00:001220.0
8#ar.wikipediaMain2018-01-08 07:00:001850.0
9#ar.wikipediaMain2018-01-08 08:00:001660.0
10#ar.wikipediaMain2018-01-08 09:00:001180.0
11#ar.wikipediaMain2018-01-08 10:00:001090.0
12#ar.wikipediaMain2018-01-08 11:00:001480.0
13#ar.wikipediaMain2018-01-08 12:00:002050.0
14#ar.wikipediaMain2018-01-08 13:00:002160.0
15#ar.wikipediaMain2018-01-08 14:00:002070.0
16#ar.wikipediaMain2018-01-08 15:00:001650.0
17#ar.wikipediaMain2018-01-08 16:00:001260.0
18#ar.wikipediaMain2018-01-08 17:00:001870.0
19#ar.wikipediaMain2018-01-08 18:00:003190.0
20#ar.wikipediaMain2018-01-08 19:00:001690.0
21#ar.wikipediaMain2018-01-08 20:00:001660.0
22#ar.wikipediaMain2018-01-08 21:00:001660.0
23#ar.wikipediaMain2018-01-08 22:00:001160.0
24#ar.wikipediaMain2018-01-08 23:00:001330.0
25#ar.wikipediaMain2018-01-09 00:00:00760.0
26#ar.wikipediaMain2018-01-09 01:00:00750.0
27#ar.wikipediaMain2018-01-09 02:00:00420.0
28#ar.wikipediaMain2018-01-09 03:00:00380.0
29#ar.wikipediaMain2018-01-09 04:00:00860.0
..................
119253#zh.wikipedia模块2018-01-24 04:00:0010.0
119254#zh.wikipedia模块2018-01-24 08:00:0010.0
119255#zh.wikipedia模块2018-01-24 13:00:0020.0
119256#zh.wikipedia模块2018-01-24 15:00:0010.0
119257#zh.wikipedia模块2018-01-25 00:00:0010.0
119258#zh.wikipedia模块2018-01-25 11:00:0030.0
119259#zh.wikipedia模块2018-01-25 12:00:0010.0
119260#zh.wikipedia模块2018-01-25 13:00:0020.0
119261#zh.wikipedia模块2018-01-25 14:00:0030.0
119262#zh.wikipedia模块2018-01-25 15:00:0010.0
119263#zh.wikipedia模块2018-01-25 18:00:0010.0
119264#zh.wikipedia模块2018-01-25 19:00:0020.0
119265#zh.wikipedia模块2018-01-25 20:00:0020.0
119266#zh.wikipedia模块2018-01-26 08:00:0010.0
119267#zh.wikipedia模块2018-01-26 09:00:0020.0
119268#zh.wikipedia模块2018-01-26 16:00:0010.0
119269#zh.wikipedia模块2018-01-26 17:00:0030.0
119270#zh.wikipedia模块2018-01-27 09:00:0020.0
119271#zh.wikipedia模块2018-01-27 11:00:0020.0
119272#zh.wikipedia模块2018-01-27 12:00:0010.0
119273#zh.wikipedia模块2018-01-27 13:00:0040.0
119274#zh.wikipedia模块2018-01-27 15:00:0010.0
119275#zh.wikipedia模块讨论2018-01-12 07:00:0010.0
119276#zh.wikipedia模块讨论2018-01-18 09:00:0010.0
119277#zh.wikipedia模块讨论2018-01-22 05:00:0010.0
119278#zh.wikipedia模块讨论2018-01-23 10:00:0030.0
119279#zh.wikipedia模块讨论2018-01-23 11:00:0010.0
119280#zh.wikipedia模块讨论2018-01-23 21:00:0010.0
119281#zh.wikipedia爆笑寵妃2018-01-23 09:00:0010.0
119282#zh.wikipedia阿富汗2018-01-09 23:00:0010.0
\n", + "

119283 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " channel namespace __time count outliers1\n", + "0 #ar.wikipedia 16 2018-01-15 05:00:00 1 0.0\n", + "1 #ar.wikipedia Main 2018-01-08 00:00:00 149 0.0\n", + "2 #ar.wikipedia Main 2018-01-08 01:00:00 185 0.0\n", + "3 #ar.wikipedia Main 2018-01-08 02:00:00 105 0.0\n", + "4 #ar.wikipedia Main 2018-01-08 03:00:00 64 0.0\n", + "5 #ar.wikipedia Main 2018-01-08 04:00:00 79 0.0\n", + "6 #ar.wikipedia Main 2018-01-08 05:00:00 63 0.0\n", + "7 #ar.wikipedia Main 2018-01-08 06:00:00 122 0.0\n", + "8 #ar.wikipedia Main 2018-01-08 07:00:00 185 0.0\n", + "9 #ar.wikipedia Main 2018-01-08 08:00:00 166 0.0\n", + "10 #ar.wikipedia Main 2018-01-08 09:00:00 118 0.0\n", + "11 #ar.wikipedia Main 2018-01-08 10:00:00 109 0.0\n", + "12 #ar.wikipedia Main 2018-01-08 11:00:00 148 0.0\n", + "13 #ar.wikipedia Main 2018-01-08 12:00:00 205 0.0\n", + "14 #ar.wikipedia Main 2018-01-08 13:00:00 216 0.0\n", + "15 #ar.wikipedia Main 2018-01-08 14:00:00 207 0.0\n", + "16 #ar.wikipedia Main 2018-01-08 15:00:00 165 0.0\n", + "17 #ar.wikipedia Main 2018-01-08 16:00:00 126 0.0\n", + "18 #ar.wikipedia Main 2018-01-08 17:00:00 187 0.0\n", + "19 #ar.wikipedia Main 2018-01-08 18:00:00 319 0.0\n", + "20 #ar.wikipedia Main 2018-01-08 19:00:00 169 0.0\n", + "21 #ar.wikipedia Main 2018-01-08 20:00:00 166 0.0\n", + "22 #ar.wikipedia Main 2018-01-08 21:00:00 166 0.0\n", + "23 #ar.wikipedia Main 2018-01-08 22:00:00 116 0.0\n", + "24 #ar.wikipedia Main 2018-01-08 23:00:00 133 0.0\n", + "25 #ar.wikipedia Main 2018-01-09 00:00:00 76 0.0\n", + "26 #ar.wikipedia Main 2018-01-09 01:00:00 75 0.0\n", + "27 #ar.wikipedia Main 2018-01-09 02:00:00 42 0.0\n", + "28 #ar.wikipedia Main 2018-01-09 03:00:00 38 0.0\n", + "29 #ar.wikipedia Main 2018-01-09 04:00:00 86 0.0\n", + "... ... ... ... ... ...\n", + "119253 #zh.wikipedia 模块 2018-01-24 04:00:00 1 0.0\n", + "119254 #zh.wikipedia 模块 2018-01-24 08:00:00 1 0.0\n", + "119255 #zh.wikipedia 模块 2018-01-24 13:00:00 2 0.0\n", + "119256 #zh.wikipedia 模块 2018-01-24 15:00:00 1 0.0\n", + "119257 #zh.wikipedia 模块 2018-01-25 00:00:00 1 0.0\n", + "119258 #zh.wikipedia 模块 2018-01-25 11:00:00 3 0.0\n", + "119259 #zh.wikipedia 模块 2018-01-25 12:00:00 1 0.0\n", + "119260 #zh.wikipedia 模块 2018-01-25 13:00:00 2 0.0\n", + "119261 #zh.wikipedia 模块 2018-01-25 14:00:00 3 0.0\n", + "119262 #zh.wikipedia 模块 2018-01-25 15:00:00 1 0.0\n", + "119263 #zh.wikipedia 模块 2018-01-25 18:00:00 1 0.0\n", + "119264 #zh.wikipedia 模块 2018-01-25 19:00:00 2 0.0\n", + "119265 #zh.wikipedia 模块 2018-01-25 20:00:00 2 0.0\n", + "119266 #zh.wikipedia 模块 2018-01-26 08:00:00 1 0.0\n", + "119267 #zh.wikipedia 模块 2018-01-26 09:00:00 2 0.0\n", + "119268 #zh.wikipedia 模块 2018-01-26 16:00:00 1 0.0\n", + "119269 #zh.wikipedia 模块 2018-01-26 17:00:00 3 0.0\n", + "119270 #zh.wikipedia 模块 2018-01-27 09:00:00 2 0.0\n", + "119271 #zh.wikipedia 模块 2018-01-27 11:00:00 2 0.0\n", + "119272 #zh.wikipedia 模块 2018-01-27 12:00:00 1 0.0\n", + "119273 #zh.wikipedia 模块 2018-01-27 13:00:00 4 0.0\n", + "119274 #zh.wikipedia 模块 2018-01-27 15:00:00 1 0.0\n", + "119275 #zh.wikipedia 模块讨论 2018-01-12 07:00:00 1 0.0\n", + "119276 #zh.wikipedia 模块讨论 2018-01-18 09:00:00 1 0.0\n", + "119277 #zh.wikipedia 模块讨论 2018-01-22 05:00:00 1 0.0\n", + "119278 #zh.wikipedia 模块讨论 2018-01-23 10:00:00 3 0.0\n", + "119279 #zh.wikipedia 模块讨论 2018-01-23 11:00:00 1 0.0\n", + "119280 #zh.wikipedia 模块讨论 2018-01-23 21:00:00 1 0.0\n", + "119281 #zh.wikipedia 爆笑寵妃 2018-01-23 09:00:00 1 0.0\n", + "119282 #zh.wikipedia 阿富汗 2018-01-09 23:00:00 1 0.0\n", + "\n", + "[119283 rows x 5 columns]" + ] + }, + "execution_count": 217, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t1 = big_wiki_data[metric].quantile(0.99)\n", + "big_wiki_oracle = big_wiki_data.groupby(attributes + [pd.TimeGrouper(freq='H')]).agg({metric: [\n", + " 'count',\n", + " outliers(t1, \"1\"),\n", + "]}).reset_index(col_level=1)\n", + "big_wiki_oracle.columns = big_wiki_oracle.columns.get_level_values(1)\n", + "big_wiki_oracle" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "74235.0" + ] + }, + "execution_count": 221, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "big_wiki_oracle['outliers1'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "big_wiki_oracle.to_csv('lib/src/test/resources/big_wiki_oracle_cubed.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
channelnamespace__timeminmaxlminlmaxm0m1m2m3m4m5lm0lm1lm2lm3lm4lm5
0#ar.wikipedia162018-01-15 05:00:0025.025.03.2188763.2188761.025.0625.01.562500e+043.906250e+059.765625e+061.03.21887610.36116233.351293107.3536693.455581e+02
1#ar.wikipediaMain2018-01-08 00:00:001.09008.00.0000009.105868149.021663.083984409.07.333297e+116.587312e+155.931574e+19149.0623.8659212750.86632212573.99523160228.0247553.084326e+05
2#ar.wikipediaMain2018-01-08 01:00:002.02211.00.6931477.701200185.022428.017164924.02.951661e+105.669284e+131.124865e+17185.0778.4505243439.84908115837.66063076250.0855283.871860e+05
3#ar.wikipediaMain2018-01-08 02:00:001.02260.00.0000007.723120105.011471.010752073.02.150527e+104.631962e+131.011019e+17105.0409.8653541737.2331897792.11609237124.5115761.893863e+05
4#ar.wikipediaMain2018-01-08 03:00:001.05184.00.0000008.55333264.07686.027016542.01.393251e+117.222052e+143.743906e+1864.0222.911934854.6017603555.37952816586.8489398.970690e+04
5#ar.wikipediaMain2018-01-08 04:00:001.01904.00.0000007.55171279.013132.010292000.01.520400e+102.606315e+134.699229e+1679.0339.6702591597.1026987916.11067141284.4021012.266564e+05
6#ar.wikipediaMain2018-01-08 05:00:003.03720.01.0986128.22147963.022247.038118603.01.136075e+113.783324e+141.295618e+1863.0306.0302361616.4852459106.00998454137.7289873.373556e+05
7#ar.wikipediaMain2018-01-08 06:00:001.02296.00.0000007.738924122.025771.019301921.02.632441e+104.700515e+139.415283e+16122.0533.7467682632.32523513890.84409177252.1019514.486258e+05
8#ar.wikipediaMain2018-01-08 07:00:006.02110.01.7917597.654443185.035555.021997115.02.728389e+104.519400e+138.302905e+16185.0847.3189814131.66505721252.017440114463.6832426.416142e+05
9#ar.wikipediaMain2018-01-08 08:00:004.02244.01.3862947.716015166.032412.019844876.02.461746e+104.288884e+138.480738e+16166.0755.3406783693.91916219119.675276103717.4814045.852788e+05
10#ar.wikipediaMain2018-01-08 09:00:001.09370.00.0000009.145268118.048319.0213810381.01.489308e+121.171073e+169.747945e+19118.0505.9357992471.90923713561.43722282846.6909145.542719e+05
11#ar.wikipediaMain2018-01-08 10:00:002.014400.00.6931479.574983109.038819.0234665775.03.054160e+124.322428e+166.199923e+20109.0485.3954832407.65964013061.33152476704.5631494.838209e+05
12#ar.wikipediaMain2018-01-08 11:00:001.020861.00.0000009.945637148.059901.0545245471.09.823238e+121.954983e+174.003327e+21148.0633.6540163107.06569116850.137822100333.7564586.513246e+05
13#ar.wikipediaMain2018-01-08 12:00:001.019615.00.0000009.884050205.069733.0453862027.07.791436e+121.492376e+172.910235e+21205.0917.4473504610.70021625085.612638146173.1427909.073211e+05
14#ar.wikipediaMain2018-01-08 13:00:001.020813.00.0000009.943333216.091332.0764243202.01.165487e+132.099017e+174.096753e+21216.0957.1652394692.06395925194.242050148637.3423079.627241e+05
15#ar.wikipediaMain2018-01-08 14:00:001.024895.00.00000010.122422207.098639.0828905321.01.662305e+133.926093e+179.629692e+21207.0967.2864784983.87153427972.605325170348.0710881.118231e+06
16#ar.wikipediaMain2018-01-08 15:00:001.02758.00.0000007.922261165.011762.011136476.02.558908e+106.514402e+131.714289e+17165.0561.7893542125.1203178656.20508038185.4918631.846790e+05
17#ar.wikipediaMain2018-01-08 16:00:005.05824.01.6094388.669743126.029981.088989143.04.395026e+112.415290e+151.372286e+19126.0507.1001692243.43918811084.67278061604.6031773.816690e+05
18#ar.wikipediaMain2018-01-08 17:00:001.09981.00.0000009.208439187.036781.0134903627.01.066277e+121.007978e+169.939758e+19187.0710.1830063039.63661914510.23603377742.3518884.661678e+05
19#ar.wikipediaMain2018-01-08 18:00:001.09299.00.0000009.137662319.056324.0180817952.01.160272e+129.029371e+157.672157e+19319.01189.5291055003.13472123442.962217123315.6943407.269031e+05
20#ar.wikipediaMain2018-01-08 19:00:001.012493.00.0000009.432924169.036030.0189039444.02.029805e+122.457818e+163.049638e+20169.0641.1526352752.67252513162.82376570715.5794554.268993e+05
21#ar.wikipediaMain2018-01-08 20:00:001.02868.00.0000007.961370166.027412.030194820.05.825853e+101.326486e+143.273461e+17166.0669.3558182995.18280014509.81117776283.2566994.338066e+05
22#ar.wikipediaMain2018-01-08 21:00:001.09831.00.0000009.193296166.081943.0351898425.02.292421e+121.814749e+161.579637e+20166.0693.0880893395.67919019308.399786124886.0424688.875766e+05
23#ar.wikipediaMain2018-01-08 22:00:001.04344.00.0000008.376551116.041030.094661544.03.027321e+111.094226e+154.217602e+18116.0471.4830262320.78684112923.65101079754.0027445.325321e+05
24#ar.wikipediaMain2018-01-08 23:00:001.04500.00.0000008.411833133.034747.066006241.02.072867e+117.757698e+143.141152e+18133.0551.9933462618.55559613703.86193878719.8354984.903965e+05
25#ar.wikipediaMain2018-01-09 00:00:001.07474.00.0000008.91918676.035724.0116030450.06.091646e+113.818369e+152.606292e+1976.0335.0359361753.13042210269.09136665447.3157514.447135e+05
26#ar.wikipediaMain2018-01-09 01:00:001.02811.00.0000007.94129675.011939.018132095.04.276567e+101.072078e+142.762855e+1775.0289.6234851250.4006355902.21286030618.1992781.747318e+05
27#ar.wikipediaMain2018-01-09 02:00:002.0702.00.6931476.55393342.03056.0964256.05.605717e+083.626704e+112.399057e+1442.0146.596532582.8244682498.18033411387.2893895.507752e+04
28#ar.wikipediaMain2018-01-09 03:00:001.01299.00.0000007.16935038.03930.01942072.02.222922e+092.851957e+123.699472e+1538.0151.273501665.2426633040.73662414415.3831727.132009e+04
29#ar.wikipediaMain2018-01-09 04:00:002.0447.00.6931476.10255986.05300.0532602.01.162811e+084.249718e+101.812852e+1386.0337.7512661368.9236625647.86709023665.5505781.008164e+05
............................................................
119253#zh.wikipedia模块2018-01-24 04:00:001.01.00.0000000.0000001.01.01.01.000000e+001.000000e+001.000000e+001.00.0000000.0000000.0000000.0000000.000000e+00
119254#zh.wikipedia模块2018-01-24 08:00:006.06.01.7917591.7917591.06.036.02.160000e+021.296000e+037.776000e+031.01.7917593.2104025.75226810.3066811.846709e+01
119255#zh.wikipedia模块2018-01-24 13:00:009.06132.02.1972258.7212762.06141.037601505.02.305719e+111.413867e+158.669833e+182.010.91850180.888455673.9537725808.5314995.050575e+04
119256#zh.wikipedia模块2018-01-24 15:00:0028.028.03.3322053.3322051.028.0784.02.195200e+046.146560e+051.721037e+071.03.33220511.10358736.999422123.2896424.108263e+02
119257#zh.wikipedia模块2018-01-25 00:00:0015.015.02.7080502.7080501.015.0225.03.375000e+035.062500e+047.593750e+051.02.7080507.33353619.85958353.7807491.456410e+02
119258#zh.wikipedia模块2018-01-25 11:00:0015.0298.02.7080505.6970933.0431.0102953.02.811000e+078.080079e+092.372951e+123.013.17582862.549842313.3475011625.2211668.618412e+03
119259#zh.wikipedia模块2018-01-25 12:00:002.02.00.6931470.6931471.02.04.08.000000e+001.600000e+013.200000e+011.00.6931470.4804530.3330250.2308351.600027e-01
119260#zh.wikipedia模块2018-01-25 13:00:004.05020.01.3862948.5211852.05024.025200416.01.265060e+116.350602e+143.188002e+182.09.90748074.532409621.3925465275.9922214.493136e+04
119261#zh.wikipedia模块2018-01-25 14:00:002.047.00.6931473.8501483.053.02229.01.038950e+054.879953e+062.293461e+083.05.92958917.22590260.070411223.6643978.513123e+02
119262#zh.wikipedia模块2018-01-25 15:00:0049.049.03.8918203.8918201.049.02401.01.176490e+055.764801e+062.824752e+081.03.89182015.14626558.946542229.4093518.928200e+02
119263#zh.wikipedia模块2018-01-25 18:00:0049.049.03.8918203.8918201.049.02401.01.176490e+055.764801e+062.824752e+081.03.89182015.14626558.946542229.4093518.928200e+02
119264#zh.wikipedia模块2018-01-25 19:00:0042.084.03.7376704.4308172.0126.08820.06.667920e+055.289883e+074.312811e+092.08.16848633.602312139.202300580.5865902.437194e+03
119265#zh.wikipedia模块2018-01-25 20:00:0022.0173.03.0910425.1532922.0195.030413.05.188365e+068.959793e+081.549690e+112.08.24433436.110958166.386446796.5324393.916503e+03
119266#zh.wikipedia模块2018-01-26 08:00:0070.070.04.2484954.2484951.070.04900.03.430000e+052.401000e+071.680700e+091.04.24849518.04971276.684115325.7920971.384126e+03
119267#zh.wikipedia模块2018-01-26 09:00:001.018.00.0000002.8903722.019.0325.05.833000e+031.049770e+051.889569e+062.02.8903728.35424924.14688569.7934752.017291e+02
119268#zh.wikipedia模块2018-01-26 16:00:001799.01799.07.4949867.4949861.01799.03236401.05.822285e+091.047429e+131.884325e+161.07.49498656.174819421.0294923155.6102502.365126e+04
119269#zh.wikipedia模块2018-01-26 17:00:0028.03574.03.3322058.1814413.07016.024429656.08.544395e+102.990099e+141.046926e+183.019.649285144.2281951123.1190208984.6495907.270887e+04
119270#zh.wikipedia模块2018-01-27 09:00:00191.08720.05.2522739.0733752.08911.076074881.06.630618e+115.781840e+155.041763e+192.014.325648109.912501891.8669567538.5990286.549264e+04
119271#zh.wikipedia模块2018-01-27 11:00:00130.0191.04.8675345.2522732.0321.053381.09.164871e+061.616473e+092.913242e+112.010.11980851.279268260.2171571322.3612636.729429e+03
119272#zh.wikipedia模块2018-01-27 12:00:002548.02548.07.8430647.8430641.02548.06492304.01.654239e+104.215001e+131.073982e+171.07.84306461.513653482.4555203783.9295262.967760e+04
119273#zh.wikipedia模块2018-01-27 13:00:0012.01146.02.4849077.0440334.01244.01317608.01.505279e+091.724811e+121.976620e+154.016.92158783.383945468.8020912895.2305601.895162e+04
119274#zh.wikipedia模块2018-01-27 15:00:0088.088.04.4773374.4773371.088.07744.06.814720e+055.996954e+075.277319e+091.04.47733720.04654589.755134401.8639641.799280e+03
119275#zh.wikipedia模块讨论2018-01-12 07:00:00438.0438.06.0822196.0822191.0438.0191844.08.402767e+073.680412e+101.612020e+131.06.08221936.993387225.0018771368.5106728.323581e+03
119276#zh.wikipedia模块讨论2018-01-18 09:00:0096.096.04.5643484.5643481.096.09216.08.847360e+058.493466e+078.153727e+091.04.56434820.83327495.090318434.0253231.981043e+03
119277#zh.wikipedia模块讨论2018-01-22 05:00:0026497.026497.010.18478710.1847871.026497.0702091009.01.860331e+134.929318e+171.306121e+221.010.184787103.7298821056.46673410759.8884451.095872e+05
119278#zh.wikipedia模块讨论2018-01-23 10:00:006.0427.01.7917596.0567843.0632.0221966.08.573530e+073.481210e+101.450721e+133.013.14184867.914111376.2566742141.1375671.232507e+04
119279#zh.wikipedia模块讨论2018-01-23 11:00:00185.0185.05.2203565.2203561.0185.034225.06.331625e+061.171351e+092.166999e+111.05.22035627.252115142.265737742.6777693.877042e+03
119280#zh.wikipedia模块讨论2018-01-23 21:00:00137.0137.04.9199814.9199811.0137.018769.02.571353e+063.522754e+084.826172e+101.04.91998124.206212119.094103585.9407142.882817e+03
119281#zh.wikipedia爆笑寵妃2018-01-23 09:00:0059.059.04.0775374.0775371.059.03481.02.053790e+051.211736e+077.149243e+081.04.07753716.62631267.794408276.4342381.127171e+03
119282#zh.wikipedia阿富汗2018-01-09 23:00:0028.028.03.3322053.3322051.028.0784.02.195200e+046.146560e+051.721037e+071.03.33220511.10358736.999422123.2896424.108263e+02
\n", + "

119283 rows × 19 columns

\n", + "
" + ], + "text/plain": [ + " channel namespace __time min max \\\n", + "0 #ar.wikipedia 16 2018-01-15 05:00:00 25.0 25.0 \n", + "1 #ar.wikipedia Main 2018-01-08 00:00:00 1.0 9008.0 \n", + "2 #ar.wikipedia Main 2018-01-08 01:00:00 2.0 2211.0 \n", + "3 #ar.wikipedia Main 2018-01-08 02:00:00 1.0 2260.0 \n", + "4 #ar.wikipedia Main 2018-01-08 03:00:00 1.0 5184.0 \n", + "5 #ar.wikipedia Main 2018-01-08 04:00:00 1.0 1904.0 \n", + "6 #ar.wikipedia Main 2018-01-08 05:00:00 3.0 3720.0 \n", + "7 #ar.wikipedia Main 2018-01-08 06:00:00 1.0 2296.0 \n", + "8 #ar.wikipedia Main 2018-01-08 07:00:00 6.0 2110.0 \n", + "9 #ar.wikipedia Main 2018-01-08 08:00:00 4.0 2244.0 \n", + "10 #ar.wikipedia Main 2018-01-08 09:00:00 1.0 9370.0 \n", + "11 #ar.wikipedia Main 2018-01-08 10:00:00 2.0 14400.0 \n", + "12 #ar.wikipedia Main 2018-01-08 11:00:00 1.0 20861.0 \n", + "13 #ar.wikipedia Main 2018-01-08 12:00:00 1.0 19615.0 \n", + "14 #ar.wikipedia Main 2018-01-08 13:00:00 1.0 20813.0 \n", + "15 #ar.wikipedia Main 2018-01-08 14:00:00 1.0 24895.0 \n", + "16 #ar.wikipedia Main 2018-01-08 15:00:00 1.0 2758.0 \n", + "17 #ar.wikipedia Main 2018-01-08 16:00:00 5.0 5824.0 \n", + "18 #ar.wikipedia Main 2018-01-08 17:00:00 1.0 9981.0 \n", + "19 #ar.wikipedia Main 2018-01-08 18:00:00 1.0 9299.0 \n", + "20 #ar.wikipedia Main 2018-01-08 19:00:00 1.0 12493.0 \n", + "21 #ar.wikipedia Main 2018-01-08 20:00:00 1.0 2868.0 \n", + "22 #ar.wikipedia Main 2018-01-08 21:00:00 1.0 9831.0 \n", + "23 #ar.wikipedia Main 2018-01-08 22:00:00 1.0 4344.0 \n", + "24 #ar.wikipedia Main 2018-01-08 23:00:00 1.0 4500.0 \n", + "25 #ar.wikipedia Main 2018-01-09 00:00:00 1.0 7474.0 \n", + "26 #ar.wikipedia Main 2018-01-09 01:00:00 1.0 2811.0 \n", + "27 #ar.wikipedia Main 2018-01-09 02:00:00 2.0 702.0 \n", + "28 #ar.wikipedia Main 2018-01-09 03:00:00 1.0 1299.0 \n", + "29 #ar.wikipedia Main 2018-01-09 04:00:00 2.0 447.0 \n", + "... ... ... ... ... ... \n", + "119253 #zh.wikipedia 模块 2018-01-24 04:00:00 1.0 1.0 \n", + "119254 #zh.wikipedia 模块 2018-01-24 08:00:00 6.0 6.0 \n", + "119255 #zh.wikipedia 模块 2018-01-24 13:00:00 9.0 6132.0 \n", + "119256 #zh.wikipedia 模块 2018-01-24 15:00:00 28.0 28.0 \n", + "119257 #zh.wikipedia 模块 2018-01-25 00:00:00 15.0 15.0 \n", + "119258 #zh.wikipedia 模块 2018-01-25 11:00:00 15.0 298.0 \n", + "119259 #zh.wikipedia 模块 2018-01-25 12:00:00 2.0 2.0 \n", + "119260 #zh.wikipedia 模块 2018-01-25 13:00:00 4.0 5020.0 \n", + "119261 #zh.wikipedia 模块 2018-01-25 14:00:00 2.0 47.0 \n", + "119262 #zh.wikipedia 模块 2018-01-25 15:00:00 49.0 49.0 \n", + "119263 #zh.wikipedia 模块 2018-01-25 18:00:00 49.0 49.0 \n", + "119264 #zh.wikipedia 模块 2018-01-25 19:00:00 42.0 84.0 \n", + "119265 #zh.wikipedia 模块 2018-01-25 20:00:00 22.0 173.0 \n", + "119266 #zh.wikipedia 模块 2018-01-26 08:00:00 70.0 70.0 \n", + "119267 #zh.wikipedia 模块 2018-01-26 09:00:00 1.0 18.0 \n", + "119268 #zh.wikipedia 模块 2018-01-26 16:00:00 1799.0 1799.0 \n", + "119269 #zh.wikipedia 模块 2018-01-26 17:00:00 28.0 3574.0 \n", + "119270 #zh.wikipedia 模块 2018-01-27 09:00:00 191.0 8720.0 \n", + "119271 #zh.wikipedia 模块 2018-01-27 11:00:00 130.0 191.0 \n", + "119272 #zh.wikipedia 模块 2018-01-27 12:00:00 2548.0 2548.0 \n", + "119273 #zh.wikipedia 模块 2018-01-27 13:00:00 12.0 1146.0 \n", + "119274 #zh.wikipedia 模块 2018-01-27 15:00:00 88.0 88.0 \n", + "119275 #zh.wikipedia 模块讨论 2018-01-12 07:00:00 438.0 438.0 \n", + "119276 #zh.wikipedia 模块讨论 2018-01-18 09:00:00 96.0 96.0 \n", + "119277 #zh.wikipedia 模块讨论 2018-01-22 05:00:00 26497.0 26497.0 \n", + "119278 #zh.wikipedia 模块讨论 2018-01-23 10:00:00 6.0 427.0 \n", + "119279 #zh.wikipedia 模块讨论 2018-01-23 11:00:00 185.0 185.0 \n", + "119280 #zh.wikipedia 模块讨论 2018-01-23 21:00:00 137.0 137.0 \n", + "119281 #zh.wikipedia 爆笑寵妃 2018-01-23 09:00:00 59.0 59.0 \n", + "119282 #zh.wikipedia 阿富汗 2018-01-09 23:00:00 28.0 28.0 \n", + "\n", + " lmin lmax m0 m1 m2 m3 \\\n", + "0 3.218876 3.218876 1.0 25.0 625.0 1.562500e+04 \n", + "1 0.000000 9.105868 149.0 21663.0 83984409.0 7.333297e+11 \n", + "2 0.693147 7.701200 185.0 22428.0 17164924.0 2.951661e+10 \n", + "3 0.000000 7.723120 105.0 11471.0 10752073.0 2.150527e+10 \n", + "4 0.000000 8.553332 64.0 7686.0 27016542.0 1.393251e+11 \n", + "5 0.000000 7.551712 79.0 13132.0 10292000.0 1.520400e+10 \n", + "6 1.098612 8.221479 63.0 22247.0 38118603.0 1.136075e+11 \n", + "7 0.000000 7.738924 122.0 25771.0 19301921.0 2.632441e+10 \n", + "8 1.791759 7.654443 185.0 35555.0 21997115.0 2.728389e+10 \n", + "9 1.386294 7.716015 166.0 32412.0 19844876.0 2.461746e+10 \n", + "10 0.000000 9.145268 118.0 48319.0 213810381.0 1.489308e+12 \n", + "11 0.693147 9.574983 109.0 38819.0 234665775.0 3.054160e+12 \n", + "12 0.000000 9.945637 148.0 59901.0 545245471.0 9.823238e+12 \n", + "13 0.000000 9.884050 205.0 69733.0 453862027.0 7.791436e+12 \n", + "14 0.000000 9.943333 216.0 91332.0 764243202.0 1.165487e+13 \n", + "15 0.000000 10.122422 207.0 98639.0 828905321.0 1.662305e+13 \n", + "16 0.000000 7.922261 165.0 11762.0 11136476.0 2.558908e+10 \n", + "17 1.609438 8.669743 126.0 29981.0 88989143.0 4.395026e+11 \n", + "18 0.000000 9.208439 187.0 36781.0 134903627.0 1.066277e+12 \n", + "19 0.000000 9.137662 319.0 56324.0 180817952.0 1.160272e+12 \n", + "20 0.000000 9.432924 169.0 36030.0 189039444.0 2.029805e+12 \n", + "21 0.000000 7.961370 166.0 27412.0 30194820.0 5.825853e+10 \n", + "22 0.000000 9.193296 166.0 81943.0 351898425.0 2.292421e+12 \n", + "23 0.000000 8.376551 116.0 41030.0 94661544.0 3.027321e+11 \n", + "24 0.000000 8.411833 133.0 34747.0 66006241.0 2.072867e+11 \n", + "25 0.000000 8.919186 76.0 35724.0 116030450.0 6.091646e+11 \n", + "26 0.000000 7.941296 75.0 11939.0 18132095.0 4.276567e+10 \n", + "27 0.693147 6.553933 42.0 3056.0 964256.0 5.605717e+08 \n", + "28 0.000000 7.169350 38.0 3930.0 1942072.0 2.222922e+09 \n", + "29 0.693147 6.102559 86.0 5300.0 532602.0 1.162811e+08 \n", + "... ... ... ... ... ... ... \n", + "119253 0.000000 0.000000 1.0 1.0 1.0 1.000000e+00 \n", + "119254 1.791759 1.791759 1.0 6.0 36.0 2.160000e+02 \n", + "119255 2.197225 8.721276 2.0 6141.0 37601505.0 2.305719e+11 \n", + "119256 3.332205 3.332205 1.0 28.0 784.0 2.195200e+04 \n", + "119257 2.708050 2.708050 1.0 15.0 225.0 3.375000e+03 \n", + "119258 2.708050 5.697093 3.0 431.0 102953.0 2.811000e+07 \n", + "119259 0.693147 0.693147 1.0 2.0 4.0 8.000000e+00 \n", + "119260 1.386294 8.521185 2.0 5024.0 25200416.0 1.265060e+11 \n", + "119261 0.693147 3.850148 3.0 53.0 2229.0 1.038950e+05 \n", + "119262 3.891820 3.891820 1.0 49.0 2401.0 1.176490e+05 \n", + "119263 3.891820 3.891820 1.0 49.0 2401.0 1.176490e+05 \n", + "119264 3.737670 4.430817 2.0 126.0 8820.0 6.667920e+05 \n", + "119265 3.091042 5.153292 2.0 195.0 30413.0 5.188365e+06 \n", + "119266 4.248495 4.248495 1.0 70.0 4900.0 3.430000e+05 \n", + "119267 0.000000 2.890372 2.0 19.0 325.0 5.833000e+03 \n", + "119268 7.494986 7.494986 1.0 1799.0 3236401.0 5.822285e+09 \n", + "119269 3.332205 8.181441 3.0 7016.0 24429656.0 8.544395e+10 \n", + "119270 5.252273 9.073375 2.0 8911.0 76074881.0 6.630618e+11 \n", + "119271 4.867534 5.252273 2.0 321.0 53381.0 9.164871e+06 \n", + "119272 7.843064 7.843064 1.0 2548.0 6492304.0 1.654239e+10 \n", + "119273 2.484907 7.044033 4.0 1244.0 1317608.0 1.505279e+09 \n", + "119274 4.477337 4.477337 1.0 88.0 7744.0 6.814720e+05 \n", + "119275 6.082219 6.082219 1.0 438.0 191844.0 8.402767e+07 \n", + "119276 4.564348 4.564348 1.0 96.0 9216.0 8.847360e+05 \n", + "119277 10.184787 10.184787 1.0 26497.0 702091009.0 1.860331e+13 \n", + "119278 1.791759 6.056784 3.0 632.0 221966.0 8.573530e+07 \n", + "119279 5.220356 5.220356 1.0 185.0 34225.0 6.331625e+06 \n", + "119280 4.919981 4.919981 1.0 137.0 18769.0 2.571353e+06 \n", + "119281 4.077537 4.077537 1.0 59.0 3481.0 2.053790e+05 \n", + "119282 3.332205 3.332205 1.0 28.0 784.0 2.195200e+04 \n", + "\n", + " m4 m5 lm0 lm1 lm2 \\\n", + "0 3.906250e+05 9.765625e+06 1.0 3.218876 10.361162 \n", + "1 6.587312e+15 5.931574e+19 149.0 623.865921 2750.866322 \n", + "2 5.669284e+13 1.124865e+17 185.0 778.450524 3439.849081 \n", + "3 4.631962e+13 1.011019e+17 105.0 409.865354 1737.233189 \n", + "4 7.222052e+14 3.743906e+18 64.0 222.911934 854.601760 \n", + "5 2.606315e+13 4.699229e+16 79.0 339.670259 1597.102698 \n", + "6 3.783324e+14 1.295618e+18 63.0 306.030236 1616.485245 \n", + "7 4.700515e+13 9.415283e+16 122.0 533.746768 2632.325235 \n", + "8 4.519400e+13 8.302905e+16 185.0 847.318981 4131.665057 \n", + "9 4.288884e+13 8.480738e+16 166.0 755.340678 3693.919162 \n", + "10 1.171073e+16 9.747945e+19 118.0 505.935799 2471.909237 \n", + "11 4.322428e+16 6.199923e+20 109.0 485.395483 2407.659640 \n", + "12 1.954983e+17 4.003327e+21 148.0 633.654016 3107.065691 \n", + "13 1.492376e+17 2.910235e+21 205.0 917.447350 4610.700216 \n", + "14 2.099017e+17 4.096753e+21 216.0 957.165239 4692.063959 \n", + "15 3.926093e+17 9.629692e+21 207.0 967.286478 4983.871534 \n", + "16 6.514402e+13 1.714289e+17 165.0 561.789354 2125.120317 \n", + "17 2.415290e+15 1.372286e+19 126.0 507.100169 2243.439188 \n", + "18 1.007978e+16 9.939758e+19 187.0 710.183006 3039.636619 \n", + "19 9.029371e+15 7.672157e+19 319.0 1189.529105 5003.134721 \n", + "20 2.457818e+16 3.049638e+20 169.0 641.152635 2752.672525 \n", + "21 1.326486e+14 3.273461e+17 166.0 669.355818 2995.182800 \n", + "22 1.814749e+16 1.579637e+20 166.0 693.088089 3395.679190 \n", + "23 1.094226e+15 4.217602e+18 116.0 471.483026 2320.786841 \n", + "24 7.757698e+14 3.141152e+18 133.0 551.993346 2618.555596 \n", + "25 3.818369e+15 2.606292e+19 76.0 335.035936 1753.130422 \n", + "26 1.072078e+14 2.762855e+17 75.0 289.623485 1250.400635 \n", + "27 3.626704e+11 2.399057e+14 42.0 146.596532 582.824468 \n", + "28 2.851957e+12 3.699472e+15 38.0 151.273501 665.242663 \n", + "29 4.249718e+10 1.812852e+13 86.0 337.751266 1368.923662 \n", + "... ... ... ... ... ... \n", + "119253 1.000000e+00 1.000000e+00 1.0 0.000000 0.000000 \n", + "119254 1.296000e+03 7.776000e+03 1.0 1.791759 3.210402 \n", + "119255 1.413867e+15 8.669833e+18 2.0 10.918501 80.888455 \n", + "119256 6.146560e+05 1.721037e+07 1.0 3.332205 11.103587 \n", + "119257 5.062500e+04 7.593750e+05 1.0 2.708050 7.333536 \n", + "119258 8.080079e+09 2.372951e+12 3.0 13.175828 62.549842 \n", + "119259 1.600000e+01 3.200000e+01 1.0 0.693147 0.480453 \n", + "119260 6.350602e+14 3.188002e+18 2.0 9.907480 74.532409 \n", + "119261 4.879953e+06 2.293461e+08 3.0 5.929589 17.225902 \n", + "119262 5.764801e+06 2.824752e+08 1.0 3.891820 15.146265 \n", + "119263 5.764801e+06 2.824752e+08 1.0 3.891820 15.146265 \n", + "119264 5.289883e+07 4.312811e+09 2.0 8.168486 33.602312 \n", + "119265 8.959793e+08 1.549690e+11 2.0 8.244334 36.110958 \n", + "119266 2.401000e+07 1.680700e+09 1.0 4.248495 18.049712 \n", + "119267 1.049770e+05 1.889569e+06 2.0 2.890372 8.354249 \n", + "119268 1.047429e+13 1.884325e+16 1.0 7.494986 56.174819 \n", + "119269 2.990099e+14 1.046926e+18 3.0 19.649285 144.228195 \n", + "119270 5.781840e+15 5.041763e+19 2.0 14.325648 109.912501 \n", + "119271 1.616473e+09 2.913242e+11 2.0 10.119808 51.279268 \n", + "119272 4.215001e+13 1.073982e+17 1.0 7.843064 61.513653 \n", + "119273 1.724811e+12 1.976620e+15 4.0 16.921587 83.383945 \n", + "119274 5.996954e+07 5.277319e+09 1.0 4.477337 20.046545 \n", + "119275 3.680412e+10 1.612020e+13 1.0 6.082219 36.993387 \n", + "119276 8.493466e+07 8.153727e+09 1.0 4.564348 20.833274 \n", + "119277 4.929318e+17 1.306121e+22 1.0 10.184787 103.729882 \n", + "119278 3.481210e+10 1.450721e+13 3.0 13.141848 67.914111 \n", + "119279 1.171351e+09 2.166999e+11 1.0 5.220356 27.252115 \n", + "119280 3.522754e+08 4.826172e+10 1.0 4.919981 24.206212 \n", + "119281 1.211736e+07 7.149243e+08 1.0 4.077537 16.626312 \n", + "119282 6.146560e+05 1.721037e+07 1.0 3.332205 11.103587 \n", + "\n", + " lm3 lm4 lm5 \n", + "0 33.351293 107.353669 3.455581e+02 \n", + "1 12573.995231 60228.024755 3.084326e+05 \n", + "2 15837.660630 76250.085528 3.871860e+05 \n", + "3 7792.116092 37124.511576 1.893863e+05 \n", + "4 3555.379528 16586.848939 8.970690e+04 \n", + "5 7916.110671 41284.402101 2.266564e+05 \n", + "6 9106.009984 54137.728987 3.373556e+05 \n", + "7 13890.844091 77252.101951 4.486258e+05 \n", + "8 21252.017440 114463.683242 6.416142e+05 \n", + "9 19119.675276 103717.481404 5.852788e+05 \n", + "10 13561.437222 82846.690914 5.542719e+05 \n", + "11 13061.331524 76704.563149 4.838209e+05 \n", + "12 16850.137822 100333.756458 6.513246e+05 \n", + "13 25085.612638 146173.142790 9.073211e+05 \n", + "14 25194.242050 148637.342307 9.627241e+05 \n", + "15 27972.605325 170348.071088 1.118231e+06 \n", + "16 8656.205080 38185.491863 1.846790e+05 \n", + "17 11084.672780 61604.603177 3.816690e+05 \n", + "18 14510.236033 77742.351888 4.661678e+05 \n", + "19 23442.962217 123315.694340 7.269031e+05 \n", + "20 13162.823765 70715.579455 4.268993e+05 \n", + "21 14509.811177 76283.256699 4.338066e+05 \n", + "22 19308.399786 124886.042468 8.875766e+05 \n", + "23 12923.651010 79754.002744 5.325321e+05 \n", + "24 13703.861938 78719.835498 4.903965e+05 \n", + "25 10269.091366 65447.315751 4.447135e+05 \n", + "26 5902.212860 30618.199278 1.747318e+05 \n", + "27 2498.180334 11387.289389 5.507752e+04 \n", + "28 3040.736624 14415.383172 7.132009e+04 \n", + "29 5647.867090 23665.550578 1.008164e+05 \n", + "... ... ... ... \n", + "119253 0.000000 0.000000 0.000000e+00 \n", + "119254 5.752268 10.306681 1.846709e+01 \n", + "119255 673.953772 5808.531499 5.050575e+04 \n", + "119256 36.999422 123.289642 4.108263e+02 \n", + "119257 19.859583 53.780749 1.456410e+02 \n", + "119258 313.347501 1625.221166 8.618412e+03 \n", + "119259 0.333025 0.230835 1.600027e-01 \n", + "119260 621.392546 5275.992221 4.493136e+04 \n", + "119261 60.070411 223.664397 8.513123e+02 \n", + "119262 58.946542 229.409351 8.928200e+02 \n", + "119263 58.946542 229.409351 8.928200e+02 \n", + "119264 139.202300 580.586590 2.437194e+03 \n", + "119265 166.386446 796.532439 3.916503e+03 \n", + "119266 76.684115 325.792097 1.384126e+03 \n", + "119267 24.146885 69.793475 2.017291e+02 \n", + "119268 421.029492 3155.610250 2.365126e+04 \n", + "119269 1123.119020 8984.649590 7.270887e+04 \n", + "119270 891.866956 7538.599028 6.549264e+04 \n", + "119271 260.217157 1322.361263 6.729429e+03 \n", + "119272 482.455520 3783.929526 2.967760e+04 \n", + "119273 468.802091 2895.230560 1.895162e+04 \n", + "119274 89.755134 401.863964 1.799280e+03 \n", + "119275 225.001877 1368.510672 8.323581e+03 \n", + "119276 95.090318 434.025323 1.981043e+03 \n", + "119277 1056.466734 10759.888445 1.095872e+05 \n", + "119278 376.256674 2141.137567 1.232507e+04 \n", + "119279 142.265737 742.677769 3.877042e+03 \n", + "119280 119.094103 585.940714 2.882817e+03 \n", + "119281 67.794408 276.434238 1.127171e+03 \n", + "119282 36.999422 123.289642 4.108263e+02 \n", + "\n", + "[119283 rows x 19 columns]" + ] + }, + "execution_count": 219, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "big_wiki_cube = big_wiki_data.groupby(attributes + [pd.TimeGrouper(freq='H')]).agg({metric: [\n", + " 'min',\n", + " 'max',\n", + " log_min(),\n", + " log_max(),\n", + " moment(0),\n", + " moment(1),\n", + " moment(2),\n", + " moment(3),\n", + " moment(4),\n", + " moment(5),\n", + " log_moment(0),\n", + " log_moment(1),\n", + " log_moment(2),\n", + " log_moment(3),\n", + " log_moment(4),\n", + " log_moment(5)\n", + "]}).reset_index(col_level=1)\n", + "big_wiki_cube.columns = big_wiki_cube.columns.get_level_values(1)\n", + "big_wiki_cube" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "big_wiki_cube.to_csv('lib/src/test/resources/big_wiki_moments_cubed.csv')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java index 65727097f..3d0a8a114 100644 --- a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java +++ b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java @@ -5,10 +5,7 @@ import edu.stanford.futuredata.macrobase.analysis.classify.PredicateCubeClassifier; import edu.stanford.futuredata.macrobase.analysis.classify.QuantileClassifier; import edu.stanford.futuredata.macrobase.analysis.classify.RawClassifier; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLExplanation; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLMeanSummarizer; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLOutlierSummarizer; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLSummarizer; +import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.*; import edu.stanford.futuredata.macrobase.datamodel.DataFrame; import edu.stanford.futuredata.macrobase.datamodel.Schema; import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameWriter; @@ -50,7 +47,10 @@ public class CubePipeline implements Pipeline { private boolean includeLo; private Optional meanColumn; private Optional stdColumn; + private Optional minColumn; + private Optional maxColumn; private LinkedHashMap quantileColumns; + private List momentColumns; // Explanation private List attributes; @@ -88,7 +88,10 @@ public CubePipeline(PipelineConfig conf) { includeLo = conf.get("includeLo", true); meanColumn = Optional.ofNullable(conf.get("meanColumn")); stdColumn = Optional.ofNullable(conf.get("stdColumn")); + minColumn = Optional.ofNullable(conf.get("minColumn")); + maxColumn = Optional.ofNullable(conf.get("maxColumn")); quantileColumns = conf.get("quantileColumns", new LinkedHashMap()); + momentColumns = conf.get("momentColumns", new ArrayList()); attributes = conf.get("attributes"); minSupport = conf.get("minSupport", 3.0); @@ -173,6 +176,18 @@ private Map getColTypes() throws MacrobaseException { } return colTypes; } + case "moment": { + for (String col : momentColumns) { + colTypes.put(col, Schema.ColType.DOUBLE); + } + colTypes.put(minColumn + .orElseThrow(() -> new MacrobaseException("min column not present in config")), + Schema.ColType.DOUBLE); + colTypes.put(maxColumn + .orElseThrow(() -> new MacrobaseException("max column not present in config")), + Schema.ColType.DOUBLE); + return colTypes; + } case "raw": { colTypes.put(meanColumn.orElseThrow( () -> new MacrobaseException("mean column not present in config")), @@ -216,6 +231,12 @@ private CubeClassifier getClassifier() throws MacrobaseException { () -> new MacrobaseException("metric column not present in config")), predicateStr, cutoff); } + case "moment": { + return new RawClassifier( + countColumn, + null + ); + } case "meanshift": case "raw": { @@ -244,6 +265,19 @@ private APLSummarizer getSummarizer(CubeClassifier classifier) throws Exception summarizer.setMinStdDev(minRatioMetric); return summarizer; } +// case "moment": { +// APLMomentSummarizer summarizer = new APLMomentSummarizer(); +// summarizer.setMinColumn(minColumn.orElseThrow( +// () -> new MacrobaseException("min column not present in config"))); +// summarizer.setMaxColumn(maxColumn.orElseThrow( +// () -> new MacrobaseException("max column not present in config"))); +// summarizer.setMomentColumns(momentColumns); +// summarizer.setAttributes(attributes); +// summarizer.setMinSupport(minSupport); +// summarizer.setMinRatioMetric(minRatioMetric); +// summarizer.setPercentile(cutoff); +// return summarizer; +// } default: { APLOutlierSummarizer summarizer = new APLOutlierSummarizer(); summarizer.setOutlierColumn(classifier.getOutputColumnName()); diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java index 37d9c2a6d..1e0bab036 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLOutlierSummarizer.java @@ -9,6 +9,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; /** @@ -18,6 +19,7 @@ public class APLOutlierSummarizer extends APLSummarizer { private Logger log = LoggerFactory.getLogger("APLOutlierSummarizer"); private String countColumn = null; + private boolean onlyUseSupport = false; @Override public List getAggregateNames() { @@ -42,15 +44,21 @@ public List getQualityMetricList() { qualityMetricList.add( new SupportMetric(0) ); - qualityMetricList.add( - new GlobalRatioMetric(0, 1) - ); + if (!onlyUseSupport) { + qualityMetricList.add( + new GlobalRatioMetric(0, 1) + ); + } return qualityMetricList; } @Override public List getThresholds() { - return Arrays.asList(minOutlierSupport, minRatioMetric); + if (onlyUseSupport) { + return Collections.singletonList(minOutlierSupport); + } else { + return Arrays.asList(minOutlierSupport, minRatioMetric); + } } @Override @@ -72,4 +80,5 @@ public void setCountColumn(String countColumn) { public double getMinRatioMetric() { return minRatioMetric; } + public void onlyUseSupport(boolean onlyUseSupport) { this.onlyUseSupport = onlyUseSupport; } } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLSummarizer.java index 58759b17c..833fcf1ba 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLSummarizer.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLSummarizer.java @@ -9,7 +9,9 @@ import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.Map; /** * Generic summarizer superclass that can be customized with @@ -18,11 +20,12 @@ */ public abstract class APLSummarizer extends BatchSummarizer { Logger log = LoggerFactory.getLogger("APLSummarizer"); - AttributeEncoder encoder; - APLExplanation explanation; - APrioriLinear aplKernel; - List qualityMetricList; - List thresholds; + protected AttributeEncoder encoder; + protected APLExplanation explanation; + protected APrioriLinear aplKernel; + protected boolean doContainment = true; + public List qualityMetricList; + protected List thresholds; protected long numEvents = 0; protected long numOutliers = 0; @@ -66,10 +69,12 @@ public void process(DataFrame input) throws Exception { qualityMetricList, thresholds ); + aplKernel.setDoContainment(doContainment); double[][] aggregateColumns = getAggregateColumns(input); List aggregateNames = getAggregateNames(); - List aplResults = aplKernel.explain(encoded, aggregateColumns); + Map aggregationOps = getAggregationOps(); + List aplResults = aplKernel.explain(encoded, aggregateColumns, aggregationOps); numOutliers = (long)getNumberOutliers(aggregateColumns); explanation = new APLExplanation( @@ -87,4 +92,9 @@ public APLExplanation getResults() { return explanation; } + public Map getAggregationOps() { + return null; + } + + public void setDoContainment(boolean doContainment) { this.doContainment = doContainment; } } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java index 9f5851ce8..82b2617e3 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java @@ -1,5 +1,6 @@ package edu.stanford.futuredata.macrobase.analysis.summary.aplinear; +import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.AggregationOp; import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics.QualityMetric; import edu.stanford.futuredata.macrobase.analysis.summary.apriori.APrioriSummarizer; import edu.stanford.futuredata.macrobase.analysis.summary.apriori.IntSet; @@ -8,8 +9,6 @@ import org.slf4j.LoggerFactory; import java.util.*; -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; /** @@ -24,6 +23,11 @@ public class APrioriLinear { // **Parameters** private QualityMetric[] qualityMetrics; private double[] thresholds; + private boolean doContainment = true; + + public long mergeTime = 0; + public long queryTime = 0; + private long start; // **Cached values** @@ -50,6 +54,14 @@ public APrioriLinear( public List explain( final List attributes, double[][] aggregateColumns + ) { + return explain(attributes, aggregateColumns, null); + } + + public List explain( + final List attributes, + double[][] aggregateColumns, + AggregationOp[] aggregationOps ) { final int numAggregates = aggregateColumns.length; final int numRows = aggregateColumns[0].length; @@ -57,16 +69,21 @@ public List explain( // Quality metrics are initialized with global aggregates to // allow them to determine the appropriate relative thresholds double[] globalAggregates = new double[numAggregates]; + start = System.nanoTime(); for (int j = 0; j < numAggregates; j++) { - globalAggregates[j] = 0; + AggregationOp curOp = aggregationOps[j]; + globalAggregates[j] = curOp.initValue(); double[] curColumn = aggregateColumns[j]; for (int i = 0; i < numRows; i++) { - globalAggregates[j] += curColumn[i]; + globalAggregates[j] = curOp.combine(globalAggregates[j], curColumn[i]); } } + mergeTime += System.nanoTime() - start; + start = System.nanoTime(); for (QualityMetric q : qualityMetrics) { q.initialize(globalAggregates); } + queryTime += System.nanoTime() - start; // Row store for more convenient access final double[][] aRows = new double[numRows][numAggregates]; @@ -89,6 +106,7 @@ public List explain( threadSetAggregates.add(new HashMap<>()); } final CountDownLatch doneSignal = new CountDownLatch(numThreads); + start = System.nanoTime(); for (int threadNum = 0; threadNum < numThreads; threadNum++) { final int startIndex = (numRows * threadNum) / numThreads; final int endIndex = (numRows * (threadNum + 1)) / numThreads; @@ -108,10 +126,20 @@ public List explain( double[] candidateVal = thisThreadSetAggregates.get(curCandidate); if (candidateVal == null) { thisThreadSetAggregates.put(curCandidate, Arrays.copyOf(aRows[i], numAggregates)); - } else { + } else if (aggregationOps == null) { for (int a = 0; a < numAggregates; a++) { candidateVal[a] += aRows[i][a]; } + } else { + for (int a : aggregationOps.getOrDefault("add", new int[0])) { + candidateVal[a] += aRows[i][a]; + } + for (int a : aggregationOps.getOrDefault("min", new int[0])) { + candidateVal[a] = Math.min(candidateVal[a], aRows[i][a]); + } + for (int a : aggregationOps.getOrDefault("max", new int[0])) { + candidateVal[a] = Math.max(candidateVal[a], aRows[i][a]); + } } } } @@ -134,38 +162,49 @@ public List explain( double[] candidateVal = setAggregates.get(curCandidateKey); if (candidateVal == null) { setAggregates.put(curCandidateKey, Arrays.copyOf(curCandidateValue, numAggregates)); - } else { + } else if (aggregationOps == null) { for (int a = 0; a < numAggregates; a++) { candidateVal[a] += curCandidateValue[a]; } + } else { + for (int a : aggregationOps.getOrDefault("add", new int[0])) { + candidateVal[a] += curCandidateValue[a]; + } + for (int a : aggregationOps.getOrDefault("min", new int[0])) { + candidateVal[a] = Math.min(candidateVal[a], curCandidateValue[a]); + } + for (int a : aggregationOps.getOrDefault("max", new int[0])) { + candidateVal[a] = Math.max(candidateVal[a], curCandidateValue[a]); + } } } } + mergeTime += System.nanoTime() - start; HashSet curOrderNext = new HashSet<>(); HashSet curOrderSaved = new HashSet<>(); int pruned = 0; for (IntSet curCandidate: setAggregates.keySet()) { double[] curAggregates = setAggregates.get(curCandidate); - boolean canPassThreshold = true; - boolean isPastThreshold = true; + QualityMetric.Action action = QualityMetric.Action.KEEP; + start = System.nanoTime(); for (int i = 0; i < qualityMetrics.length; i++) { QualityMetric q = qualityMetrics[i]; double t = thresholds[i]; - canPassThreshold &= q.maxSubgroupValue(curAggregates) >= t; - isPastThreshold &= q.value(curAggregates) >= t; + action = QualityMetric.Action.combine(action, q.getAction(curAggregates, t)); } - if (canPassThreshold) { + queryTime += System.nanoTime() - start; + if (action == QualityMetric.Action.KEEP) { // if a set is already past the threshold on all metrics, - // save it and no need for further exploration - if (isPastThreshold) { - curOrderSaved.add(curCandidate); - } - else { - // otherwise if a set still has potentially good subsets, - // save it for further examination + // save it and no need for further exploration if we do containment + curOrderSaved.add(curCandidate); + if (!doContainment) { curOrderNext.add(curCandidate); } + } else if (action == QualityMetric.Action.NEXT) { + // otherwise if a set still has potentially good subsets, + // save it for further examination + curOrderNext.add(curCandidate); } else { pruned++; } @@ -269,4 +308,6 @@ private ArrayList getCandidates( } return candidates; } + + public void setDoContainment(boolean doContainment) { this.doContainment = doContainment; } } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/metrics/AggregationOp.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/metrics/AggregationOp.java new file mode 100644 index 000000000..85c9116ce --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/metrics/AggregationOp.java @@ -0,0 +1,41 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.aplinear.metrics; + +import edu.stanford.futuredata.macrobase.util.MacrobaseInternalError; + +public enum AggregationOp { + SUM, MIN, MAX; + + public double combine(double a, double b) { + switch(this) { + case SUM: { + return a+b; + } + case MIN: { + return a < b ? a : b; + } + case MAX: { + return a > b ? a : b; + } + default: { + throw new MacrobaseInternalError("Invalid Aggregation Op"); + } + } + } + + public double initValue() { + switch(this) { + case SUM: { + return 0; + } + case MIN: { + return Double.MAX_VALUE; + } + case MAX: { + return -Double.MAX_VALUE; + } + default: { + throw new MacrobaseInternalError("Invalid Aggregation Op"); + } + } + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/metrics/QualityMetric.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/metrics/QualityMetric.java index 4753208e7..a66a366e4 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/metrics/QualityMetric.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/metrics/QualityMetric.java @@ -10,6 +10,26 @@ public interface QualityMetric { double value(double[] aggregates); boolean isMonotonic(); + enum Action { + KEEP(2), + NEXT(1), + PRUNE(0); + + private int val; + + Action(int val) { + this.val = val; + } + + public static Action combine(Action a, Action b) { + if (a.val <= b.val) { + return a; + } else { + return b; + } + } + } + // can override for more fancy tight quality metric bounds default double maxSubgroupValue(double[] aggregates) { if (isMonotonic()) { @@ -18,4 +38,22 @@ default double maxSubgroupValue(double[] aggregates) { return Double.POSITIVE_INFINITY; } } + + default Action getAction(double[] aggregates, double threshold) { + if (isPastThreshold(aggregates, threshold)) { + return Action.KEEP; + } else if (canPassThreshold(aggregates, threshold)) { + return Action.NEXT; + } else { + return Action.PRUNE; + } + } + + default boolean isPastThreshold(double[] aggregates, double threshold) { + return value(aggregates) >= threshold; + } + + default boolean canPassThreshold(double[] aggregates, double threshold) { + return maxSubgroupValue(aggregates) >= threshold; + } } diff --git a/lib/src/main/resources/log4j.properties b/lib/src/main/resources/log4j.properties new file mode 100644 index 000000000..dc217f5e4 --- /dev/null +++ b/lib/src/main/resources/log4j.properties @@ -0,0 +1,8 @@ +log4j.rootLogger=INFO, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n \ No newline at end of file