|
94 | 94 | "from tsfeatures.utils import *"
|
95 | 95 | ]
|
96 | 96 | },
|
97 | - {
98 | - "cell_type": "code",
99 | - "execution_count": null,
100 | - "metadata": {},
101 | - "outputs": [],
102 | - "source": [
103 | - "# |export\n",
104 | - "\n",
105 | - "FREQS = {\"H\": 24, \"D\": 1, \"M\": 12, \"Q\": 4, \"W\": 1, \"Y\": 1}"
106 | - ]
107 | - },
108 | 97 | {
|
109 | 98 | "cell_type": "code",
|
110 | 99 | "execution_count": null,
|
|
153 | 142 | " else:\n",
|
154 | 143 | " acfdiff2x = [np.nan] * 2\n",
|
155 | 144 | " # first autocorrelation coefficient\n",
|
156 | - "    acf_1 = acfx[1]\n",
| 145 | + "\n", |
| 146 | + " try:\n", |
| 147 | + " acf_1 = acfx[1]\n", |
| 148 | + " except:\n", |
| 149 | + " acf_1 = np.nan\n", |
| 150 | + "\n", |
157 | 151 | " # sum of squares of first 10 autocorrelation coefficients\n",
|
158 | 152 | " sum_of_sq_acf10 = np.sum((acfx[1:11]) ** 2) if size_x > 10 else np.nan\n",
|
159 | 153 | "    # first autocorrelation coefficient of differenced series\n",
|
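For reference, the two quantities taken from the autocorrelation array in this block, with the short-series guard made explicit. A minimal sketch with made-up acf values (lag 0 included):

```python
import numpy as np

# Illustrative autocorrelation values, lag 0 included (made-up numbers).
acfx = np.array([1.0, 0.62, 0.41, 0.28, 0.17, 0.09, 0.03, -0.02, -0.05, -0.07, -0.08])

acf_1 = acfx[1] if acfx.size > 1 else np.nan   # lag-1 autocorrelation
sum_of_sq_acf10 = np.sum(acfx[1:11] ** 2)      # sum of squared acf, lags 1 to 10
print(acf_1, sum_of_sq_acf10)
```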
|
255 | 249 | " if len(x) <= lags + 1:\n",
|
256 | 250 | " return {\"arch_lm\": np.nan}\n",
|
257 | 251 | " if demean:\n",
|
258 | - "        x -= np.mean(x)\n",
| 252 | + " x = x - np.mean(x)\n", |
259 | 253 | "\n",
|
260 | 254 | " size_x = len(x)\n",
|
261 | 255 | " mat = embed(x**2, lags + 1)\n",
|
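The switch from in-place (`x -= np.mean(x)`) to out-of-place (`x = x - np.mean(x)`) demeaning presumably avoids mutating the caller's array and the casting error NumPy raises when subtracting a float mean from an integer array in place. A minimal sketch of the difference:

```python
import numpy as np

x_int = np.array([1, 2, 3, 4])

# In-place demeaning would mutate the caller's array and, for integer input,
# raise a casting error because np.mean returns a float:
#     x_int -= np.mean(x_int)
# Out-of-place demeaning upcasts to float and leaves the input untouched.
demeaned = x_int - np.mean(x_int)
print(x_int)     # [1 2 3 4]  (unchanged)
print(demeaned)  # [-1.5 -0.5  0.5  1.5]
```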
|
431 | 425 | " except:\n",
|
432 | 426 | " return {\"flat_spots\": np.nan}\n",
|
433 | 427 | "\n",
|
434 | - "    rlex = np.array([sum(1 for i in g) for k, g in groupby(cutx)]).max()"
| 428 | + "    rlex = np.array([sum(1 for i in g) for k, g in groupby(cutx)]).max()\n", |
| 429 | + "    return {\"flat_spots\": rlex}\n", |
| 430 | + "\n" |
435 | 431 | ]
|
436 | 432 | },
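The run-length computation now returned by `flat_spots` can be illustrated on a toy binned series. A minimal sketch (values are made up):

```python
import numpy as np
from itertools import groupby

# Toy "binned" series: groupby collapses runs of equal values,
# and the longest run is the flat_spots feature.
cutx = np.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 1])
run_lengths = [sum(1 for _ in g) for _, g in groupby(cutx)]
print(run_lengths)       # [3, 2, 4, 1]
print(max(run_lengths))  # 4
```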
|
437 | 433 | {
|
|
1103 | 1099 | " time_x = add_constant(poly_m)\n",
|
1104 | 1100 | " coefs = OLS(trend0, time_x).fit().params\n",
|
1105 | 1101 | "\n",
|
1106 | - "    linearity = coefs[1]\n",
1107 | - "    curvature = -coefs[2]\n",
| 1102 | + "\n", |
| 1103 | + " try:\n", |
| 1104 | + " linearity = coefs[1]\n", |
| 1105 | + " except:\n", |
| 1106 | + " linearity = np.nan\n", |
| 1107 | + " try:\n", |
| 1108 | + " curvature = -coefs[2]\n", |
| 1109 | + " except:\n", |
| 1110 | + " curvature = np.nan\n", |
1108 | 1111 | " # ACF features\n",
|
1109 | 1112 | " acfremainder = acf_features(remainder, m)\n",
|
1110 | 1113 | " # Assemble features\n",
|
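The `linearity` and `curvature` features are the first- and second-order coefficients of the regression of the trend component on a polynomial design (the cell appears to build this via `poly_m` and `OLS`); the new try/except falls back to NaN when the fit cannot produce those coefficients. A rough analogue using a plain quadratic fit, only to illustrate the idea and not the exact basis used above:

```python
import numpy as np

# Made-up trend component, roughly quadratic in time.
trend = np.array([1.0, 2.1, 3.9, 6.2, 9.1])
t = np.arange(len(trend))

# np.polyfit returns the highest-order coefficient first.
c2, c1, c0 = np.polyfit(t, trend, deg=2)
linearity, curvature = c1, -c2   # sign convention mirrors the cell above
print(linearity, curvature)
```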
|
1194 | 1197 | " return {\"unitroot_pp\": test_pp}"
|
1195 | 1198 | ]
|
1196 | 1199 | },
|
| 1200 | + { |
| 1201 | + "cell_type": "code", |
| 1202 | + "execution_count": null, |
| 1203 | + "metadata": {}, |
| 1204 | + "outputs": [], |
| 1205 | + "source": [ |
| 1206 | + "def statistics(x: np.array, freq: int = 1) -> Dict[str, float]:\n", |
| 1207 | + " \"\"\"Computes basic statistics of x.\n", |
| 1208 | + "\n", |
| 1209 | + " Parameters\n", |
| 1210 | + " ----------\n", |
| 1211 | + " x: numpy array\n", |
| 1212 | + " The time series.\n", |
| 1213 | + " freq: int\n", |
| 1214 | + " Frequency of the time series\n", |
| 1215 | + "\n", |
| 1216 | + " Returns\n", |
| 1217 | + " -------\n", |
| 1218 | + " dict\n", |
| 1219 | + " 'total_sum': Total sum of the series.\n", |
| 1220 | + " 'mean': Mean value.\n", |
| 1221 | + "        'variance': Variance of the time series.\n", |
| 1222 | + "        'median': Median value.\n", |
| 1223 | + "        'p2point5': 2.5th percentile.\n", |
| 1224 | + "        'p5': 5th percentile.\n", |
| 1225 | + "        'p25': 25th percentile.\n", |
| 1226 | + "        'p75': 75th percentile.\n", |
| 1227 | + "        'p95': 95th percentile.\n", |
| 1228 | + "        'p97point5': 97.5th percentile.\n", |
| 1229 | + " 'max': Max value.\n", |
| 1230 | + " 'min': Min value.\n", |
| 1231 | + " \"\"\"\n", |
| 1232 | + " res = dict(\n", |
| 1233 | + " total_sum=np.sum(x),\n", |
| 1234 | + " mean=np.mean(x),\n", |
| 1235 | + " variance=np.var(x, ddof=1),\n", |
| 1236 | + " median=np.median(x),\n", |
| 1237 | + " p2point5=np.quantile(x, q=0.025),\n", |
| 1238 | + " p5=np.quantile(x, q=0.05),\n", |
| 1239 | + " p25=np.quantile(x, q=0.25),\n", |
| 1240 | + " p75=np.quantile(x, q=0.75),\n", |
| 1241 | + " p95=np.quantile(x, q=0.95),\n", |
| 1242 | + " p97point5=np.quantile(x, q=0.975),\n", |
| 1243 | + " max=np.max(x),\n", |
| 1244 | + " min=np.min(x),\n", |
| 1245 | + " )\n", |
| 1246 | + "\n", |
| 1247 | + " return res" |
| 1248 | + ] |
| 1249 | + }, |
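A hypothetical usage sketch of the new `statistics` function, assuming it is available in the current namespace as defined in the cell above (the random input is illustrative):

```python
import numpy as np

rng = np.random.default_rng(42)
x = rng.normal(size=200)

stats = statistics(x)  # statistics as defined in the cell above
print(stats["mean"], stats["variance"], stats["p97point5"])
```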
1197 | 1250 | {
|
1198 | 1251 | "cell_type": "code",
|
1199 | 1252 | "execution_count": null,
|
|
1227 | 1280 | " ],\n",
|
1228 | 1281 | " dict_freqs=FREQS,\n",
|
1229 | 1282 | "):\n",
|
1230 | - "    print(\"dict_freq\")\n",
1231 | 1283 | " if freq is None:\n",
|
1232 | 1284 | " inf_freq = pd.infer_freq(ts[\"ds\"])\n",
|
1233 | 1285 | " if inf_freq is None:\n",
|
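When `freq` is not supplied, the frequency string is inferred from the `ds` column with `pd.infer_freq` and then looked up in `dict_freqs`. A quick illustration of what that pandas call returns (the date range is made up):

```python
import pandas as pd

# pd.infer_freq guesses the frequency alias from the timestamps.
ds = pd.date_range("2021-01-01", periods=10, freq="D")
print(pd.infer_freq(ds))  # 'D'
```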
|
1334 | 1386 | " return ts_features"
|
1335 | 1387 | ]
|
1336 | 1388 | },
|
| 1389 | + { |
| 1390 | + "cell_type": "code", |
| 1391 | + "execution_count": null, |
| 1392 | + "metadata": {}, |
| 1393 | + "outputs": [], |
| 1394 | + "source": [ |
| 1395 | + "def _get_feats_wide(index,\n", |
| 1396 | + " ts,\n", |
| 1397 | + " scale = True,\n", |
| 1398 | + " features = [acf_features, arch_stat, crossing_points,\n", |
| 1399 | + " entropy, flat_spots, heterogeneity, holt_parameters,\n", |
| 1400 | + " lumpiness, nonlinearity, pacf_features, stl_features,\n", |
| 1401 | + " stability, hw_parameters, unitroot_kpss, unitroot_pp,\n", |
| 1402 | + " series_length, hurst]):\n", |
| 1403 | + " seasonality = ts['seasonality'].item()\n", |
| 1404 | + " y = ts['y'].item()\n", |
| 1405 | + " y = np.array(y)\n", |
| 1406 | + "\n", |
| 1407 | + " if scale:\n", |
| 1408 | + " y = scalets(y)\n", |
| 1409 | + "\n", |
| 1410 | + " c_map = ChainMap(*[dict_feat for dict_feat in [func(y, seasonality) for func in features]])\n", |
| 1411 | + "\n", |
| 1412 | + " return pd.DataFrame(dict(c_map), index = [index])\n" |
| 1413 | + ] |
| 1414 | + }, |
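`_get_feats_wide` merges the per-feature dictionaries with `collections.ChainMap` before building the one-row DataFrame. A minimal sketch of that merge, with illustrative feature names and values:

```python
from collections import ChainMap

# Each feature function returns a small dict; ChainMap lets them be read as
# one mapping (the first dict wins on duplicate keys) and dict() flattens it.
feature_dicts = [{"x_acf1": 0.61}, {"arch_lm": 0.22}, {"entropy": 0.83}]
merged = dict(ChainMap(*feature_dicts))
print(merged)  # one flat dict holding all three keys
```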
| 1415 | + { |
| 1416 | + "cell_type": "code", |
| 1417 | + "execution_count": null, |
| 1418 | + "metadata": {}, |
| 1419 | + "outputs": [], |
| 1420 | + "source": [ |
| 1421 | + "def tsfeatures_wide(ts: pd.DataFrame,\n", |
| 1422 | + " features: List[Callable] = [acf_features, arch_stat, crossing_points,\n", |
| 1423 | + " entropy, flat_spots, heterogeneity,\n", |
| 1424 | + " holt_parameters, lumpiness, nonlinearity,\n", |
| 1425 | + " pacf_features, stl_features, stability,\n", |
| 1426 | + " hw_parameters, unitroot_kpss, unitroot_pp,\n", |
| 1427 | + " series_length, hurst],\n", |
| 1428 | + " scale: bool = True,\n", |
| 1429 | + " threads: Optional[int] = None) -> pd.DataFrame:\n", |
| 1430 | + " \"\"\"Calculates features for time series.\n", |
| 1431 | + "\n", |
| 1432 | + " Parameters\n", |
| 1433 | + " ----------\n", |
| 1434 | + " ts: pandas df\n", |
| 1435 | + " Pandas DataFrame with columns ['unique_id', 'seasonality', 'y'].\n", |
| 1436 | + " Wide panel of time series.\n", |
| 1437 | + " features: iterable\n", |
| 1438 | + "        Iterable of feature functions.\n", |
| 1439 | + "    scale: bool\n", |
| 1440 | + "        Whether to (mean-std) scale the data.\n", |
| 1441 | + "    threads: int\n", |
| 1442 | + "        Number of threads to use. Use None (default) to use all available cores.\n", |
| 1443 | + "\n", |
| 1444 | + " Returns\n", |
| 1445 | + " -------\n", |
| 1446 | + " pandas df\n", |
| 1447 | + " Pandas DataFrame where each column is a feature and each row\n", |
| 1448 | + " a time series.\n", |
| 1449 | + " \"\"\"\n", |
| 1450 | + " partial_get_feats = partial(_get_feats_wide, scale=scale,\n", |
| 1451 | + " features=features)\n", |
| 1452 | + "\n", |
| 1453 | + " with Pool(threads) as pool:\n", |
| 1454 | + " ts_features = pool.starmap(partial_get_feats, ts.groupby('unique_id'))\n", |
| 1455 | + "\n", |
| 1456 | + " ts_features = pd.concat(ts_features).rename_axis('unique_id')\n", |
| 1457 | + " ts_features = ts_features.reset_index()\n", |
| 1458 | + "\n", |
| 1459 | + " return ts_features" |
| 1460 | + ] |
| 1461 | + }, |
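A hypothetical usage sketch of `tsfeatures_wide`. The panel layout is inferred from `_get_feats_wide`, which calls `.item()` on the grouped columns, so each row is assumed to hold one full series in `y`; the series names and random data below are illustrative, and `acf_features`/`stl_features` come from earlier cells:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Hypothetical wide panel: one row per series, the full series stored in 'y'.
panel = pd.DataFrame({
    "unique_id": ["series_1", "series_2"],
    "seasonality": [12, 12],
    "y": [rng.normal(size=48).tolist(), rng.normal(size=60).tolist()],
})

feats = tsfeatures_wide(panel, features=[acf_features, stl_features])
print(feats.head())
```

Since the function fans work out over a `multiprocessing.Pool`, running it from a plain script (rather than a notebook) may require the usual `if __name__ == "__main__":` guard on some platforms.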
1337 | 1462 | {
|
1338 | 1463 | "cell_type": "code",
|
1339 | 1464 | "execution_count": null,
|
|