|
94 | 94 | "from tsfeatures.utils import *"
|
95 | 95 | ]
|
96 | 96 | },
|
97 | - {
98 | - "cell_type": "code",
99 | - "execution_count": null,
100 | - "metadata": {},
101 | - "outputs": [],
102 | - "source": [
103 | - "# |export\n",
104 | - "\n",
105 | - "FREQS = {\"H\": 24, \"D\": 1, \"M\": 12, \"Q\": 4, \"W\": 1, \"Y\": 1}"
106 | - ]
107 | - },
108 | 97 | {
|
109 | 98 | "cell_type": "code",
|
110 | 99 | "execution_count": null,
|
|
153 | 142 | " else:\n",
|
154 | 143 | " acfdiff2x = [np.nan] * 2\n",
|
155 | 144 | " # first autocorrelation coefficient\n",
|
156 | - "    acf_1 = acfx[1]\n",
| 145 | + "\n", |
| 146 | + " try:\n", |
| 147 | + " acf_1 = acfx[1]\n", |
| 148 | + " except:\n", |
| 149 | + " acf_1 = np.nan\n", |
| 150 | + "\n", |
157 | 151 | " # sum of squares of first 10 autocorrelation coefficients\n",
|
158 | 152 | " sum_of_sq_acf10 = np.sum((acfx[1:11]) ** 2) if size_x > 10 else np.nan\n",
|
159 | 153 | "    # first autocorrelation coefficient of differenced series\n",
|
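For reference, the two quantities taken from the autocorrelation array in this block, with the short-series guard made explicit. A minimal sketch with made-up acf values (lag 0 included):

```python
import numpy as np

# Illustrative autocorrelation values, lag 0 included (made-up numbers).
acfx = np.array([1.0, 0.62, 0.41, 0.28, 0.17, 0.09, 0.03, -0.02, -0.05, -0.07, -0.08])

acf_1 = acfx[1] if acfx.size > 1 else np.nan   # lag-1 autocorrelation
sum_of_sq_acf10 = np.sum(acfx[1:11] ** 2)      # sum of squared acf, lags 1 to 10
print(acf_1, sum_of_sq_acf10)
```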
|
255 | 249 | " if len(x) <= lags + 1:\n",
|
256 | 250 | " return {\"arch_lm\": np.nan}\n",
|
257 | 251 | " if demean:\n",
|
258 | - "        x -= np.mean(x)\n",
| 252 | + " x = x - np.mean(x)\n", |
259 | 253 | "\n",
|
260 | 254 | " size_x = len(x)\n",
|
261 | 255 | " mat = embed(x**2, lags + 1)\n",
|
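The switch from in-place (`x -= np.mean(x)`) to out-of-place (`x = x - np.mean(x)`) demeaning presumably avoids mutating the caller's array and the casting error NumPy raises when subtracting a float mean from an integer array in place. A minimal sketch of the difference:

```python
import numpy as np

x_int = np.array([1, 2, 3, 4])

# In-place demeaning would mutate the caller's array and, for integer input,
# raise a casting error because np.mean returns a float:
#     x_int -= np.mean(x_int)
# Out-of-place demeaning upcasts to float and leaves the input untouched.
demeaned = x_int - np.mean(x_int)
print(x_int)     # [1 2 3 4]  (unchanged)
print(demeaned)  # [-1.5 -0.5  0.5  1.5]
```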
|
431 | 425 | " except:\n",
|
432 | 426 | " return {\"flat_spots\": np.nan}\n",
|
433 | 427 | "\n",
|
434 | - "    rlex = np.array([sum(1 for i in g) for k, g in groupby(cutx)]).max()"
| 428 | + "    rlex = np.array([sum(1 for i in g) for k, g in groupby(cutx)]).max()\n", |
| 429 | + "    return {\"flat_spots\": rlex}\n", |
| 430 | + "\n" |
435 | 431 | ]
|
436 | 432 | },
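The run-length computation now returned by `flat_spots` can be illustrated on a toy binned series. A minimal sketch (values are made up):

```python
import numpy as np
from itertools import groupby

# Toy "binned" series: groupby collapses runs of equal values,
# and the longest run is the flat_spots feature.
cutx = np.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 1])
run_lengths = [sum(1 for _ in g) for _, g in groupby(cutx)]
print(run_lengths)       # [3, 2, 4, 1]
print(max(run_lengths))  # 4
```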
|
437 | 433 | {
|
|
1103 | 1099 | " time_x = add_constant(poly_m)\n",
|
1104 | 1100 | " coefs = OLS(trend0, time_x).fit().params\n",
|
1105 | 1101 | "\n",
|
1106 | - "    linearity = coefs[1]\n",
1107 | - "    curvature = -coefs[2]\n",
| 1102 | + "\n", |
| 1103 | + " try:\n", |
| 1104 | + " linearity = coefs[1]\n", |
| 1105 | + " except:\n", |
| 1106 | + " linearity = np.nan\n", |
| 1107 | + " try:\n", |
| 1108 | + " curvature = -coefs[2]\n", |
| 1109 | + " except:\n", |
| 1110 | + " curvature = np.nan\n", |
1108 | 1111 | " # ACF features\n",
|
1109 | 1112 | " acfremainder = acf_features(remainder, m)\n",
|
1110 | 1113 | " # Assemble features\n",
|
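The `linearity` and `curvature` features are the first- and second-order coefficients of the regression of the trend component on a polynomial design (the cell appears to build this via `poly_m` and `OLS`); the new try/except falls back to NaN when the fit cannot produce those coefficients. A rough analogue using a plain quadratic fit, only to illustrate the idea and not the exact basis used above:

```python
import numpy as np

# Made-up trend component, roughly quadratic in time.
trend = np.array([1.0, 2.1, 3.9, 6.2, 9.1])
t = np.arange(len(trend))

# np.polyfit returns the highest-order coefficient first.
c2, c1, c0 = np.polyfit(t, trend, deg=2)
linearity, curvature = c1, -c2   # sign convention mirrors the cell above
print(linearity, curvature)
```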
|
1194 | 1197 | " return {\"unitroot_pp\": test_pp}"
|
1195 | 1198 | ]
|
1196 | 1199 | },
|
| 1200 | + { |
| 1201 | + "cell_type": "code", |
| 1202 | + "execution_count": null, |
| 1203 | + "metadata": {}, |
| 1204 | + "outputs": [], |
| 1205 | + "source": [ |
| 1206 | + "def statistics(x: np.array, freq: int = 1) -> Dict[str, float]:\n", |
| 1207 | + " \"\"\"Computes basic statistics of x.\n", |
| 1208 | + "\n", |
| 1209 | + " Parameters\n", |
| 1210 | + " ----------\n", |
| 1211 | + " x: numpy array\n", |
| 1212 | + " The time series.\n", |
| 1213 | + " freq: int\n", |
| 1214 | + " Frequency of the time series\n", |
| 1215 | + "\n", |
| 1216 | + " Returns\n", |
| 1217 | + " -------\n", |
| 1218 | + " dict\n", |
| 1219 | + " 'total_sum': Total sum of the series.\n", |
| 1220 | + " 'mean': Mean value.\n", |
| 1221 | + "        'variance': Variance of the time series.\n", |
| 1222 | + "        'median': Median value.\n", |
| 1223 | + "        'p2point5': 2.5th percentile.\n", |
| 1224 | + "        'p5': 5th percentile.\n", |
| 1225 | + "        'p25': 25th percentile.\n", |
| 1226 | + "        'p75': 75th percentile.\n", |
| 1227 | + "        'p95': 95th percentile.\n", |
| 1228 | + "        'p97point5': 97.5th percentile.\n", |
| 1229 | + " 'max': Max value.\n", |
| 1230 | + " 'min': Min value.\n", |
| 1231 | + " \"\"\"\n", |
| 1232 | + " res = dict(\n", |
| 1233 | + " total_sum=np.sum(x),\n", |
| 1234 | + " mean=np.mean(x),\n", |
| 1235 | + " variance=np.var(x, ddof=1),\n", |
| 1236 | + " median=np.median(x),\n", |
| 1237 | + " p2point5=np.quantile(x, q=0.025),\n", |
| 1238 | + " p5=np.quantile(x, q=0.05),\n", |
| 1239 | + " p25=np.quantile(x, q=0.25),\n", |
| 1240 | + " p75=np.quantile(x, q=0.75),\n", |
| 1241 | + " p95=np.quantile(x, q=0.95),\n", |
| 1242 | + " p97point5=np.quantile(x, q=0.975),\n", |
| 1243 | + " max=np.max(x),\n", |
| 1244 | + " min=np.min(x),\n", |
| 1245 | + " )\n", |
| 1246 | + "\n", |
| 1247 | + " return res" |
| 1248 | + ] |
| 1249 | + }, |
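A hypothetical usage sketch of the new `statistics` function, assuming it is available in the current namespace as defined in the cell above (the random input is illustrative):

```python
import numpy as np

rng = np.random.default_rng(42)
x = rng.normal(size=200)

stats = statistics(x)  # statistics as defined in the cell above
print(stats["mean"], stats["variance"], stats["p97point5"])
```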
1197 | 1250 | {
|
1198 | 1251 | "cell_type": "code",
|
1199 | 1252 | "execution_count": null,
|
|
1227 | 1280 | " ],\n",
|
1228 | 1281 | " dict_freqs=FREQS,\n",
|
1229 | 1282 | "):\n",
|
1230 | - "    print(\"dict_freq\")\n",
1231 | 1283 | " if freq is None:\n",
|
1232 | 1284 | " inf_freq = pd.infer_freq(ts[\"ds\"])\n",
|
1233 | 1285 | " if inf_freq is None:\n",
|
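When `freq` is not supplied, the frequency string is inferred from the `ds` column with `pd.infer_freq` and then looked up in `dict_freqs`. A quick illustration of what that pandas call returns (the date range is made up):

```python
import pandas as pd

# pd.infer_freq guesses the frequency alias from the timestamps.
ds = pd.date_range("2021-01-01", periods=10, freq="D")
print(pd.infer_freq(ds))  # 'D'
```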
|
1334 | 1386 | " return ts_features"
|
1335 | 1387 | ]
|
1336 | 1388 | },
|
| 1389 | + { |
| 1390 | + "cell_type": "code", |
| 1391 | + "execution_count": null, |
| 1392 | + "metadata": {}, |
| 1393 | + "outputs": [], |
| 1394 | + "source": [ |
| 1395 | + "def _get_feats_wide(index,\n", |
| 1396 | + " ts,\n", |
| 1397 | + " scale = True,\n", |
| 1398 | + " features = [acf_features, arch_stat, crossing_points,\n", |
| 1399 | + " entropy, flat_spots, heterogeneity, holt_parameters,\n", |
| 1400 | + " lumpiness, nonlinearity, pacf_features, stl_features,\n", |
| 1401 | + " stability, hw_parameters, unitroot_kpss, unitroot_pp,\n", |
| 1402 | + " series_length, hurst]):\n", |
| 1403 | + " seasonality = ts['seasonality'].item()\n", |
| 1404 | + " y = ts['y'].item()\n", |
| 1405 | + " y = np.array(y)\n", |
| 1406 | + "\n", |
| 1407 | + " if scale:\n", |
| 1408 | + " y = scalets(y)\n", |
| 1409 | + "\n", |
| 1410 | + " c_map = ChainMap(*[dict_feat for dict_feat in [func(y, seasonality) for func in features]])\n", |
| 1411 | + "\n", |
| 1412 | + " return pd.DataFrame(dict(c_map), index = [index])\n" |
| 1413 | + ] |
| 1414 | + }, |
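`_get_feats_wide` merges the per-feature dictionaries with `collections.ChainMap` before building the one-row DataFrame. A minimal sketch of that merge, with illustrative feature names and values:

```python
from collections import ChainMap

# Each feature function returns a small dict; ChainMap lets them be read as
# one mapping (the first dict wins on duplicate keys) and dict() flattens it.
feature_dicts = [{"x_acf1": 0.61}, {"arch_lm": 0.22}, {"entropy": 0.83}]
merged = dict(ChainMap(*feature_dicts))
print(merged)  # one flat dict holding all three keys
```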
| 1415 | + { |
| 1416 | + "cell_type": "code", |
| 1417 | + "execution_count": null, |
| 1418 | + "metadata": {}, |
| 1419 | + "outputs": [], |
| 1420 | + "source": [ |
| 1421 | + "def tsfeatures_wide(ts: pd.DataFrame,\n", |
| 1422 | + " features: List[Callable] = [acf_features, arch_stat, crossing_points,\n", |
| 1423 | + " entropy, flat_spots, heterogeneity,\n", |
| 1424 | + " holt_parameters, lumpiness, nonlinearity,\n", |
| 1425 | + " pacf_features, stl_features, stability,\n", |
| 1426 | + " hw_parameters, unitroot_kpss, unitroot_pp,\n", |
| 1427 | + " series_length, hurst],\n", |
| 1428 | + " scale: bool = True,\n", |
| 1429 | + " threads: Optional[int] = None) -> pd.DataFrame:\n", |
| 1430 | + " \"\"\"Calculates features for time series.\n", |
| 1431 | + "\n", |
| 1432 | + " Parameters\n", |
| 1433 | + " ----------\n", |
| 1434 | + " ts: pandas df\n", |
| 1435 | + " Pandas DataFrame with columns ['unique_id', 'seasonality', 'y'].\n", |
| 1436 | + " Wide panel of time series.\n", |
| 1437 | + " features: iterable\n", |
| 1438 | + "        Iterable of feature functions.\n", |
| 1439 | + "    scale: bool\n", |
| 1440 | + "        Whether to (mean-std) scale the data.\n", |
| 1441 | + "    threads: int\n", |
| 1442 | + "        Number of threads to use. Use None (default) to use all available cores.\n", |
| 1443 | + "\n", |
| 1444 | + " Returns\n", |
| 1445 | + " -------\n", |
| 1446 | + " pandas df\n", |
| 1447 | + " Pandas DataFrame where each column is a feature and each row\n", |
| 1448 | + " a time series.\n", |
| 1449 | + " \"\"\"\n", |
| 1450 | + " partial_get_feats = partial(_get_feats_wide, scale=scale,\n", |
| 1451 | + " features=features)\n", |
| 1452 | + "\n", |
| 1453 | + " with Pool(threads) as pool:\n", |
| 1454 | + " ts_features = pool.starmap(partial_get_feats, ts.groupby('unique_id'))\n", |
| 1455 | + "\n", |
| 1456 | + " ts_features = pd.concat(ts_features).rename_axis('unique_id')\n", |
| 1457 | + " ts_features = ts_features.reset_index()\n", |
| 1458 | + "\n", |
| 1459 | + " return ts_features" |
| 1460 | + ] |
| 1461 | + }, |
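A hypothetical usage sketch of `tsfeatures_wide`. The panel layout is inferred from `_get_feats_wide`, which calls `.item()` on the grouped columns, so each row is assumed to hold one full series in `y`; the series names and random data below are illustrative, and `acf_features`/`stl_features` come from earlier cells:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Hypothetical wide panel: one row per series, the full series stored in 'y'.
panel = pd.DataFrame({
    "unique_id": ["series_1", "series_2"],
    "seasonality": [12, 12],
    "y": [rng.normal(size=48).tolist(), rng.normal(size=60).tolist()],
})

feats = tsfeatures_wide(panel, features=[acf_features, stl_features])
print(feats.head())
```

Since the function fans work out over a `multiprocessing.Pool`, running it from a plain script (rather than a notebook) may require the usual `if __name__ == "__main__":` guard on some platforms.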
1337 | 1462 | {
|
1338 | 1463 | "cell_type": "code",
|
1339 | 1464 | "execution_count": null,
|
|