-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain_functions.py
164 lines (129 loc) · 5.72 KB
/
main_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
Script for Thailand GW data pre-processing and post-plotting
Thai GW data from http://tgms.dgr.go.th/
Article Title: Hybrid data-driven, physics-based modeling of ground-
water and subsidence with application to Bangkok, Thailand
Jenny Soonthornrangsan 2023
TU Delft
"""
###############################################################################
# import statements
###############################################################################
# import statements
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import datetime as dt
# Closing all figures
plt.close("all")
###############################################################################
###############################################################################
###############################################################################
# String formatting
# Function to preprocess groundwater data from http://tgms.dgr.go.th/#/home
# Assuming excel format
def GW_Data_Process(GW_Data, well_name=None):
# # Inputs:
# GW data assuming as dataframe and first five lines irrevelevant with - to be
# replaced with nan
# Thai years need to be converted to english years
# Well name - if None, returning all wells with same dates
# # Outputs:
# Data frame with dates and groundwater data (HEAD REL to 0 NOT DEPTH TO WATER)
# Subset of data for only the well from well_name - If well name not None
# Ignoring first five lines
# Replacing no date (-) with nans
data = GW_Data.iloc[0:len(GW_Data)-5, :]
data = data.replace("-", np.nan)
# Creating data frame
well_data = data.iloc[:, 1:]
df_data = well_data.rename(columns={"วันที่": "Date"})
df_data.Date = df_data["Date"].astype(str)
# Reformating date from thai years to english years
len_head = len(df_data.iloc[:, 1])
date_list = []
df_data["Year"] = np.nan
df_data["Month"] = np.nan
df_data["Day"] = np.nan
# For each data point
for i in range(len_head):
# Current date
date = df_data.Date.loc[i]
# If leap day
if date[0:2] == "29" and date[3:5] == "02":
# Thai years - 543 = english years
df_data.Year.loc[i] = int(date[6:10]) - 543
df_data.Month.loc[i] = int(date[3:5])
df_data.Day.loc[i] = int(date[0:2])
# Converting to date time format
date_list.append(dt.datetime.strptime(str(df_data.Day.loc[i]).replace(
".0", "") + "/" +
str(df_data.Month.loc[i]).replace(".0", "") +
"/" + str(df_data.Year.loc[i]).replace(".0", ""),
"%d/%m/%Y").date())
# If not a leap date
else:
# Converting to date time format
date_list.append(dt.datetime.strptime(date, "%d/%m/%Y %M:%S").date())
df_data.Year.loc[i] = (date_list[i].year) - 543
df_data.Month.loc[i] = (date_list[i].month)
df_data.Day.loc[i] = (date_list[i].day)
# Saving new english date
df_data["EngDate"] = pd.to_datetime(df_data[["Year", "Month", "Day"]])
# If individual well name given
if well_name is not None:
# Subset of dataframe to get ready for Pastas
# DTW data converted to approximate head using negative sign
head_subsetdata = {"Date": df_data.EngDate,
"Head": - df_data[well_name].astype("float")}
Head_Data = pd.DataFrame(head_subsetdata, columns=["Date", "Head"])
Head_Data.index = pd.to_datetime(Head_Data.Date)
# If individual head date not given
else:
Head_Data = None
# Cleaning all data up
# Returning head data not depth to water!
if len(well_data.columns.values[1:]) > 1:
all_data = pd.concat([df_data["EngDate"],
-well_data.iloc[:, 1:].astype("float")],
axis=1,
keys=["Date"] + well_data.columns.values[1:])
else:
all_data = pd.concat([df_data["EngDate"],
-well_data.iloc[:, 1:].astype("float")],
axis=1)
if (np.size(-well_data.iloc[:, 1:], axis=1)) > 1:
all_data = all_data.droplevel(level=0, axis=1)
return all_data, Head_Data
###############################################################################
###############################################################################
###############################################################################
# Checks for outliers
def is_outlier(points, thresh=3.5):
"""
Returns a boolean array with True if points are outliers and False
otherwise.
Parameters:
-----------
points : An numobservations by numdimensions array of observations
thresh : The modified z-score to use as a threshold. Observations with
a modified z-score (based on the median absolute deviation) greater
than this value will be classified as outliers.
Returns:
--------
mask : A numobservations-length boolean array.
References:
----------
Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
Handle Outliers", The ASQC Basic References in Quality Control:
Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
"""
if np.size(points) == 1:
points = points[:, None]
median = np.median(points)
diff = (points - median)**2
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thresh