-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsIndex.sh
executable file
·179 lines (162 loc) · 4 KB
/
sIndex.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env bash
# sIndex.sh by Amory Meltzer
# Licensed under the WTFPL http://www.wtfpl.net/
# Run everything
#######################################
# Print the usage summary for this script.
# Globals:   none
# Arguments: none are read (callers pass "$0", but the script name is
#            taken from $0 directly)
# Outputs:   usage text on stdout
#######################################
get_help() {
  # Unquoted delimiter on purpose: the script name must be expanded.
  # The heredoc body and terminator stay at column 0 because `<<` is used.
  cat <<END_HELP
Usage: ${0##*/} [-dpgr] <opt1> [opt2] ...
opt Calculating option(s) (month, rollN, year, or fixedN). Required with -p and -g/-r.
-d Download data
-p Process data
-g, -r Graph data
-h This help
END_HELP
}
#######################################
# Simple error handling: report a message and abort the script.
# Arguments: $1 - message to report
# Outputs:   the message on stderr, so it never mixes with regular output
# Exits:     always, with status 1
#######################################
dienice() {
  # printf rather than echo so a message such as "-n" is printed verbatim
  printf '%s\n' "$1" >&2
  exit 1
}
#######################################
# Fetch every date reported as missing by getDates.pl, clean the raw HTML,
# convert it to CSV and record a checksum for both files.
# Globals:   rawD, csvD (read) - output directories for raw and CSV files
# Arguments: none
# Files:     reads and rewrites ./latest; appends to md5raw.txt and
#            md5csv.txt in the current directory
# Outputs:   progress messages on stdout
# Exits:     through dienice when a download fails, when the timestamp in a
#            downloaded page does not match the requested month, or when a
#            checksum line appears more than once
#######################################
download_data() {
  # Should use API after https://phabricator.wikimedia.org/T205652
  local url_base="https://xtools.wmflabs.org/adminstats/enWiki/"
  # Keep track of latest data grab: this file holds the last month completed
  local latest_file="latest"
  local latest dates date mon raw url timestamp csv raw_dups csv_dups dup

  if [[ -e "$latest_file" ]]; then
    latest=$(<"$latest_file")
  else
    latest='initialize'
  fi

  # All missing data
  # Continue even if there are none, nothing will happen unless we messed up
  dates=$(perl getDates.pl "$latest")
  if [[ -z "$dates" ]]; then
    echo "No more dates to download!"
    return 0
  fi

  # Bulk download monthly data from https://xtools.wmflabs.org/adminstats
  # $dates is deliberately unquoted: the list is split on whitespace.
  for date in $dates; do
    mon=${date:0:7}
    raw="$rawD/$mon.html"
    echo "Downloading $date..."
    url="$url_base$date"
    echo "$url"
    # -f turns HTTP errors into a failure instead of saving the error page;
    # stop right away so nothing bogus is cleaned, hashed or recorded.
    if ! curl -f -d '' "$url" -o "$raw.tmp"; then
      rm -f -- "$raw.tmp"
      dienice "Download of $url failed"
    fi
    # Remove variable/easter egg content
    perl cleanRaw.pl "$raw.tmp" > "$raw"
    rm -- "$raw.tmp"
    md5 -r "$raw" >> "md5raw.txt"

    # Verify date as expected, not likely to be a problem
    timestamp=$(grep -A 2 "Ending date" "$raw" | tail -n 1 | xargs)
    timestamp=${timestamp:0:7}
    if [[ "$timestamp" != "$mon" ]]; then
      dienice "Timestamp for $date seems erroneous"
    fi

    csv="$csvD/$mon.csv"
    perl table2csv.pl "$raw" > "$csv"
    md5 -r "$csv" >> "md5csv.txt"
    # Only now is the month complete; remember it for the next run
    printf '%s' "$mon" > "$latest_file"
  done

  # Check data for duplication events
  # NOTE(review): uniq -d compares whole lines. If `md5 -r` prints the file
  # name after the digest, identical content stored under two different
  # names would not be reported here - confirm that this is intended.
  raw_dups=$(sort "md5raw.txt" | uniq -d)
  csv_dups=$(sort "md5csv.txt" | uniq -d)
  echo
  if [[ -n $raw_dups ]]; then
    echo "Duplicate raw data files found"
    # Unquoted on purpose: one word per output line, as before
    for dup in $raw_dups; do
      echo "$dup"
    done
    dienice "You should investigate manually"
  fi
  if [[ -n $csv_dups ]]; then
    echo "Duplicate csv data files found"
    for dup in $csv_dups; do
      echo "$dup"
    done
    dienice "You should investigate manually"
  fi
}
#######################################
# Announce the target file, then hand the selected option to calcH.pl.
# Globals:   behav, sinFile, csvD (read)
# Arguments: none
# Outputs:   one "Building ..." line on stdout, plus whatever calcH.pl prints
# Returns:   the exit status of the perl invocation
#######################################
process_data() {
  printf 'Building %s\n' "${sinFile}"
  # The CSV directory is passed with a trailing slash
  perl calcH.pl "${behav}" "${sinFile}" "${csvD}/"
}
#######################################
# Run sindex.r on the current data file and tidy up afterwards.
# Globals:   sinFile, rPass (read)
# Arguments: none
# Outputs:   one "Graphing ..." line on stdout, plus whatever Rscript prints
# Returns:   the exit status of Rscript (the cleanup never overrides it)
#######################################
graph_data() {
  local status=0
  echo "Graphing $rPass data from $sinFile"
  Rscript sindex.r "$sinFile" "$rPass" || status=$?
  # Rscript can leave a stray Rplots.pdf in the working directory
  # (presumably from R's default graphics device - confirm). -f keeps the
  # cleanup silent and successful when no such file was produced.
  rm -f -- Rplots.pdf
  return "$status"
}
# Parse the command-line flags. Every letter is accepted in either case;
# -g and -r are synonyms. Each action flag only sets a marker variable that
# is checked further down.
while getopts ':dDpPgGrRhH' opt; do
  case "$opt" in
    [dD]) download=1 ;;
    [pP]) process=1 ;;
    [gGrR]) graph=1 ;;
    [hH])
      get_help "$0"
      exit 0
      ;;
    *)
      # Anything else is an unknown flag: complain on stderr and fail
      printf 'Invalid option provided, try %s -h\n' "$0" >&2
      exit 1
      ;;
  esac
done
# Directories: raw downloads, converted CSV files and processed output.
# They are assigned before any function that reads them is called.
rawD='rawData'
csvD='csvData'
sinD='procData'

# Downloading was requested (-d): fetch before any processing or graphing
[[ -z $download ]] || download_data
#######################################
# Translate one calculating option into the output file and graph label.
# Globals:   sinD (read); sinFile, rPass (written on success)
# Arguments: $1 - month, year, rollN or fixedN, with N between 1 and 24
# Outputs:   the rejected N on stdout when N is out of range
# Returns:   0 when the option is recognised, 1 otherwise
#######################################
resolve_behav() {
  local kind span
  local pattern='^(roll|fixed)([0-9]+)$'

  # Aliases first: a one-month window is the monthly file and a fixed
  # twelve-month window is the annual file (roll12 stays a rolling window).
  case "$1" in
    month | roll1 | fixed1)
      sinFile="$sinD/sindex-monthly.csv"
      rPass='monthly'
      return 0
      ;;
    year | fixed12)
      sinFile="$sinD/sindex-annual.csv"
      rPass='annual'
      return 0
      ;;
  esac

  [[ $1 =~ $pattern ]] || return 1
  kind=${BASH_REMATCH[1]}
  span=${BASH_REMATCH[2]}
  # 10# forces base ten so that "08" or "09" is not an invalid octal number
  if ((10#$span < 1 || 10#$span > 24)); then
    echo "$span"
    return 1
  fi

  sinFile="$sinD/sindex-$1.csv"
  if [[ $kind == roll ]]; then
    rPass="rolling (${span}mos)"
  else
    rPass="fixed (${span}mos)"
  fi
  return 0
}

# Process and/or graph every calculating option given after the flags.
# Every usage error prints the help and exits with status 1.
if [[ -n $process || -n $graph ]]; then
  shift $((OPTIND - 1))
  # Quick option to rebuild everything
  if [[ $1 == all ]]; then
    opts=(month roll3 roll6 roll12 year)
  else
    # Split on whitespace as before, but into an array so that no argument
    # is glob-expanded against the current directory.
    read -r -a opts <<<"$*"
  fi

  # The help text documents the option as required with -p and -g/-r
  if ((${#opts[@]} == 0)); then
    get_help "$0"
    exit 1
  fi

  for behav in "${opts[@]}"; do
    if ! resolve_behav "$behav"; then
      get_help "$0"
      exit 1
    fi
    if [[ -n $process ]]; then
      process_data
    fi
    if [[ -n $graph ]]; then
      # Graphing needs a data file that has already been built
      if [[ ! -e "$sinFile" ]]; then
        echo "$sinFile doesn't exist"
        get_help "$0"
        exit 1
      fi
      graph_data
    fi
  done
fi