#!/bin/bash

# Databend Meta Server Diagnostic Script
# Purpose: Identify root causes for node failures, OOM errors, and query log issues
# Author: Generated for databend-meta troubleshooting

# Strict mode: stop on unhandled command failures, on use of unset variables,
# and when any stage of a pipeline fails.
set -euo pipefail

# Colors for output. The escapes are stored unexpanded; they only turn into
# real ANSI sequences when printed with `echo -e` or printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Global variables
# Report file name (relative, so it lands in the current working directory),
# stamped with the start time of this run.
REPORT_FILE="databend-meta-diagnostic-$(date +%Y%m%d-%H%M%S).txt"
# Scratch directory for this run; $$ is the PID of this shell.
TEMP_DIR="/tmp/databend-meta-diag-$$"
# Whitespace-separated PIDs of databend-meta processes; assigned by
# check_databend_processes and read by the later checks.
DATABEND_PROCESSES=""

# Helper functions

# Announce a diagnostic step on the terminal and record it in the report.
# Globals:   BLUE, NC (read), REPORT_FILE (read; the file is appended to)
# Arguments: $1 - step description
# Outputs:   colored "[HH:MM:SS] STEP: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] STEP: ..." line appended to the report
log_step() {
  local message=${1:-}
  # Only the color variables go through %b; the message is passed as %s so
  # backslashes in it (e.g. Windows paths) are printed literally.
  printf '%b[%s] STEP: %s%b\n' "$BLUE" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] STEP: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >>"$REPORT_FILE"
}

# Report the outcome of a diagnostic step on the terminal and in the report.
# Globals:   GREEN, NC (read), REPORT_FILE (read; the file is appended to)
# Arguments: $1 - result description
# Outputs:   colored "[HH:MM:SS] RESULT: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] RESULT: ..." line appended to the report
log_result() {
  local message=${1:-}
  # The message is passed as %s so backslashes in it stay literal; only the
  # color variables are interpreted (%b).
  printf '%b[%s] RESULT: %s%b\n' "$GREEN" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] RESULT: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >>"$REPORT_FILE"
}

# Emit a warning on the terminal and record it in the report.
# Globals:   YELLOW, NC (read), REPORT_FILE (read; the file is appended to)
# Arguments: $1 - warning text
# Outputs:   colored "[HH:MM:SS] WARNING: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] WARNING: ..." line appended to the report
log_warning() {
  local message=${1:-}
  # The message is passed as %s so backslashes in it stay literal; only the
  # color variables are interpreted (%b).
  printf '%b[%s] WARNING: %s%b\n' "$YELLOW" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] WARNING: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >>"$REPORT_FILE"
}

# Emit an error message on the terminal and record it in the report.
# Globals:   RED, NC (read), REPORT_FILE (read; the file is appended to)
# Arguments: $1 - error text
# Outputs:   colored "[HH:MM:SS] ERROR: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] ERROR: ..." line appended to the report
log_error() {
  local message=${1:-}
  # The message is passed as %s so backslashes in it stay literal; only the
  # color variables are interpreted (%b).
  printf '%b[%s] ERROR: %s%b\n' "$RED" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] ERROR: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >>"$REPORT_FILE"
}

# Create the scratch directory and arrange for it to be removed on exit.
# Globals: TEMP_DIR (read)
create_temp_dir() {
  # Mode 700 keeps scratch data private to the invoking user.
  mkdir -p -m 700 -- "$TEMP_DIR"
  # Single quotes defer expansion until the trap fires, and the inner double
  # quotes keep a path containing spaces or glob characters as one argument.
  # ${TEMP_DIR:?} refuses to run rm with an empty path.
  trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT
}

# Initialize diagnostic report
# Starts a fresh report file with a banner (generation time and hostname),
# then logs the initialization step and its result.
# Globals: REPORT_FILE (read; the file is truncated and rewritten)
init_report() {
  # Write the banner first: `>` truncates the file, so anything logged to the
  # report before this point would be wiped out.
  cat >"$REPORT_FILE" <<EOF
================================================================================
                    DATABEND META SERVER DIAGNOSTIC REPORT
Generated: $(date '+%Y-%m-%d %H:%M:%S')
Hostname: $(hostname 2>/dev/null || uname -n)
================================================================================

EOF

  log_step "Initializing databend-meta diagnostic report"
  log_result "Report initialized: $REPORT_FILE"
}

# Check system basic information
# Appends hostname, uptime, kernel release, distribution name, architecture
# and CPU count to the report.
# Globals: REPORT_FILE (read; the file is appended to)
check_system_info() {
  log_step "Collecting system information"

  # Take PRETTY_NAME from os-release without failing when the file or the key
  # is missing; strip the key and the optional surrounding double quotes.
  local release_line distro
  release_line=$(grep -m1 '^PRETTY_NAME=' /etc/os-release 2>/dev/null) || release_line=""
  distro=${release_line#PRETTY_NAME=}
  distro=${distro#\"}
  distro=${distro%\"}

  {
    echo "=== SYSTEM INFORMATION ==="
    echo "Hostname: $(hostname 2>/dev/null || uname -n)"
    echo "Uptime: $(uptime 2>/dev/null || echo 'N/A')"
    echo "Kernel: $(uname -r)"
    echo "Distribution: ${distro:-Unknown}"
    echo "Architecture: $(uname -m)"
    echo "CPU cores: $(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 'N/A')"
    echo ""
  } >>"$REPORT_FILE"

  log_result "System information collected"
}

# Check memory and OOM killer activity
# Appends memory/swap status and OOM-killer traces from the kernel log and the
# journal to the report, then flags databend processes killed by the OOM killer.
# Globals: REPORT_FILE (read; the file is appended to)
check_memory_oom() {
  log_step "Checking memory status and OOM killer activity"

  # Read the kernel log once so every check below sees the same snapshot, and
  # remember whether it was readable: dmesg can fail for unprivileged users,
  # which must not be reported as "no OOM activity".
  local kernel_log="" kernel_log_ok=true
  kernel_log=$(dmesg 2>/dev/null) || kernel_log_ok=false

  local matches
  {
    echo "=== MEMORY STATUS ==="
    free -h 2>/dev/null || echo "Unable to run free"
    echo ""

    echo "=== SWAP USAGE ==="
    swapon --show 2>/dev/null || echo "No swap configured"
    echo ""

    echo "=== OOM KILLER ACTIVITY (last 100 entries) ==="
    if [[ "$kernel_log_ok" == true ]]; then
      # `|| true`: grep exits 1 on "no match", which pipefail would propagate.
      matches=$(grep -i -E 'killed process|out of memory|oom-killer|memory: usage' <<<"$kernel_log" \
        | tail -n 100) || true
      if [[ -n "$matches" ]]; then
        printf '%s\n' "$matches"
      else
        echo "No OOM killer activity found in dmesg"
      fi
    else
      echo "Unable to read kernel log via dmesg (insufficient privileges?)"
    fi
    echo ""

    echo "=== RECENT OOM KILLS IN SYSTEM LOG ==="
    matches=$(journalctl --no-pager --since "7 days ago" 2>/dev/null \
      | grep -i -E 'killed process|out of memory|oom' \
      | tail -n 50) || true
    if [[ -n "$matches" ]]; then
      printf '%s\n' "$matches"
    else
      echo "No recent OOM kills found in journal"
    fi
    echo ""
  } >>"$REPORT_FILE"

  # Check if databend processes were killed by OOM
  local oom_databend
  # grep -c always prints a number; `|| true` absorbs its exit 1 on a zero count.
  oom_databend=$(grep -i "killed process" <<<"$kernel_log" | grep -ci databend) || true
  oom_databend=${oom_databend:-0}

  if ((oom_databend > 0)); then
    log_warning "Found $oom_databend databend processes killed by OOM killer"
    {
      echo "=== DATABEND PROCESSES KILLED BY OOM ==="
      grep -i "killed process" <<<"$kernel_log" | grep -i databend || true
      echo ""
    } >>"$REPORT_FILE"
  elif [[ "$kernel_log_ok" == true ]]; then
    log_result "No databend processes found in OOM killer logs"
  else
    log_warning "Unable to read kernel log via dmesg; OOM killer check is incomplete"
  fi
}

# Check databend-meta processes
# Finds running processes whose command line matches "databend.*meta", stores
# their PIDs (one per line) in DATABEND_PROCESSES and appends a process listing
# plus per-process details read from /proc to the report.
# Globals: DATABEND_PROCESSES (written), REPORT_FILE (read; appended to)
check_databend_processes() {
  log_step "Analyzing databend-meta processes"

  # Find running databend processes
  local raw_pids pid live_pids="" count=0 fd_count
  local nl=$'\n'
  raw_pids=$(pgrep -f "databend.*meta" || true)
  while IFS= read -r pid; do
    [[ "$pid" =~ ^[0-9]+$ ]] || continue
    # `pgrep -f` matches whole command lines, so a diagnostic script whose own
    # name contains "databend...meta" would find itself.
    [[ "$pid" != "$$" ]] || continue
    # Drop PIDs that are already gone, e.g. the transient subshell that ran
    # pgrep above (it shares this script's command line). kill -0 fails with
    # EPERM for other users' processes, hence the /proc fallback.
    kill -0 "$pid" 2>/dev/null || [[ -d "/proc/$pid" ]] || continue
    live_pids+="${live_pids:+$nl}$pid"
    count=$((count + 1))
  done <<<"$raw_pids"
  DATABEND_PROCESSES=$live_pids

  {
    echo "=== DATABEND META PROCESSES ==="
    if [[ -n "$DATABEND_PROCESSES" ]]; then
      echo "Running databend-meta processes:"
      # List exactly the PIDs found above instead of re-grepping `ps aux`.
      ps -o user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,args \
        -p "${DATABEND_PROCESSES//$nl/,}" 2>/dev/null \
        || echo "Unable to list processes with ps"
      echo ""

      echo "=== PROCESS RESOURCE USAGE ==="
      # Word splitting of the PID list is intentional here.
      for pid in $DATABEND_PROCESSES; do
        [[ -d "/proc/$pid" ]] || continue
        echo "PID $pid:"
        echo "  Command: $({ tr '\0' ' ' <"/proc/$pid/cmdline"; } 2>/dev/null || echo 'N/A')"
        echo "  Memory (VmRSS): $(grep VmRSS "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        echo "  Memory (VmSize): $(grep VmSize "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        echo "  Threads: $(grep Threads "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        # An unreadable fd directory (other user's process) is reported as
        # N/A rather than as a misleading count of 0.
        fd_count='N/A'
        if [[ -r "/proc/$pid/fd" ]]; then
          fd_count=$(find "/proc/$pid/fd" -mindepth 1 -maxdepth 1 2>/dev/null | wc -l) \
            || fd_count='N/A'
        fi
        echo "  File descriptors: $fd_count"
        echo ""
      done
    else
      echo "No databend-meta processes currently running"
    fi
    echo ""
  } >>"$REPORT_FILE"

  if [[ -n "$DATABEND_PROCESSES" ]]; then
    log_result "Found $count databend-meta processes"
  else
    log_warning "No databend-meta processes currently running"
  fi
}

# Check system resource limits and usage
# Appends CPU, load, disk, inode, per-process memory and kernel limit
# information to the report, plus the limits of each databend-meta process.
# Every probe is individually guarded so that one missing tool or failing
# command does not abort the whole diagnostic under `set -e`/`pipefail`.
# Globals: DATABEND_PROCESSES (read), REPORT_FILE (read; appended to)
check_system_resources() {
  log_step "Checking system resource usage and limits"

  local pid
  {
    echo "=== CPU USAGE ==="
    # awk reads its whole input, unlike `head`, which can kill the producer
    # with SIGPIPE and so fail the pipeline under pipefail.
    top -bn1 2>/dev/null | awk 'NR <= 20' || echo "Unable to run top"
    echo ""

    echo "=== LOAD AVERAGE ==="
    cat /proc/loadavg 2>/dev/null || echo "Unable to read /proc/loadavg"
    echo ""

    echo "=== DISK USAGE ==="
    # df exits non-zero when any single mount cannot be queried (e.g. a stale
    # NFS handle); keep its partial output and its error text in the report.
    df -h 2>&1 || echo "WARNING: df reported an error; the listing above may be incomplete"
    echo ""

    echo "=== INODE USAGE ==="
    df -i 2>&1 || echo "WARNING: df reported an error; the listing above may be incomplete"
    echo ""

    echo "=== MEMORY USAGE BY PROCESS ==="
    ps aux --sort=-%mem 2>/dev/null | awk 'NR <= 20' || echo "Unable to run ps"
    echo ""

    echo "=== SYSTEM LIMITS ==="
    echo "Max open files (system): $(cat /proc/sys/fs/file-max 2>/dev/null || echo 'N/A')"
    echo "Current open files: $(cut -f1 /proc/sys/fs/file-nr 2>/dev/null || echo 'N/A')"
    echo "Max processes: $(cat /proc/sys/kernel/pid_max 2>/dev/null || echo 'N/A')"
    echo "Max memory map areas: $(cat /proc/sys/vm/max_map_count 2>/dev/null || echo 'N/A')"
    echo ""

    if [[ -n "$DATABEND_PROCESSES" ]]; then
      echo "=== DATABEND PROCESS LIMITS ==="
      # Word splitting of the PID list is intentional here.
      for pid in $DATABEND_PROCESSES; do
        [[ -d "/proc/$pid" ]] || continue
        echo "PID $pid limits:"
        grep -E "open files|processes|address space" "/proc/$pid/limits" 2>/dev/null \
          || echo "Unable to read limits"
        echo ""
      done
    fi
  } >>"$REPORT_FILE"

  log_result "System resource information collected"
}

# Check databend-meta logs
# Looks for databend/meta log files in a fixed list of candidate directories.
# For the first directory that contains any, it lists up to 10 files and
# appends error-like lines from the last 100 lines of each file to the report.
# When no directory has logs it falls back to the systemd journal of the
# databend-meta unit.
# Globals: REPORT_FILE (read; the file is appended to)
check_databend_logs() {
  log_step "Analyzing databend-meta logs"

  # Common log locations to check
  local -a log_paths=(
    "/var/log/databend"
    "/opt/databend/logs"
    "/usr/local/databend/logs"
  )
  # The per-user location must be globbed unquoted; a quoted "/home/*/..."
  # is a literal path that never exists. An unmatched pattern stays literal
  # and is filtered out by the -d test.
  local home_logs
  for home_logs in /home/*/databend/logs; do
    if [[ -d "$home_logs" ]]; then
      log_paths+=("$home_logs")
    fi
  done
  log_paths+=("$(pwd)/logs")
  local checked="/var/log/databend /opt/databend/logs /usr/local/databend/logs /home/*/databend/logs $(pwd)/logs"

  local -a log_files
  local log_path logfile recent_errors found_logs=false
  {
    echo "=== DATABEND META LOG ANALYSIS ==="

    for log_path in "${log_paths[@]}"; do
      [[ -d "$log_path" ]] || continue

      # Collect matching files once, NUL-delimited so unusual names survive.
      log_files=()
      while IFS= read -r -d '' logfile; do
        log_files+=("$logfile")
      done < <(find "$log_path" -type f \
        \( -name "*meta*log*" -o -name "*databend*log*" \) -print0 2>/dev/null)
      ((${#log_files[@]} > 0)) || continue

      echo "Found logs in: $log_path"
      printf '%s\n' "${log_files[@]:0:10}"
      echo ""
      found_logs=true

      # Analyze recent errors in logs
      echo "=== RECENT ERRORS IN LOGS (last 100 lines) ==="
      for logfile in "${log_files[@]}"; do
        echo "Analyzing: $logfile"
        # `|| true`: grep exits 1 on "no match", which pipefail would propagate.
        recent_errors=$(tail -n 100 -- "$logfile" 2>/dev/null \
          | grep -i -E "error|panic|fatal|oom|memory|fail" \
          | tail -n 20) || true
        if [[ -n "$recent_errors" ]]; then
          printf '%s\n' "$recent_errors"
        else
          echo "No recent errors found"
        fi
        echo ""
      done
      # Only the first location that has logs is analyzed.
      break
    done

    if [[ "$found_logs" == false ]]; then
      echo "No databend-meta log files found in standard locations"
      echo "Checked locations: $checked"
      echo ""

      # Try to find logs using systemd if service is running
      echo "=== CHECKING SYSTEMD LOGS ==="
      journalctl -u databend-meta --since "24 hours ago" --no-pager 2>/dev/null \
        | tail -n 100 \
        || echo "No systemd logs found for databend-meta service"
      echo ""
    fi
  } >>"$REPORT_FILE"

  log_result "Log analysis completed"
}

# Check network and connectivity
# Appends listening sockets on the ports the script associates with databend
# (9191, 8080, 3307, 8000), the network connections of each databend-meta
# process, and the status of the first firewall tool found to the report.
# Globals: DATABEND_PROCESSES (read), REPORT_FILE (read; appended to)
check_network() {
  log_step "Checking network configuration and connectivity"

  local pid
  {
    echo "=== NETWORK CONFIGURATION ==="
    ss -tlnp 2>/dev/null | grep -E ":9191|:8080|:3307|:8000" \
      || echo "No databend-related ports found listening"
    echo ""

    echo "=== NETWORK CONNECTIONS ==="
    if [[ -n "$DATABEND_PROCESSES" ]]; then
      # Word splitting of the PID list is intentional here.
      for pid in $DATABEND_PROCESSES; do
        echo "Connections for PID $pid:"
        # -a ANDs the selectors; without it lsof ORs them and lists every
        # internet socket on the host in addition to this PID's files.
        # awk (not head) so the producer is never cut off with SIGPIPE.
        lsof -a -p "$pid" -i 2>/dev/null | awk 'NR <= 20' \
          || echo "Unable to check connections for PID $pid"
        echo ""
      done
    fi

    echo "=== FIREWALL STATUS ==="
    if command -v ufw >/dev/null; then
      ufw status 2>/dev/null || echo "UFW not active"
    elif command -v firewall-cmd >/dev/null; then
      firewall-cmd --state 2>/dev/null || echo "Firewalld not active"
    elif command -v iptables >/dev/null; then
      iptables -L -n 2>/dev/null | awk 'NR <= 20' || echo "Unable to check iptables"
    else
      echo "No common firewall tools found"
    fi
    echo ""
  } >>"$REPORT_FILE"

  log_result "Network analysis completed"
}

# Generate final diagnostic report
# Appends a summary to the report: critical findings (OOM kills of databend
# processes, available memory, running processes, root filesystem usage),
# followed by fixed recommendations and next steps.
# Globals: DATABEND_PROCESSES (read), REPORT_FILE (read; appended to)
generate_final_report() {
  log_step "Generating final diagnostic summary"

  local oom_count mem_available disk_usage report_path
  local -a pid_list=()

  # grep -c always prints a number; `|| true` absorbs a failing dmesg and
  # grep's exit 1 on a zero count (both would otherwise trip pipefail).
  oom_count=$(dmesg 2>/dev/null | grep -i "killed process" | grep -ci databend) || true
  oom_count=${oom_count:-0}

  # Percentage of memory available. The "available" column is located by its
  # header name because its position differs between procps versions (older
  # releases have no such column at all); LC_ALL=C keeps the header untranslated.
  mem_available=$(LC_ALL=C free -m 2>/dev/null | awk '
    NR == 1 { for (i = 1; i <= NF; i++) if ($i == "available") col = i + 1 }
    $1 == "Mem:" && col && $2 > 0 { printf "%.0f", $col / $2 * 100 }') || true

  # -P forces one line per filesystem even for long device names.
  disk_usage=$(df -P / 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || true

  # Split the PID list on whitespace; read returns non-zero at end of input.
  read -r -d '' -a pid_list <<<"$DATABEND_PROCESSES" || true

  case "$REPORT_FILE" in
    /*) report_path=$REPORT_FILE ;;
    *) report_path="$(pwd)/$REPORT_FILE" ;;
  esac

  {
    echo ""
    echo "================================================================================"
    echo "                              DIAGNOSTIC SUMMARY"
    echo "================================================================================"
    echo ""

    echo "=== CRITICAL FINDINGS ==="

    # Check for OOM issues
    if ((oom_count > 0)); then
      echo "❌ CRITICAL: $oom_count databend processes killed by OOM killer"
    fi

    # Check memory pressure (skipped when the percentage could not be determined)
    if [[ "$mem_available" =~ ^[0-9]+$ ]]; then
      if ((mem_available < 10)); then
        echo "❌ CRITICAL: Very low available memory (${mem_available}%)"
      elif ((mem_available < 20)); then
        echo "⚠️ WARNING: Low available memory (${mem_available}%)"
      fi
    fi

    # Check if processes are running
    if ((${#pid_list[@]} == 0)); then
      echo "❌ CRITICAL: No databend-meta processes currently running"
    else
      echo "✅ INFO: ${#pid_list[@]} databend-meta processes running"
    fi

    # Check disk space (skipped when the usage could not be determined)
    if [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
      if ((disk_usage > 90)); then
        echo "❌ CRITICAL: Root filesystem ${disk_usage}% full"
      elif ((disk_usage > 80)); then
        echo "⚠️ WARNING: Root filesystem ${disk_usage}% full"
      fi
    fi

    echo ""
    echo "=== RECOMMENDATIONS ==="
    echo "1. Check system logs for OOM killer activity"
    echo "2. Monitor memory usage during peak loads"
    echo "3. Consider increasing system memory or optimizing databend-meta configuration"
    echo "4. Verify databend-meta service configuration and startup scripts"
    echo "5. Check application logs for query execution patterns"
    echo ""

    echo "=== NEXT STEPS ==="
    echo "1. Review the detailed findings above"
    echo "2. Share this report with databend support team"
    echo "3. Consider implementing monitoring for memory usage"
    echo "4. Set up log rotation if not already configured"
    echo ""

    echo "Report generated: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "Report location: $report_path"

  } >>"$REPORT_FILE"

  log_result "Final diagnostic report generated"
}

# Main execution
# Runs every diagnostic stage in order and then tells the user where the
# report was written.
# Globals: BLUE, GREEN, YELLOW, NC, REPORT_FILE (read)
main() {
  printf '%bDatabend Meta Server Diagnostic Tool%b\n' "$BLUE" "$NC"
  echo "======================================"

  create_temp_dir
  init_report

  check_system_info
  check_memory_oom
  check_databend_processes
  check_system_resources
  check_databend_logs
  check_network
  generate_final_report

  # The report name is passed as %s so backslashes in it are printed as-is;
  # only the color variables are interpreted (%b).
  echo ""
  printf '%bDiagnostic completed successfully!%b\n' "$GREEN" "$NC"
  printf 'Report saved to: %b%s%b\n' "$YELLOW" "$REPORT_FILE" "$NC"
  echo ""
  printf '%bTo view the report:%b\n' "$BLUE" "$NC"
  printf '  cat %s\n' "$REPORT_FILE"
  echo ""
  printf '%bTo share with support:%b\n' "$BLUE" "$NC"
  printf '  Send the file: %s\n' "$REPORT_FILE"
}

# Run main function, forwarding the script's arguments unchanged.
main "$@"