Skip to content

Commit 2d1617a

Browse files
committed
chore(meta-service): add diagnostics script
1 parent 4130573 commit 2d1617a

File tree

1 file changed

+385
-0
lines changed

1 file changed

+385
-0
lines changed
Lines changed: 385 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,385 @@
1+
#!/bin/bash
2+
3+
# Databend Meta Server Diagnostic Script
4+
# Purpose: Identify root causes for node failures, OOM errors, and query log issues
5+
# Author: Generated for databend-meta troubleshooting
6+
7+
set -euo pipefail
8+
9+
# ANSI color sequences for console messages. They are stored as literal
# backslash text and only become real escapes when printed by something that
# interprets backslash sequences (the script prints them with `echo -e`).
RED='\033[0;31m' GREEN='\033[0;32m'
YELLOW='\033[1;33m' BLUE='\033[0;34m'
NC='\033[0m' # reset ("no color")

# Script-wide state.
# REPORT_FILE: timestamped report name, relative to the current directory.
printf -v REPORT_FILE 'databend-meta-diagnostic-%s.txt' "$(date '+%Y%m%d-%H%M%S')"
# TEMP_DIR: scratch directory path; the shell PID makes it unique per run.
TEMP_DIR="/tmp/databend-meta-diag-${$}"
# DATABEND_PROCESSES: starts empty; check_databend_processes stores the PIDs
# reported by pgrep here and later checks read it.
DATABEND_PROCESSES=''
20+
21+
# Helper functions
22+
#######################################
# Shared implementation of the log_* helpers: prints one colored line to
# stdout and appends the same message, uncolored, to the report file.
# Globals:   NC (read), REPORT_FILE (read; file is appended to)
# Arguments: $1 - color escape sequence (literal "\033[..." text)
#            $2 - severity label, e.g. STEP
#            $3 - message text (optional, defaults to empty)
# Outputs:   "<color>[HH:MM:SS] LABEL: message<reset>" on stdout
#######################################
_log_message() {
  local color=$1
  local label=$2
  local message=${3-}

  # Only the color variables go through %b (escape expansion). The message is
  # printed with %s so backslashes in it (paths, regexes, ...) appear exactly
  # as they do in the report file.
  printf '%b[%s] %s: %s%b\n' \
    "$color" "$(date '+%H:%M:%S')" "$label" "$message" "$NC"
  printf '[%s] %s: %s\n' \
    "$(date '+%Y-%m-%d %H:%M:%S')" "$label" "$message" >> "$REPORT_FILE"
}

# Announce the start of a diagnostic step (blue on the console).
# Arguments: $1 - message
log_step() {
  _log_message "$BLUE" "STEP" "${1-}"
}

# Record the outcome of a diagnostic step (green on the console).
# Arguments: $1 - message
log_result() {
  _log_message "$GREEN" "RESULT" "${1-}"
}

# Record a non-fatal finding (yellow on the console).
# Arguments: $1 - message
log_warning() {
  _log_message "$YELLOW" "WARNING" "${1-}"
}

# Record an error (red on the console).
# Arguments: $1 - message
log_error() {
  _log_message "$RED" "ERROR" "${1-}"
}
41+
42+
#######################################
# Create a private scratch directory and arrange for it to be removed when
# the shell exits.
# Globals:   TEMP_DIR (written: replaced by the path mktemp created)
#            TMPDIR (read, optional: parent directory, defaults to /tmp)
# Returns:   non-zero if the directory cannot be created
#######################################
create_temp_dir() {
  # mktemp picks an unpredictable name and fails rather than reusing an
  # existing path, unlike `mkdir -p` on a PID-based name in a shared /tmp.
  TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/databend-meta-diag.XXXXXX")

  # Single quotes defer expansion to exit time, where the path is quoted, so
  # whitespace in it cannot split the rm arguments. ${VAR:?} refuses to run
  # rm with an empty path.
  trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT
}
46+
47+
# Initialize diagnostic report
48+
#######################################
# Start a fresh report file with a banner, then log that it was created.
# Globals:   REPORT_FILE (read; file is truncated and rewritten)
# Outputs:   STEP and RESULT lines via log_step / log_result
#######################################
init_report() {
  # Write the banner first: this redirect truncates the file, so any log
  # entry appended before it would be erased.
  cat > "$REPORT_FILE" << EOF
================================================================================
DATABEND META SERVER DIAGNOSTIC REPORT
Generated: $(date '+%Y-%m-%d %H:%M:%S')
Hostname: $(hostname)
================================================================================

EOF

  log_step "Initializing databend-meta diagnostic report"
  log_result "Report initialized: $REPORT_FILE"
}
62+
63+
# Check system basic information
64+
#######################################
# Append basic host facts (hostname, uptime, kernel, distribution,
# architecture, CPU count) to the report.
# Globals:   REPORT_FILE (read; file is appended to)
# Outputs:   STEP and RESULT lines via log_step / log_result
#######################################
check_system_info() {
  log_step "Collecting system information"

  # Read PRETTY_NAME straight from os-release. Errors from a missing file are
  # silenced on the command that reads it, and surrounding quotes are
  # stripped whether or not the value is quoted.
  local distribution=''
  distribution=$(awk '
    /^PRETTY_NAME=/ {
      value = $0
      sub(/^PRETTY_NAME=/, "", value)
      gsub(/^"|"$/, "", value)
      print value
      exit
    }' /etc/os-release 2>/dev/null) || distribution=''
  [[ -n "$distribution" ]] || distribution='Unknown'

  {
    echo "=== SYSTEM INFORMATION ==="
    echo "Hostname: $(hostname)"
    echo "Uptime: $(uptime)"
    echo "Kernel: $(uname -r)"
    echo "Distribution: $distribution"
    echo "Architecture: $(uname -m)"
    echo "CPU cores: $(nproc)"
    echo ""
  } >> "$REPORT_FILE"

  log_result "System information collected"
}
80+
81+
# Check memory and OOM killer activity
82+
#######################################
# Append memory, swap and OOM-killer information to the report and flag
# databend processes that appear in the kernel log's "killed process" lines.
# Globals:   REPORT_FILE (read; file is appended to)
# Outputs:   STEP plus WARNING or RESULT lines via the log_* helpers
#######################################
check_memory_oom() {
  log_step "Checking memory status and OOM killer activity"

  # Read the kernel log once so the listing, the count and the databend
  # excerpt all describe the same snapshot. A failing dmesg is tracked
  # separately so it is not reported as "no OOM activity".
  local kernel_log=''
  local kernel_log_readable=true
  kernel_log=$(dmesg 2>/dev/null) || kernel_log_readable=false

  local oom_events=''
  local databend_kills=''
  local oom_databend=0
  if [[ "$kernel_log_readable" == true ]]; then
    # grep exits 1 on "no match"; that is an expected outcome, not an error.
    oom_events=$(grep -i "killed process\|out of memory\|oom-killer\|memory: usage" \
      <<< "$kernel_log" | tail -n 100) || true
    databend_kills=$(grep -i "killed process" <<< "$kernel_log" \
      | grep -i databend) || true
  fi
  if [[ -n "$databend_kills" ]]; then
    oom_databend=$(grep -c '' <<< "$databend_kills")
  fi

  local swap_info=''
  swap_info=$(swapon --show 2>/dev/null) || true

  local journal_hits=''
  journal_hits=$(journalctl --since "7 days ago" --no-pager 2>/dev/null \
    | grep -i "killed process\|out of memory\|oom" | tail -n 50) || true

  {
    echo "=== MEMORY STATUS ==="
    # Guarded so a missing tool does not abort the whole run under `set -e`.
    free -h || echo "Unable to run free"
    echo ""

    echo "=== SWAP USAGE ==="
    # swapon --show prints nothing (and still succeeds) when there is no
    # swap, so test the captured output rather than the exit status.
    if [[ -n "$swap_info" ]]; then
      printf '%s\n' "$swap_info"
    else
      echo "No swap configured"
    fi
    echo ""

    echo "=== OOM KILLER ACTIVITY (last 100 entries) ==="
    if [[ "$kernel_log_readable" != true ]]; then
      echo "Unable to read kernel log via dmesg (may require root privileges)"
    elif [[ -n "$oom_events" ]]; then
      printf '%s\n' "$oom_events"
    else
      echo "No OOM killer activity found in dmesg"
    fi
    echo ""

    echo "=== RECENT OOM KILLS IN SYSTEM LOG ==="
    if [[ -n "$journal_hits" ]]; then
      printf '%s\n' "$journal_hits"
    else
      echo "No recent OOM kills found in journal"
    fi
    echo ""
  } >> "$REPORT_FILE"

  # Check if databend processes were killed by OOM
  if (( oom_databend > 0 )); then
    log_warning "Found $oom_databend databend processes killed by OOM killer"
    {
      echo "=== DATABEND PROCESSES KILLED BY OOM ==="
      printf '%s\n' "$databend_kills"
      echo ""
    } >> "$REPORT_FILE"
  else
    log_result "No databend processes found in OOM killer logs"
  fi
}
116+
117+
# Check databend-meta processes
118+
#######################################
# Find running databend-meta processes and append their command line,
# memory, thread and file-descriptor figures to the report.
# Globals:   DATABEND_PROCESSES (written: PIDs reported by pgrep, possibly
#            empty), REPORT_FILE (read; file is appended to)
# Outputs:   STEP plus RESULT or WARNING lines via the log_* helpers
#######################################
check_databend_processes() {
  log_step "Analyzing databend-meta processes"

  # Find running databend processes
  # NOTE(review): -f matches against the full command line, so a process
  # whose arguments happen to contain "databend...meta" is included too.
  DATABEND_PROCESSES=$(pgrep -f "databend.*meta" || true)

  # The global is a whitespace-separated PID list; split it once on purpose.
  # shellcheck disable=SC2206
  local -a pids=(${DATABEND_PROCESSES})
  local pid pid_csv fd_listing fd_count

  {
    echo "=== DATABEND META PROCESSES ==="
    if (( ${#pids[@]} > 0 )); then
      echo "Running databend-meta processes:"
      # Ask ps for exactly the PIDs pgrep found, so this listing cannot
      # disagree with the per-PID section below.
      pid_csv=$(IFS=,; printf '%s' "${pids[*]}")
      ps -o user,pid,pcpu,pmem,vsz,rss,tty,stat,start,time,args -p "$pid_csv" 2>/dev/null \
        || echo "Unable to list processes (they may have exited)"
      echo ""

      echo "=== PROCESS RESOURCE USAGE ==="
      for pid in "${pids[@]}"; do
        [[ -d "/proc/$pid" ]] || continue

        # Count descriptors only when the directory is listable; otherwise
        # report N/A instead of a bogus "0".
        if fd_listing=$(ls "/proc/$pid/fd" 2>/dev/null); then
          fd_count=$(grep -c . <<< "$fd_listing") || true
        else
          fd_count='N/A'
        fi

        echo "PID $pid:"
        echo " Command: $(tr '\0' ' ' 2>/dev/null < "/proc/$pid/cmdline")"
        echo " Memory (VmRSS): $(grep VmRSS "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        echo " Memory (VmSize): $(grep VmSize "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        echo " Threads: $(grep Threads "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        echo " File descriptors: $fd_count"
        echo ""
      done
    else
      echo "No databend-meta processes currently running"
    fi
    echo ""
  } >> "$REPORT_FILE"

  if (( ${#pids[@]} > 0 )); then
    log_result "Found ${#pids[@]} databend-meta processes"
  else
    log_warning "No databend-meta processes currently running"
  fi
}
155+
156+
# Check system resource limits and usage
157+
#######################################
# Append CPU, load, disk, inode, per-process memory and kernel limit
# information to the report, plus the limits of each databend process.
# Globals:   DATABEND_PROCESSES (read: whitespace-separated PIDs),
#            REPORT_FILE (read; file is appended to)
# Outputs:   STEP and RESULT lines via log_step / log_result
#######################################
check_system_resources() {
  log_step "Checking system resource usage and limits"

  local pid

  {
    echo "=== CPU USAGE ==="
    # `sed -n 1,20p` reads its whole input, unlike `head`, so the producer is
    # never killed by SIGPIPE - which `pipefail` + `set -e` would turn into
    # an aborted run. The `||` guards keep a missing tool from aborting too.
    top -bn1 | sed -n '1,20p' || echo "Unable to run top"
    echo ""

    echo "=== LOAD AVERAGE ==="
    cat /proc/loadavg 2>/dev/null || echo "Unable to read /proc/loadavg"
    echo ""

    echo "=== DISK USAGE ==="
    # Exit status ignored on purpose so a df failure does not abort the run.
    df -h || true
    echo ""

    echo "=== INODE USAGE ==="
    df -i || true
    echo ""

    echo "=== MEMORY USAGE BY PROCESS ==="
    ps aux --sort=-%mem | sed -n '1,20p' || echo "Unable to list processes"
    echo ""

    echo "=== SYSTEM LIMITS ==="
    echo "Max open files (system): $(cat /proc/sys/fs/file-max)"
    echo "Current open files: $(cut -f1 /proc/sys/fs/file-nr)"
    echo "Max processes: $(cat /proc/sys/kernel/pid_max)"
    echo "Max memory map areas: $(cat /proc/sys/vm/max_map_count)"
    echo ""

    if [[ -n "$DATABEND_PROCESSES" ]]; then
      echo "=== DATABEND PROCESS LIMITS ==="
      # Deliberate word splitting: the global holds a PID list.
      for pid in $DATABEND_PROCESSES; do
        [[ -d "/proc/$pid" ]] || continue
        echo "PID $pid limits:"
        grep -E "open files|processes|address space" "/proc/$pid/limits" 2>/dev/null \
          || echo " Unable to read limits"
        echo ""
      done
    fi
  } >> "$REPORT_FILE"

  log_result "System resource information collected"
}
202+
203+
# Check databend-meta logs
204+
#######################################
# Look for databend/meta log files in a fixed list of directories, append the
# error-looking lines from the first directory that has any, and fall back to
# the systemd journal when none is found.
# Globals:   REPORT_FILE (read; file is appended to)
# Outputs:   STEP and RESULT lines via log_step / log_result
#######################################
check_databend_logs() {
  log_step "Analyzing databend-meta logs"

  # Common log locations to check. The /home entry is left unquoted on
  # purpose so the glob expands to every user's directory; if nothing
  # matches, the literal pattern stays and fails the -d test below.
  local -a log_paths=(
    "/var/log/databend"
    "/opt/databend/logs"
    "/usr/local/databend/logs"
    /home/*/databend/logs
    "$(pwd)/logs"
    "./logs"
  )
  local -a log_files=()
  local found_logs=false
  local log_path logfile matches journal

  {
    echo "=== DATABEND META LOG ANALYSIS ==="

    for log_path in "${log_paths[@]}"; do
      [[ -d "$log_path" ]] || continue

      # Run find once per directory, NUL-delimited, and reuse the result for
      # the existence test, the listing and the analysis.
      log_files=()
      while IFS= read -r -d '' logfile; do
        log_files+=("$logfile")
      done < <(find "$log_path" \( -name "*meta*log*" -o -name "*databend*log*" \) \
        -print0 2>/dev/null)
      (( ${#log_files[@]} > 0 )) || continue

      echo "Found logs in: $log_path"
      # List at most ten entries without a `find | head` pipe, which could
      # abort the run via SIGPIPE under `pipefail` + `set -e`.
      printf '%s\n' "${log_files[@]:0:10}"
      echo ""
      found_logs=true

      # Analyze recent errors in logs
      echo "=== RECENT ERRORS IN LOGS (last 100 lines) ==="
      for logfile in "${log_files[@]}"; do
        [[ -f "$logfile" ]] || continue
        echo "Analyzing: $logfile"
        # grep exits 1 on "no match"; that is expected, so decide on the
        # captured text instead of the pipeline status.
        matches=$(tail -n 100 -- "$logfile" \
          | grep -i -E "error|panic|fatal|oom|memory|fail" | tail -n 20) || true
        if [[ -n "$matches" ]]; then
          printf '%s\n' "$matches"
        else
          echo "No recent errors found"
        fi
        echo ""
      done
      # Only the first location that contains logs is analyzed.
      break
    done

    if [[ "$found_logs" == false ]]; then
      echo "No databend-meta log files found in standard locations"
      echo "Checked locations: ${log_paths[*]}"
      echo ""

      # Try to find logs using systemd if service is running
      echo "=== CHECKING SYSTEMD LOGS ==="
      if journal=$(journalctl -u databend-meta --since "24 hours ago" --no-pager 2>/dev/null) \
        && [[ -n "$journal" ]]; then
        tail -n 100 <<< "$journal"
      else
        echo "No systemd logs found for databend-meta service"
      fi
      echo ""
    fi
  } >> "$REPORT_FILE"

  log_result "Log analysis completed"
}
256+
257+
# Check network and connectivity
258+
#######################################
# Append listening ports, per-process network connections and firewall
# status to the report.
# Globals:   DATABEND_PROCESSES (read: whitespace-separated PIDs),
#            REPORT_FILE (read; file is appended to)
# Outputs:   STEP and RESULT lines via log_step / log_result
#######################################
check_network() {
  log_step "Checking network configuration and connectivity"

  local pid connections rules

  {
    echo "=== NETWORK CONFIGURATION ==="
    ss -tlnp 2>/dev/null | grep -E ":9191|:8080|:3307|:8000" \
      || echo "No databend-related ports found listening"
    echo ""

    echo "=== NETWORK CONNECTIONS ==="
    if [[ -n "$DATABEND_PROCESSES" ]]; then
      # Deliberate word splitting: the global holds a PID list.
      for pid in $DATABEND_PROCESSES; do
        echo "Connections for PID $pid:"
        # -a ANDs the selectors. Without it lsof ORs them and lists the
        # network files of every process plus all files of this PID.
        if connections=$(lsof -a -p "$pid" -i 2>/dev/null) \
          && [[ -n "$connections" ]]; then
          sed -n '1,20p' <<< "$connections"
        else
          echo "Unable to check connections for PID $pid"
        fi
        echo ""
      done
    fi

    echo "=== FIREWALL STATUS ==="
    if command -v ufw > /dev/null; then
      ufw status 2>/dev/null || echo "UFW not active"
    elif command -v firewall-cmd > /dev/null; then
      firewall-cmd --state 2>/dev/null || echo "Firewalld not active"
    elif command -v iptables > /dev/null; then
      # Capture first so a failing iptables (e.g. not root) is detected and
      # its stderr is silenced, rather than that of the truncating command.
      if rules=$(iptables -L -n 2>/dev/null); then
        sed -n '1,20p' <<< "$rules"
      else
        echo "Unable to check iptables"
      fi
    else
      echo "No common firewall tools found"
    fi
    echo ""
  } >> "$REPORT_FILE"

  log_result "Network analysis completed"
}
290+
291+
# Generate final diagnostic report
292+
#######################################
# Append the closing summary to the report: critical findings (OOM kills,
# memory pressure, missing processes, root filesystem usage), followed by
# fixed recommendations and next steps.
# Globals:   DATABEND_PROCESSES (read: whitespace-separated PIDs),
#            REPORT_FILE (read; file is appended to)
# Outputs:   STEP and RESULT lines via log_step / log_result
#######################################
generate_final_report() {
  log_step "Generating final diagnostic summary"

  local oom_count mem_available disk_usage

  # Count "killed process" kernel lines that mention databend. grep -c always
  # prints a number but exits 1 for zero, hence the `|| true`.
  oom_count=$(dmesg 2>/dev/null | grep -i "killed process" | grep -ci databend) || true
  [[ "$oom_count" =~ ^[0-9]+$ ]] || oom_count=0

  # Percentage of memory reported as available. The "available" column is
  # found by its header name, because older `free` versions have no such
  # column and the 7th field then means something else. Yields an empty
  # string when it cannot be determined.
  mem_available=$(LC_ALL=C free -m 2>/dev/null | awk '
    NR == 1 { for (i = 1; i <= NF; i++) if ($i == "available") col = i + 1 }
    NR == 2 && col && $2 > 0 { printf "%.0f", $col / $2 * 100 }') || true

  # -P forces the one-line-per-filesystem POSIX layout, so a long device
  # name cannot push the percentage onto a different line.
  disk_usage=$(df -P / 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || true

  # Deliberate word splitting: the global holds a PID list.
  # shellcheck disable=SC2206
  local -a pids=(${DATABEND_PROCESSES})

  {
    echo ""
    echo "================================================================================"
    echo "DIAGNOSTIC SUMMARY"
    echo "================================================================================"
    echo ""

    echo "=== CRITICAL FINDINGS ==="

    # Check for OOM issues
    if (( oom_count > 0 )); then
      echo "❌ CRITICAL: $oom_count databend processes killed by OOM killer"
    fi

    # Check memory pressure (skipped when the figure is unavailable)
    if [[ "$mem_available" =~ ^[0-9]+$ ]]; then
      if (( mem_available < 10 )); then
        echo "❌ CRITICAL: Very low available memory (${mem_available}%)"
      elif (( mem_available < 20 )); then
        echo "⚠️ WARNING: Low available memory (${mem_available}%)"
      fi
    fi

    # Check if processes are running
    if (( ${#pids[@]} == 0 )); then
      echo "❌ CRITICAL: No databend-meta processes currently running"
    else
      echo "✅ INFO: ${#pids[@]} databend-meta processes running"
    fi

    # Check disk space (skipped when the figure is unavailable)
    if [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
      if (( disk_usage > 90 )); then
        echo "❌ CRITICAL: Root filesystem ${disk_usage}% full"
      elif (( disk_usage > 80 )); then
        echo "⚠️ WARNING: Root filesystem ${disk_usage}% full"
      fi
    fi

    echo ""
    echo "=== RECOMMENDATIONS ==="
    echo "1. Check system logs for OOM killer activity"
    echo "2. Monitor memory usage during peak loads"
    echo "3. Consider increasing system memory or optimizing databend-meta configuration"
    echo "4. Verify databend-meta service configuration and startup scripts"
    echo "5. Check application logs for query execution patterns"
    echo ""

    echo "=== NEXT STEPS ==="
    echo "1. Review the detailed findings above"
    echo "2. Share this report with databend support team"
    echo "3. Consider implementing monitoring for memory usage"
    echo "4. Set up log rotation if not already configured"
    echo ""

    echo "Report generated: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "Report location: $(pwd)/$REPORT_FILE"

  } >> "$REPORT_FILE"

  log_result "Final diagnostic report generated"
}
356+
357+
# Main execution
358+
#######################################
# Entry point: prints a banner, runs every diagnostic stage in a fixed order,
# then tells the user where the report was written.
# Globals:   BLUE, GREEN, YELLOW, NC, REPORT_FILE (read)
# Outputs:   banner and closing instructions on stdout
#######################################
main() {
  printf '%b\n' "${BLUE}Databend Meta Server Diagnostic Tool${NC}"
  printf '%s\n' "======================================"

  # Stages run strictly in this order: the scratch directory and report file
  # are set up before any check runs, and the summary comes last.
  local stage
  for stage in \
    create_temp_dir \
    init_report \
    check_system_info \
    check_memory_oom \
    check_databend_processes \
    check_system_resources \
    check_databend_logs \
    check_network \
    generate_final_report; do
    "$stage"
  done

  printf '\n'
  printf '%b\n' "${GREEN}Diagnostic completed successfully!${NC}"
  printf '%b\n' "Report saved to: ${YELLOW}$REPORT_FILE${NC}"
  printf '\n'
  printf '%b\n' "${BLUE}To view the report:${NC}"
  printf '%b\n' " cat $REPORT_FILE"
  printf '\n'
  printf '%b\n' "${BLUE}To share with support:${NC}"
  printf '%b\n' " Send the file: $REPORT_FILE"
}
383+
384+
# Run main function
385+
main "$@"

0 commit comments

Comments
 (0)