Skip to content

Commit 8393636

Browse files
alibakhshoude-afkaliibii
authored and committed
feat: add plugin execution tracing to OpenTelemetry plugin
This commit adds plugin execution tracing capability to the OpenTelemetry plugin, allowing users to trace individual plugin phases (rewrite, access, header_filter, body_filter, log) as child spans of the main request trace.

Changes:
- Added trace_plugins configuration option (default: false, opt-in)
- Added plugin_span_kind configuration for observability provider compatibility
- Enhanced plugin execution with OpenTelemetry span creation and finishing
- Added comprehensive request context attributes to plugin spans
- Updated documentation with examples and usage instructions
- Added comprehensive test suite for the new functionality

Features:
- Plugin Phase Tracing: Creates child spans for each plugin phase execution
- Rich Context: Includes HTTP method, URI, hostname, user agent, route info, and service info
- Configurable: Can be enabled/disabled via trace_plugins configuration
- Span Kind Control: Supports internal (default) and server span kinds for observability provider compatibility
- Proper Hierarchy: Plugin spans are correctly nested under main request spans
- Performance: Minimal overhead when disabled (default behavior)

Configuration:
- trace_plugins: boolean (default: false) - Enable/disable plugin tracing
- plugin_span_kind: string (default: 'internal') - Span kind for plugin spans
  - 'internal': Standard internal operation (may be excluded from metrics)
  - 'server': Server-side operation (typically included in service-level metrics)

This addresses GitHub issue #12510 and provides end-to-end tracing visibility for APISIX plugin execution phases.
1 parent ebe8d6c commit 8393636

File tree

5 files changed

+654
-4
lines changed

5 files changed

+654
-4
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ A/B testing, canary release, blue-green deployment, limit rate, defense against
123123
- **OPS friendly**
124124

125125
- Zipkin tracing: [Zipkin](docs/en/latest/plugins/zipkin.md)
126+
- OpenTelemetry tracing: [OpenTelemetry](docs/en/latest/plugins/opentelemetry.md) with plugin execution tracing
126127
- Open source APM: support [Apache SkyWalking](docs/en/latest/plugins/skywalking.md)
127128
- Works with external service discovery: In addition to the built-in etcd, it also supports [Consul](docs/en/latest/discovery/consul.md), [Consul_kv](docs/en/latest/discovery/consul_kv.md), [Nacos](docs/en/latest/discovery/nacos.md), [Eureka](docs/en/latest/discovery/eureka.md) and [Zookeeper (CP)](https://github.com/api7/apisix-seed/blob/main/docs/en/latest/zookeeper.md).
128129
- Monitoring And Metrics: [Prometheus](docs/en/latest/plugins/prometheus.md)

apisix/plugin.lua

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ local tostring = tostring
3838
local error = error
3939
local getmetatable = getmetatable
4040
local setmetatable = setmetatable
41+
local string_format = string.format
4142
-- make linter happy to avoid error: getting the Lua global "load"
4243
-- luacheck: globals load, ignore lua_load
4344
local lua_load = load
@@ -1169,6 +1170,9 @@ function _M.run_plugin(phase, plugins, api_ctx)
11691170
return api_ctx
11701171
end
11711172

1173+
-- Get OpenTelemetry plugin for tracing
1174+
local otel_plugin = _M.get("opentelemetry")
1175+
11721176
if phase ~= "log"
11731177
and phase ~= "header_filter"
11741178
and phase ~= "body_filter"
@@ -1188,11 +1192,44 @@ function _M.run_plugin(phase, plugins, api_ctx)
11881192
goto CONTINUE
11891193
end
11901194

1195+
-- Start OpenTelemetry plugin span (with performance guard)
1196+
if otel_plugin and otel_plugin.start_plugin_span and api_ctx.otel_trace_plugins then
1197+
otel_plugin.start_plugin_span(api_ctx, plugins[i]["name"], phase)
1198+
end
1199+
11911200
run_meta_pre_function(conf, api_ctx, plugins[i]["name"])
11921201
plugin_run = true
11931202
api_ctx._plugin_name = plugins[i]["name"]
1194-
local code, body = phase_func(conf, api_ctx)
1203+
local success = true
1204+
local error_msg = nil
1205+
local code, body
1206+
1207+
-- Execute plugin with error handling
1208+
local ok, result = pcall(phase_func, conf, api_ctx)
1209+
if not ok then
1210+
-- Lua exception occurred
1211+
success = false
1212+
error_msg = string_format("plugin execution failed: %s", result)
1213+
code = 500
1214+
body = nil
1215+
else
1216+
-- Plugin executed successfully, check return values
1217+
code, body = result, nil
1218+
if code and code >= 400 then
1219+
success = false
1220+
error_msg = string_format("plugin exited with status code %d", code)
1221+
end
1222+
end
1223+
11951224
api_ctx._plugin_name = nil
1225+
1226+
-- Finish OpenTelemetry plugin span (with performance guard)
1227+
if otel_plugin and otel_plugin.finish_plugin_span and
1228+
api_ctx.otel_trace_plugins then
1229+
otel_plugin.finish_plugin_span(api_ctx, plugins[i]["name"], phase, success,
1230+
error_msg)
1231+
end
1232+
11961233
if code or body then
11971234
if is_http then
11981235
if code >= 400 then
@@ -1226,11 +1263,30 @@ function _M.run_plugin(phase, plugins, api_ctx)
12261263
local phase_func = plugins[i][phase]
12271264
local conf = plugins[i + 1]
12281265
if phase_func and meta_filter(api_ctx, plugins[i]["name"], conf) then
1266+
-- Start OpenTelemetry plugin span (with performance guard)
1267+
if otel_plugin and otel_plugin.start_plugin_span and api_ctx.otel_trace_plugins then
1268+
otel_plugin.start_plugin_span(api_ctx, plugins[i]["name"], phase)
1269+
end
1270+
12291271
plugin_run = true
12301272
run_meta_pre_function(conf, api_ctx, plugins[i]["name"])
12311273
api_ctx._plugin_name = plugins[i]["name"]
1232-
phase_func(conf, api_ctx)
1274+
1275+
local success = true
1276+
local error_msg = nil
1277+
local ok, err = pcall(phase_func, conf, api_ctx)
1278+
if not ok then
1279+
success = false
1280+
error_msg = err
1281+
end
1282+
12331283
api_ctx._plugin_name = nil
1284+
1285+
-- Finish OpenTelemetry plugin span (with performance guard)
1286+
if otel_plugin and otel_plugin.finish_plugin_span and api_ctx.otel_trace_plugins then
1287+
otel_plugin.finish_plugin_span(api_ctx, plugins[i]["name"], phase, success,
1288+
error_msg)
1289+
end
12341290
end
12351291
end
12361292

apisix/plugins/opentelemetry.lua

Lines changed: 207 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,20 @@ local schema = {
182182
type = "string",
183183
minLength = 1,
184184
}
185+
},
186+
trace_plugins = {
187+
type = "boolean",
188+
description = "whether to trace individual plugin execution",
189+
default = false
190+
},
191+
plugin_span_kind = {
192+
type = "string",
193+
enum = {"internal", "server"},
194+
description = "span kind for plugin execution spans. "
195+
.. "Some observability providers may exclude internal spans from metrics "
196+
.. "and dashboards. Use 'server' if you need plugin spans included in "
197+
.. "service-level metrics.",
198+
default = "internal"
185199
}
186200
}
187201
}
@@ -306,6 +320,119 @@ local function inject_attributes(attributes, wanted_attributes, source, with_pre
306320
end
307321

308322

323+
-- Plugin tracing helper functions
324+
--- Start a child span describing one plugin phase execution.
-- The span is named "plugin.<plugin_name>.<phase>". It always carries the
-- plugin attributes; when `api_ctx.var` is present it also carries request,
-- route and service attributes read from `api_ctx`.
-- @param tracer object exposing `tracer:start(parent_context, name, opts)`
-- @param plugin_name name of the plugin being executed
-- @param phase name of the phase being executed
-- @param parent_context context handed to `tracer:start` as the parent
-- @param api_ctx request context; may be nil, in which case only the plugin
--        attributes are set and the span kind stays `span_kind.internal`
-- @return the first value returned by `tracer:start`
local function create_plugin_span(tracer, plugin_name, phase, parent_context, api_ctx)
    -- Use a descriptive span name that includes the plugin and the phase
    local span_name = string_format("plugin.%s.%s", plugin_name, phase)
    local attributes = {
        attr.string("apisix.plugin.name", plugin_name),
        attr.string("apisix.plugin.phase", phase),
        attr.string("apisix.plugin.operation", "execute"),
        -- Set resource name to distinguish plugin spans from main spans
        attr.string("resource.name", span_name)
    }

    -- Default span kind; overridden below when the request context carries one
    local kind = span_kind.internal

    -- Every read of api_ctx stays behind this guard: the attribute section
    -- already tolerated a nil api_ctx, so the span-kind lookup must as well
    -- instead of raising "attempt to index a nil value".
    if api_ctx then
        local vars = api_ctx.var
        if vars then
            -- Request attributes fall back to "unknown" when the variable is unset
            table.insert(attributes, attr.string("http.method", vars.method or "unknown"))
            table.insert(attributes, attr.string("http.target", vars.request_uri or "unknown"))
            table.insert(attributes, attr.string("net.host.name", vars.host or "unknown"))
            table.insert(attributes,
                         attr.string("http.user_agent", vars.http_user_agent or "unknown"))

            -- Add route information if available
            if api_ctx.route_id then
                table.insert(attributes, attr.string("apisix.route_id", api_ctx.route_id))
            end
            if api_ctx.route_name then
                table.insert(attributes, attr.string("apisix.route_name", api_ctx.route_name))
            end
            if api_ctx.curr_req_matched and api_ctx.curr_req_matched._path then
                table.insert(attributes,
                             attr.string("http.route", api_ctx.curr_req_matched._path))
            end

            -- Add service information if available
            if api_ctx.service_id then
                table.insert(attributes, attr.string("apisix.service_id", api_ctx.service_id))
            end
            if api_ctx.service_name then
                table.insert(attributes,
                             attr.string("apisix.service_name", api_ctx.service_name))
            end
        end

        if api_ctx.otel_plugin_span_kind then
            kind = api_ctx.otel_plugin_span_kind
        end
    end

    -- Create the child span under the supplied parent context. The result is
    -- bound to a local so exactly one value is returned to the caller.
    local ctx = tracer:start(parent_context, span_name, {
        kind = kind,
        attributes = attributes,
    })

    return ctx
end
377+
378+
379+
--- Close the span held by a plugin span context.
-- Does nothing when the context is missing or yields no span. When `success`
-- is falsy the span status is set to ERROR first, using `error_msg` or the
-- fallback text "plugin execution failed".
-- @param plugin_span_ctx context exposing `:span()`, or nil
-- @param success truthy when the plugin phase completed without error
-- @param error_msg optional status description used on failure
local function finish_plugin_span(plugin_span_ctx, success, error_msg)
    -- A missing context short-circuits to a falsy span, handled just below
    local span = plugin_span_ctx and plugin_span_ctx:span()
    if not span then
        return
    end

    if not success then
        local reason = error_msg or "plugin execution failed"
        span:set_status(span_status.ERROR, reason)
    end

    span:finish()
end
395+
396+
397+
-- Store plugin spans in api_ctx for cleanup
398+
--- Record a freshly started plugin span on the request context.
-- Spans are kept in `api_ctx.otel_plugin_spans[plugin_name][phase]` as a
-- `{ span_ctx = ..., finished = false }` entry; both table levels are created
-- on demand, and an existing entry for the same plugin and phase is replaced.
-- @param api_ctx request context table that owns the span registry
-- @param plugin_name first-level key
-- @param phase second-level key
-- @param span_ctx span context to remember
local function store_plugin_span(api_ctx, plugin_name, phase, span_ctx)
    local registry = api_ctx.otel_plugin_spans
    if not registry then
        registry = {}
        api_ctx.otel_plugin_spans = registry
    end

    -- Nesting by plugin and then phase keeps keys from colliding
    local by_phase = registry[plugin_name]
    if not by_phase then
        by_phase = {}
        registry[plugin_name] = by_phase
    end

    by_phase[phase] = { span_ctx = span_ctx, finished = false }
end
412+
413+
414+
--- Finish every plugin span that is still open and drop the registry.
-- Walks `api_ctx.otel_plugin_spans` (plugin -> phase -> entry) and calls
-- `finish()` on the span of each entry not yet marked finished, then sets
-- `api_ctx.otel_plugin_spans` to nil. No-op when the registry is absent.
-- @param api_ctx request context table that owns the span registry
local function cleanup_plugin_spans(api_ctx)
    local registry = api_ctx.otel_plugin_spans
    if not registry then
        return
    end

    for _, phases in pairs(registry) do
        if phases then
            for _, entry in pairs(phases) do
                -- Evaluates to the span context only for a usable, unfinished entry
                local pending_ctx = entry and not entry.finished and entry.span_ctx
                if pending_ctx then
                    local span = pending_ctx:span()
                    if span then
                        span:finish()
                    end
                end
            end
        end
    end

    api_ctx.otel_plugin_spans = nil
end
434+
435+
309436
function _M.rewrite(conf, api_ctx)
310437
local metadata = plugin.plugin_metadata(plugin_name)
311438
if metadata == nil then
@@ -323,7 +450,7 @@ function _M.rewrite(conf, api_ctx)
323450
return
324451
end
325452

326-
local span_name = vars.method
453+
local span_name = string_format("http.%s", vars.method)
327454

328455
local attributes = {
329456
attr.string("net.host.name", vars.host),
@@ -337,7 +464,7 @@ function _M.rewrite(conf, api_ctx)
337464
table.insert(attributes, attr.string("apisix.route_id", api_ctx.route_id))
338465
table.insert(attributes, attr.string("apisix.route_name", api_ctx.route_name))
339466
table.insert(attributes, attr.string("http.route", api_ctx.curr_req_matched._path))
340-
span_name = span_name .. " " .. api_ctx.curr_req_matched._path
467+
span_name = string_format("http.%s %s", vars.method, api_ctx.curr_req_matched._path)
341468
end
342469

343470
if api_ctx.service_id then
@@ -378,6 +505,23 @@ function _M.rewrite(conf, api_ctx)
378505

379506
api_ctx.otel_context_token = ctx:attach()
380507

508+
-- Store tracer and configuration for plugin tracing
509+
if conf.trace_plugins then
510+
api_ctx.otel_tracer = tracer
511+
api_ctx.otel_main_context = ctx
512+
api_ctx.otel_trace_plugins = true
513+
514+
-- Map string span kind to span_kind constant
515+
local kind_mapping = {
516+
internal = span_kind.internal,
517+
server = span_kind.server,
518+
}
519+
local span_kind_value = conf.plugin_span_kind or "internal"
520+
api_ctx.otel_plugin_span_kind = kind_mapping[span_kind_value] or span_kind.internal
521+
522+
-- Note: We don't create a span for the OpenTelemetry plugin itself to avoid recursion
523+
end
524+
381525
-- inject trace context into the headers of upstream HTTP request
382526
trace_context_propagator:inject(ctx, ngx.req)
383527
end
@@ -400,6 +544,14 @@ function _M.delayed_body_filter(conf, api_ctx)
400544
span:set_attributes(attr.int("http.status_code", upstream_status))
401545

402546
span:finish()
547+
548+
-- Finish OpenTelemetry plugin span if it exists
549+
if api_ctx.otel_trace_plugins then
550+
_M.finish_plugin_span(api_ctx, "opentelemetry", "rewrite", true, nil)
551+
end
552+
553+
-- Cleanup plugin spans
554+
cleanup_plugin_spans(api_ctx)
403555
end
404556
end
405557

@@ -419,7 +571,60 @@ function _M.log(conf, api_ctx)
419571
end
420572

421573
span:finish()
574+
575+
-- Cleanup plugin spans (guaranteed cleanup on request end)
576+
cleanup_plugin_spans(api_ctx)
577+
end
578+
end
579+
580+
581+
-- Public functions for plugin tracing integration
582+
--- Start and register a span for one plugin phase.
-- Returns nil without creating anything when plugin tracing is not enabled on
-- `api_ctx`, when no tracer or main context has been stored on it, or when
-- the plugin is "opentelemetry" itself.
-- @param api_ctx request context carrying `otel_trace_plugins`,
--        `otel_tracer` and `otel_main_context`
-- @param plugin_name name of the plugin about to run
-- @param phase name of the phase about to run
-- @return the new plugin span context, or nil when nothing was started
function _M.start_plugin_span(api_ctx, plugin_name, phase)
    local tracer = api_ctx.otel_tracer
    if not (api_ctx.otel_trace_plugins and tracer) then
        return nil
    end

    -- The OpenTelemetry plugin never traces itself
    if plugin_name == "opentelemetry" then
        return nil
    end

    -- Plugin spans always hang off the stored main request context
    local main_ctx = api_ctx.otel_main_context
    if not main_ctx then
        return nil
    end

    local span_ctx = create_plugin_span(tracer, plugin_name, phase, main_ctx, api_ctx)
    store_plugin_span(api_ctx, plugin_name, phase, span_ctx)

    return span_ctx
end
604+
605+
606+
--- Finish the span previously registered for one plugin phase.
-- Silently returns when plugin tracing is not enabled on `api_ctx`, when no
-- spans are registered, when the plugin is "opentelemetry" itself, or when
-- the matching entry is missing or already finished. Otherwise the span is
-- finished and its entry is marked as finished.
-- @param api_ctx request context carrying `otel_trace_plugins` and
--        `otel_plugin_spans`
-- @param plugin_name name of the plugin that just ran
-- @param phase name of the phase that just ran
-- @param success truthy when the phase completed without error
-- @param error_msg optional description passed on when `success` is falsy
function _M.finish_plugin_span(api_ctx, plugin_name, phase, success, error_msg)
    local registry = api_ctx.otel_plugin_spans
    if not api_ctx.otel_trace_plugins or not registry then
        return
    end

    -- The OpenTelemetry plugin never traces itself
    if plugin_name == "opentelemetry" then
        return
    end

    local phases = registry[plugin_name]
    local entry = phases and phases[phase]
    if not entry or entry.finished then
        return
    end

    finish_plugin_span(entry.span_ctx, success, error_msg)
    entry.finished = true
end
424629

425630

0 commit comments

Comments
 (0)