
Commit 158a8d8

feat: add Prometheus support (#74)
- docs: doc around metrics
- feat: added prometheus dependency
- define metric instrument interface
- feat: init prometheus collector
- feat: implement stubs for metric instrument
- feat: interface established for metrics
- update gauge to type interface for less type casting
- update server.go and handler.go to use new instrumentation scheme
- feat: update to make instrumentation static
- changes in metric instrument signature
- feat: add cast dependency for converting values reliably
- feat: implement interface methods
- feat: add cast dependency update
- feat: implement prometheus methods
- feat: run server in a go routine
- fix: correct error return format
- fix: add missing label for count metric
- feat: added log for metric server shut down
- feat: convert decrement counter to a different metric for calculation during observation
- feat: removed unused decrement counter
- feat: bump raccoon to version 1.15
- update sample env files to include new metric config
- feat: bump to version 1.16 for golang
- bump golang version to 1.18
- fix: remove usage of errors.Join since it requires go >= 1.20
- feat: bump docker golang version to 1.20
- revert: docker changes for protoc
- feat: bump golang version to 1.18 in go.mod
- fix: label inconsistencies
- feat: update setup-go version
- feat: set prometheus as the default for metrics in test
- feat: remove telegraf dependency from docker compose
- feat: added setup go
- run in same image
- feat: added config values documentation
- feat: upgrade running image to debian:bookworm
- feat: added buckets according to approximate metric ranges
- update docs with missing metrics
- refactor: set unused locally rather than using mute options
- refactor: formatting and uniformity changes
- feat: add logging for any error encountered during metric scrape
- fix: early return while registering metrics
- feat: remove 1 as bucket resolution due to low probability of the bucket filling
- feat: add interface abstraction for mocking
- feat: added prometheus tests
- introduce delay in metric server initialisation
- feat: added metric tests
- feat: add metric test
- remove test for statsD setup
- feat: move to config instead of loading from env
- refactor: refactor statsD implementation
- refactor: cleanup metrics.md and fix issues
- added documentation
- feat: improved text on logging and help of metrics
- feat: update go.mod
- feat: add common config for recording runtime stats
- record err in case of casting error
- feat: add support for error when a value cannot be cast; added tests for the same
- fix: metric name fetch
1 parent db7f808 commit 158a8d8

29 files changed (+1381, -324 lines)
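Several of the commits above revolve around a metric instrument interface that lets StatsD and Prometheus backends be swapped behind one set of static functions. A minimal sketch of that idea follows; the interface and type names here are illustrative, not the actual Raccoon API:

```go
package main

import "fmt"

// MetricInstrument is a hypothetical abstraction of the kind the
// commit introduces: one interface that both a statsd client and a
// prometheus collector could implement.
type MetricInstrument interface {
	Count(metricName string, count int64, labels map[string]string) error
	Gauge(metricName string, value interface{}, labels map[string]string) error
	Histogram(metricName string, value int64, labels map[string]string) error
}

// consoleInstrument is a trivial backend used only for illustration;
// it prints each observation instead of exporting it.
type consoleInstrument struct{}

func (c consoleInstrument) Count(name string, n int64, labels map[string]string) error {
	fmt.Printf("count %s += %d %v\n", name, n, labels)
	return nil
}

func (c consoleInstrument) Gauge(name string, v interface{}, labels map[string]string) error {
	fmt.Printf("gauge %s = %v %v\n", name, v, labels)
	return nil
}

func (c consoleInstrument) Histogram(name string, v int64, labels map[string]string) error {
	fmt.Printf("histogram %s observe %d %v\n", name, v, labels)
	return nil
}

func main() {
	// callers depend only on the interface, never on the backend
	var m MetricInstrument = consoleInstrument{}
	m.Count("kafka_messages_delivered_total", 1, map[string]string{"success": "true"})
}
```

Returning an error from each method matches the "add support for error in case of unable to cast" commits: a backend that must coerce `interface{}` values can report a cast failure instead of silently dropping the sample.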

.env.sample (+5, -1)

@@ -17,6 +17,8 @@ SERVER_CORS_ALLOWED_HEADERS=""
 
 SERVER_GRPC_PORT=8081
+
+
 WORKER_BUFFER_CHANNEL_SIZE=5
 WORKER_BUFFER_FLUSH_TIMEOUT_MS=5000
 WORKER_POOL_SIZE=5
@@ -32,7 +34,9 @@ PUBLISHER_KAFKA_CLIENT_STATISTICS_INTERVAL_MS=5000
 PUBLISHER_KAFKA_CLIENT_QUEUE_BUFFERING_MAX_MESSAGES=100000
 PUBLISHER_KAFKA_FLUSH_INTERVAL_MS=1000
 
+METRIC_RUNTIME_STATS_RECORD_INTERVAL_MS=1000
+METRIC_PROMETHEUS_ENABLED="true"
 METRIC_STATSD_ADDRESS=":8125"
-METRIC_STATSD_FLUSH_PERIOD_MS=100
+METRIC_STATSD_FLUSH_PERIOD_MS=1000
 
 LOG_LEVEL="info"

.env.test (+3, -1)

@@ -33,7 +33,9 @@ PUBLISHER_KAFKA_CLIENT_STATISTICS_INTERVAL_MS=5000
 PUBLISHER_KAFKA_CLIENT_QUEUE_BUFFERING_MAX_MESSAGES=100000
 PUBLISHER_KAFKA_FLUSH_INTERVAL_MS=1000
 
+METRIC_RUNTIME_STATS_RECORD_INTERVAL_MS=1000
+METRIC_PROMETHEUS_ENABLED="true"
 METRIC_STATSD_ADDRESS=":8125"
-METRIC_STATSD_FLUSH_PERIOD_MS=100
+METRIC_STATSD_FLUSH_PERIOD_MS=1000
 
 LOG_LEVEL="info"

.github/workflows/build.yaml (+4, -4)

@@ -9,9 +9,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Setup Go
-        uses: actions/setup-go@v2.1.3
+        uses: actions/setup-go@v3
         with:
-          go-version: "1.14"
+          go-version: "1.18"
       - name: Checkout repo
         uses: actions/checkout@v2
         with:
@@ -25,9 +25,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Setup Go
-        uses: actions/setup-go@v2.1.3
+        uses: actions/setup-go@v3
         with:
-          go-version: "1.14"
+          go-version: "1.18"
       - name: Install Protoc
         uses: arduino/setup-protoc@v1
       - uses: actions/checkout@v2

.github/workflows/integration-test.yaml (+4)

@@ -14,6 +14,10 @@ jobs:
         uses: arduino/setup-protoc@v1
       - name: Checkout repo
         uses: actions/checkout@v2
+      - name: Setup Go
+        uses: actions/setup-go@v3
+        with:
+          go-version: "1.18"
       - name: Copy integration config
         run: cp .env.test .env
       - name: Run Raccoon

Dockerfile (+3, -2)

@@ -1,4 +1,4 @@
-FROM golang:1.14
+FROM golang:1.18
 
 WORKDIR /app
 RUN apt-get update && apt-get install unzip --no-install-recommends --assume-yes
@@ -10,7 +10,8 @@ RUN PROTOC_ZIP=protoc-3.17.3-linux-x86_64.zip && \
 COPY . .
 RUN make build
 
-FROM debian:buster-slim
+
+FROM debian:bookworm-slim
 WORKDIR /app
 COPY --from=0 /app/raccoon ./raccoon
 COPY . .

README.md (+1, -1)

@@ -77,7 +77,7 @@ You can consume the published events from the host machine by using `localhost:9
 
 Prerequisite:
 
-- You need to have [GO](https://golang.org/) 1.14 or above installed
+- You need to have [GO](https://golang.org/) 1.18 or above installed
 - You need `protoc` [installed](https://github.com/protocolbuffers/protobuf#protocol-compiler-installation)
 
 ```sh

app/server.go (+11, -12)

@@ -63,7 +63,7 @@ func shutDownServer(ctx context.Context, cancel context.CancelFunc, httpServices
 			   Until then we fall back to approximation */
 			eventsInChannel := len(bufferChannel) * 7
 			logger.Info(fmt.Sprintf("Outstanding unprocessed events in the channel, data lost ~ (No batches %d * 5 events) = ~%d", len(bufferChannel), eventsInChannel))
-			metrics.Count("kafka_messages_delivered_total", eventsInChannel+eventsInProducer, "success=false")
+			metrics.Count("kafka_messages_delivered_total", int64(eventsInChannel+eventsInProducer), map[string]string{"success": "false", "conn_group": "NA", "event_type": "NA"})
 			logger.Info("Exiting server")
 			cancel()
 		default:
@@ -73,20 +73,19 @@ func shutDownServer(ctx context.Context, cancel context.CancelFunc, httpServices
 }
 
 func reportProcMetrics() {
-	t := time.Tick(config.MetricStatsd.FlushPeriodMs)
+	t := time.Tick(config.MetricInfo.RuntimeStatsRecordInterval)
 	m := &runtime.MemStats{}
 	for {
 		<-t
-		metrics.Gauge("server_go_routines_count_current", runtime.NumGoroutine(), "")
-
+		metrics.Gauge("server_go_routines_count_current", runtime.NumGoroutine(), map[string]string{})
 		runtime.ReadMemStats(m)
-		metrics.Gauge("server_mem_heap_alloc_bytes_current", m.HeapAlloc, "")
-		metrics.Gauge("server_mem_heap_inuse_bytes_current", m.HeapInuse, "")
-		metrics.Gauge("server_mem_heap_objects_total_current", m.HeapObjects, "")
-		metrics.Gauge("server_mem_stack_inuse_bytes_current", m.StackInuse, "")
-		metrics.Gauge("server_mem_gc_triggered_current", m.LastGC/1000, "")
-		metrics.Gauge("server_mem_gc_pauseNs_current", m.PauseNs[(m.NumGC+255)%256]/1000, "")
-		metrics.Gauge("server_mem_gc_count_current", m.NumGC, "")
-		metrics.Gauge("server_mem_gc_pauseTotalNs_current", m.PauseTotalNs, "")
+		metrics.Gauge("server_mem_heap_alloc_bytes_current", m.HeapAlloc, map[string]string{})
+		metrics.Gauge("server_mem_heap_inuse_bytes_current", m.HeapInuse, map[string]string{})
+		metrics.Gauge("server_mem_heap_objects_total_current", m.HeapObjects, map[string]string{})
+		metrics.Gauge("server_mem_stack_inuse_bytes_current", m.StackInuse, map[string]string{})
+		metrics.Gauge("server_mem_gc_triggered_current", m.LastGC/1000, map[string]string{})
+		metrics.Gauge("server_mem_gc_pauseNs_current", m.PauseNs[(m.NumGC+255)%256]/1000, map[string]string{})
+		metrics.Gauge("server_mem_gc_count_current", m.NumGC, map[string]string{})
+		metrics.Gauge("server_mem_gc_pauseTotalNs_current", m.PauseTotalNs, map[string]string{})
 	}
 }
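The visible change in this diff is the label signature: the old calls passed a statsd-style tag string (`"success=false"`, `""`), while the new calls pass a `map[string]string`, which maps directly onto Prometheus label sets. A stdlib-only toy illustrating why a canonical label map is needed — not the actual raccoon `metrics` package — might look like this:

```go
package main

import (
	"fmt"
	"runtime"
	"sort"
)

// gauges is a toy in-memory store keyed by metric name plus a
// canonical serialization of the label map.
var gauges = map[string]float64{}

// key serializes labels in sorted order so that the same label set
// always maps to the same series, regardless of map iteration order.
func key(name string, labels map[string]string) string {
	ks := make([]string, 0, len(labels))
	for k := range labels {
		ks = append(ks, k)
	}
	sort.Strings(ks)
	s := name
	for _, k := range ks {
		s += fmt.Sprintf("|%s=%s", k, labels[k])
	}
	return s
}

// Gauge mimics the metrics.Gauge(name, value, labels) shape above;
// the interface{} value covers the int/uint32/uint64 mix that
// runtime.MemStats produces (the commit pulls in a cast dependency
// for exactly this kind of conversion).
func Gauge(name string, value interface{}, labels map[string]string) {
	switch v := value.(type) {
	case int:
		gauges[key(name, labels)] = float64(v)
	case uint32:
		gauges[key(name, labels)] = float64(v)
	case uint64:
		gauges[key(name, labels)] = float64(v)
	}
}

func main() {
	Gauge("server_go_routines_count_current", runtime.NumGoroutine(), map[string]string{})
	m := &runtime.MemStats{}
	runtime.ReadMemStats(m)
	Gauge("server_mem_heap_alloc_bytes_current", m.HeapAlloc, map[string]string{})
	fmt.Println(len(gauges)) // 2 distinct series
}
```
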

config/load.go (+3)

@@ -21,13 +21,16 @@ func Load() {
 	viper.ReadInConfig()
 
 	logConfigLoader()
+
 	publisherKafkaConfigLoader()
 	serverConfigLoader()
 	serverWsConfigLoader()
 	serverGRPCConfigLoader()
 	serverCorsConfigLoader()
 	workerConfigLoader()
+	metricCommonConfigLoader()
 	metricStatsdConfigLoader()
+	metricPrometheusConfigLoader()
 	eventDistributionConfigLoader()
 	eventConfigLoader()
 }

config/metric.go (+33)

@@ -9,17 +9,50 @@ import (
 )
 
 var MetricStatsd metricStatsdCfg
+var MetricPrometheus metricPrometheusCfg
+var MetricInfo metricInfoCfg
 
 type metricStatsdCfg struct {
+	Enabled       bool
 	Address       string
 	FlushPeriodMs time.Duration
 }
 
+type metricPrometheusCfg struct {
+	Enabled bool
+	Port    int
+	Path    string
+}
+
+type metricInfoCfg struct {
+	RuntimeStatsRecordInterval time.Duration
+}
+
 func metricStatsdConfigLoader() {
+	viper.SetDefault("METRIC_STATSD_ENABLED", false)
 	viper.SetDefault("METRIC_STATSD_ADDRESS", ":8125")
 	viper.SetDefault("METRIC_STATSD_FLUSH_PERIOD_MS", 10000)
 	MetricStatsd = metricStatsdCfg{
+		Enabled:       util.MustGetBool("METRIC_STATSD_ENABLED"),
 		Address:       util.MustGetString("METRIC_STATSD_ADDRESS"),
 		FlushPeriodMs: util.MustGetDuration("METRIC_STATSD_FLUSH_PERIOD_MS", time.Millisecond),
 	}
 }
+
+func metricPrometheusConfigLoader() {
+	viper.SetDefault("METRIC_PROMETHEUS_ENABLED", false)
+	viper.SetDefault("METRIC_PROMETHEUS_PORT", 9090)
+	viper.SetDefault("METRIC_PROMETHEUS_PATH", "/metrics")
+	MetricPrometheus = metricPrometheusCfg{
+		Enabled: util.MustGetBool("METRIC_PROMETHEUS_ENABLED"),
+		Port:    util.MustGetInt("METRIC_PROMETHEUS_PORT"),
+		Path:    util.MustGetString("METRIC_PROMETHEUS_PATH"),
+	}
+}
+
+func metricCommonConfigLoader() {
+	viper.SetDefault("METRIC_RUNTIME_STATS_RECORD_INTERVAL_MS", 10000)
+	MetricInfo = metricInfoCfg{
+		RuntimeStatsRecordInterval: util.MustGetDuration("METRIC_RUNTIME_STATS_RECORD_INTERVAL_MS", time.Millisecond),
+	}
+}

docker-compose.yml (+12, -10)

@@ -3,6 +3,7 @@ version: '3.9'
 networks:
   cs-network:
 
+
 services:
   zookeeper:
     image: confluentinc/cp-zookeeper:5.1.2
@@ -41,14 +42,14 @@ services:
   cs:
     build:
       context: .
-    command: ["/bin/sh", "-c", "./raccoon"]
+    command: [ "/bin/sh", "-c", "./raccoon" ]
     hostname: cs
     container_name: cs
     stdin_open: true
     tty: true
     depends_on:
       - kafka
-      - telegraf
+      # - telegraf
     environment:
       SERVER_WEBSOCKET_PORT: "8080"
       SERVER_WEBSOCKET_CHECK_ORIGIN: "true"
@@ -74,6 +75,7 @@ services:
       PUBLISHER_KAFKA_CLIENT_STATISTICS_INTERVAL_MS: 5000
       PUBLISHER_KAFKA_CLIENT_QUEUE_BUFFERING_MAX_MESSAGES: 100000
       PUBLISHER_KAFKA_FLUSH_INTERVAL_MS: 1000
+      METRIC_PROMETHEUS_ENABLED: "true"
       METRIC_STATSD_ADDRESS: "telegraf:8125"
       METRIC_STATSD_FLUSH_PERIOD_MS: 100
       LOG_LEVEL: "info"
@@ -82,11 +84,11 @@ services:
       - "8081:8081"
     networks:
       - cs-network
-  telegraf:
-    image: telegraf
-    volumes:
-      - ./.telegraf.sample.conf:/etc/telegraf/telegraf.conf:ro
-    ports:
-      - "8125:8125"
-    networks:
-      - cs-network
+  # telegraf:
+  #   image: telegraf
+  #   volumes:
+  #     - ./.telegraf.sample.conf:/etc/telegraf/telegraf.conf:ro
+  #   ports:
+  #     - "8125:8125"
+  #   networks:
+  #     - cs-network

docs/docs/reference/configurations.md (+35)

@@ -249,6 +249,20 @@ Upon shutdown, the publisher will try to finish processing events in buffer befo
 
 ## Metric
 
+### `METRIC_RUNTIME_STATS_RECORD_INTERVAL_MS`
+
+The time interval between recordings of the application's runtime stats in the instrumentation. It is recommended to keep this value equal to the flush interval when using statsd, and to your collector's scrape interval when using prometheus.
+
+- Type `Optional`
+- Default value: `10000`
+
+### `METRIC_STATSD_ENABLED`
+
+Flag to enable export of statsd metrics.
+
+- Type `Optional`
+- Default value: `false`
+
 ### `METRIC_STATSD_ADDRESS`
 
 Address to reports the service metrics.
@@ -263,6 +277,27 @@ Interval for the service to push metrics.
 - Type `Optional`
 - Default value: `10000`
 
+### `METRIC_PROMETHEUS_ENABLED`
+
+Flag to enable a prometheus HTTP server that exposes metrics.
+
+- Type `Optional`
+- Default value: `false`
+
+### `METRIC_PROMETHEUS_PATH`
+
+The path at which the prometheus server serves metrics.
+
+- Type `Optional`
+- Default value: `/metrics`
+
+### `METRIC_PROMETHEUS_PORT`
+
+The port on which the prometheus server listens for metric scrape requests.
+
+- Type `Optional`
+- Default value: `9090`
+
 ## Log
 
 ### `LOG_LEVEL`
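Per the commit messages, the Prometheus endpoint built from `METRIC_PROMETHEUS_PORT` and `METRIC_PROMETHEUS_PATH` runs in its own goroutine and logs when it shuts down. A stdlib-only sketch of that wiring (a real setup would mount `promhttp.Handler()` from `github.com/prometheus/client_golang`; the handler body and helper names here are placeholders):

```go
package main

import (
	"context"
	"fmt"
	"io"
	"log"
	"net"
	"net/http"
	"time"
)

// listenAddr builds the listen address from METRIC_PROMETHEUS_PORT
func listenAddr(port int) string { return fmt.Sprintf(":%d", port) }

func main() {
	mux := http.NewServeMux()
	// placeholder for promhttp.Handler() mounted at METRIC_PROMETHEUS_PATH
	mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "# placeholder exposition")
	})
	srv := &http.Server{Handler: mux}

	// bind an ephemeral port so this sketch runs anywhere;
	// the service itself would use listenAddr(9090)
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		log.Fatal(err)
	}
	go srv.Serve(ln) // run the metric server in a goroutine

	// scrape it once, as a prometheus collector would
	resp, err := http.Get("http://" + ln.Addr().String() + "/metrics")
	if err != nil {
		log.Fatal(err)
	}
	body, _ := io.ReadAll(resp.Body)
	resp.Body.Close()
	fmt.Print(string(body))

	// graceful stop, with the shutdown log the commit adds
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	srv.Shutdown(ctx)
	log.Println("metric server shut down")
}
```
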

docs/docs/reference/metrics.md (+46, -1)

@@ -53,15 +53,30 @@ Duration of alive connection per session per connection
 - Type: `Timing`
 - Tags: `conn_group=*`
 
+### `conn_close_err_count`
+
+Number of connection close errors encountered
+
+- Type: `Count`
+- Tags: NA
+
 ## Kafka Publisher
 
 ### `kafka_messages_delivered_total`
 
-Number of delivered events to Kafka
+Number of events delivered to Kafka. This metric also contains false increments; to find the true value, take the difference between this metric and `kafka_messages_undelivered_total` for the same tags/labels.
+
+- Type: `Count`
+- Tags: `success=false` `success=true` `conn_group=*` `event_type=*`
+
+### `kafka_messages_undelivered_total`
+
+The count of false increments made by `kafka_messages_delivered_total`. To be used in conjunction with the former for accurate accounting.
 
 - Type: `Count`
 - Tags: `success=false` `success=true` `conn_group=*` `event_type=*`
 
+
 ### `kafka_unknown_topic_failure_total`
 
 Number of delivery failure caused by topic does not exist in kafka.
@@ -102,6 +117,29 @@ Broker latency / round-trip time in microseconds
 - Type: `Gauge`
 - Tags: `broker=broker_nodes`
 
+### `ack_event_rtt_ms`
+
+Time taken from the ack function being called by the kafka producer to the ack being processed by the ack handler.
+
+- Type: `Timing`
+- Tags: NA
+
+### `event_rtt_ms`
+
+Time taken from an event being consumed from the queue to it being acked by the ack handler.
+
+- Type: `Timing`
+- Tags: NA
+
+### `kafka_producebulk_tt_ms`
+
+Response time of the produce batch method of the kafka producer
+
+- Type: `Timing`
+- Tags: NA
+
 ## Resource Usage
 
 ### `server_mem_gc_triggered_current`
@@ -178,6 +216,13 @@ Number of events received in requests
 - Type: `Count`
 - Tags: `conn_group=*` `event_type=*`
 
+### `events_duplicate_total`
+
+Number of duplicate events
+
+- Type: `Count`
+- Tags: `conn_group=*` `reason=*`
+
 ### `batches_read_total`
 
 Request count
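The docs above replace a decrement counter with a companion metric: instead of decrementing `kafka_messages_delivered_total` on a false increment, the service bumps `kafka_messages_undelivered_total`, and the consumer reconciles the two at observation time. The arithmetic is just a per-label-set subtraction (a hypothetical helper, not raccoon code):

```go
package main

import "fmt"

// trueDelivered reconciles the delivered counter with its
// false-increment companion, as described in the doc above:
// both values must come from the same tag/label set.
func trueDelivered(delivered, undelivered int64) int64 {
	return delivered - undelivered
}

func main() {
	// e.g. 120 recorded deliveries, 7 of which were false increments
	fmt.Println(trueDelivered(120, 7)) // 113
}
```

In PromQL this is the same idea expressed at query time, e.g. subtracting the two series matched on their shared labels. Counters-only designs like this fit Prometheus better than decrements, since counters are expected to be monotonic.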
