Skip to content

Commit ec03fab

Browse files
olegbet and obetsun authored
KAR-616: setup kubearchive logging kflux-ocp-p01 config (#8277)
* KAR-616: setup kubearchive logging kflux-ocp-p01 config Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED * correct AWS annotation and kubearchive-logging configmap for kflux-ocp-p01 Signed-off-by: obetsun <[email protected]> rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED --------- Co-authored-by: obetsun <[email protected]>
1 parent bdf27b1 commit ec03fab

File tree

10 files changed

+575
-2
lines changed

10 files changed

+575
-2
lines changed

argo-cd-apps/base/member/infra-deployments/vector-kubearchive-log-collector/vector-kubearchive-log-collector.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ spec:
2121
- nameNormalized: stone-stg-rh01
2222
values.clusterDir: stone-stg-rh01
2323
# Private
24-
# - nameNormalized: kflux-ocp-p01
25-
# values.clusterDir: kflux-ocp-p01
24+
- nameNormalized: kflux-ocp-p01
25+
values.clusterDir: kflux-ocp-p01
2626
# - nameNormalized: stone-prod-p01
2727
# values.clusterDir: stone-prod-p01
2828
- nameNormalized: stone-prod-p02
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
---
2+
apiVersion: external-secrets.io/v1beta1
3+
kind: ExternalSecret
4+
metadata:
5+
name: kubearchive-logging
6+
namespace: product-kubearchive
7+
annotations:
8+
argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
9+
argocd.argoproj.io/sync-wave: "-1"
10+
spec:
11+
dataFrom:
12+
- extract:
13+
key: production/kubearchive/logging
14+
refreshInterval: 1h
15+
secretStoreRef:
16+
kind: ClusterSecretStore
17+
name: appsre-stonesoup-vault
18+
target:
19+
creationPolicy: Owner
20+
deletionPolicy: Delete
21+
name: kubearchive-logging
22+
template:
23+
metadata:
24+
annotations:
25+
argocd.argoproj.io/sync-options: Prune=false
26+
argocd.argoproj.io/compare-options: IgnoreExtraneous

components/kubearchive/production/kflux-ocp-p01/kustomization.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,42 @@ resources:
88

99
namespace: product-kubearchive
1010

11+
# Generate kubearchive-logging ConfigMap with hash for automatic restarts
12+
# Due to quoting limitations of generators, each value must be supplied as a literal block scalar (|)
13+
# See https://github.com/kubernetes-sigs/kustomize/issues/4845#issuecomment-1671570428
14+
configMapGenerator:
15+
- name: kubearchive-logging
16+
literals:
17+
- |
18+
POD_ID=cel:metadata.uid
19+
- |
20+
NAMESPACE=cel:metadata.namespace
21+
- |
22+
START=cel:status.?startTime == optional.none() ? int(now()-duration('1h'))*1000000000: status.startTime
23+
- |
24+
END=cel:status.?startTime == optional.none() ? int(now()+duration('1h'))*1000000000: int(timestamp(status.startTime)+duration('6h'))*1000000000
25+
- |
26+
LOG_URL=http://loki-gateway.product-kubearchive-logging.svc.cluster.local:80/loki/api/v1/query_range?query=%7Bstream%3D%22{NAMESPACE}%22%7D%20%7C%20pod_id%20%3D%20%60{POD_ID}%60%20%7C%20container%20%3D%20%60{CONTAINER_NAME}%60&start={START}&end={END}&direction=forward
27+
- |
28+
LOG_URL_JSONPATH=$.data.result[*].values[*][1]
29+
1130
patches:
31+
- patch: |-
32+
$patch: delete
33+
apiVersion: v1
34+
kind: ConfigMap
35+
metadata:
36+
name: kubearchive-logging
37+
namespace: kubearchive
38+
39+
- patch: |-
40+
$patch: delete
41+
apiVersion: v1
42+
kind: Secret
43+
metadata:
44+
name: kubearchive-logging
45+
namespace: kubearchive
46+
1247
- patch: |-
1348
apiVersion: batch/v1
1449
kind: Job
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
4+
commonAnnotations:
5+
ignore-check.kube-linter.io/drop-net-raw-capability: |
6+
"Vector runs requires access to socket."
7+
ignore-check.kube-linter.io/run-as-non-root: |
8+
"Vector runs as Root and attach host Path."
9+
ignore-check.kube-linter.io/sensitive-host-mounts: |
10+
"Vector runs requires certain host mounts to watch files being created by pods."
11+
ignore-check.kube-linter.io/pdb-unhealthy-pod-eviction-policy: |
12+
"Managed by upstream Loki chart (no value exposed for unhealthyPodEvictionPolicy)."
13+
14+
resources:
15+
- ../base
16+
17+
generators:
18+
- vector-helm-generator.yaml
19+
- loki-helm-generator.yaml
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
apiVersion: builtin
2+
kind: HelmChartInflationGenerator
3+
metadata:
4+
name: loki
5+
name: loki
6+
repo: https://grafana.github.io/helm-charts
7+
version: 6.30.1
8+
releaseName: loki
9+
namespace: product-kubearchive-logging
10+
valuesFile: loki-helm-values.yaml
11+
additionalValuesFiles:
12+
- loki-helm-prod-values.yaml
13+
valuesInline:
14+
# Cluster-specific overrides
15+
serviceAccount:
16+
create: true
17+
name: loki-sa
18+
annotations:
19+
eks.amazonaws.com/role-arn: "arn:aws:iam::442042531708:role/kflux-ocp-p01-loki-storage-role"
20+
loki:
21+
storage:
22+
bucketNames:
23+
chunks: kflux-ocp-p01-loki-storage
24+
admin: kflux-ocp-p01-loki-storage
25+
storage_config:
26+
aws:
27+
bucketnames: kflux-ocp-p01-loki-storage
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
---
2+
gateway:
3+
service:
4+
type: LoadBalancer
5+
resources:
6+
requests:
7+
cpu: 100m
8+
memory: 128Mi
9+
limits:
10+
memory: 256Mi
11+
12+
# Basic Loki configuration with S3 storage
13+
loki:
14+
commonConfig:
15+
replication_factor: 3
16+
# Required storage configuration for Helm chart
17+
storage:
18+
type: s3
19+
# bucketNames: Fill it on the generator for each cluster
20+
s3:
21+
region: us-east-1
22+
storage_config:
23+
aws:
24+
# bucketnames: Fill it on the generator for each cluster
25+
region: us-east-1
26+
s3forcepathstyle: false
27+
# Configure ingestion limits to handle Vector's data volume
28+
limits_config:
29+
retention_period: 744h # 31 days retention
30+
ingestion_rate_mb: 50
31+
ingestion_burst_size_mb: 100
32+
ingestion_rate_strategy: "local"
33+
max_streams_per_user: 0
34+
max_line_size: 2097152
35+
per_stream_rate_limit: 50M
36+
per_stream_rate_limit_burst: 200M
37+
reject_old_samples: false
38+
reject_old_samples_max_age: 168h
39+
discover_service_name: []
40+
discover_log_levels: false
41+
volume_enabled: true
42+
max_global_streams_per_user: 75000
43+
max_entries_limit_per_query: 100000
44+
increment_duplicate_timestamp: true
45+
allow_structured_metadata: true
46+
ingester:
47+
chunk_target_size: 8388608 # 8MB
48+
chunk_idle_period: 5m
49+
max_chunk_age: 2h
50+
chunk_encoding: snappy # Compress data (reduces S3 transfer size)
51+
chunk_retain_period: 1h # Keep chunks in memory after flush
52+
flush_op_timeout: 10m # Add timeout for S3 operations
53+
54+
# Tuning for high-load queries
55+
querier:
56+
max_concurrent: 8
57+
query_range:
58+
# split_queries_by_interval is deprecated in Loki 3.x, so it has been removed from this block
59+
parallelise_shardable_queries: true
60+
61+
# Distributed components configuration
62+
ingester:
63+
replicas: 3
64+
autoscaling:
65+
enabled: true
66+
zoneAwareReplication:
67+
enabled: true
68+
maxUnavailable: 1
69+
resources:
70+
requests:
71+
cpu: 500m
72+
memory: 1Gi
73+
limits:
74+
cpu: 2000m
75+
memory: 2Gi
76+
persistence:
77+
enabled: true
78+
size: 10Gi
79+
affinity: {}
80+
podAntiAffinity:
81+
soft: {}
82+
hard: {}
83+
84+
querier:
85+
replicas: 3
86+
autoscaling:
87+
enabled: true
88+
maxUnavailable: 1
89+
resources:
90+
requests:
91+
cpu: 300m
92+
memory: 512Mi
93+
limits:
94+
memory: 1Gi
95+
affinity: {}
96+
97+
queryFrontend:
98+
replicas: 2
99+
maxUnavailable: 1
100+
resources:
101+
requests:
102+
cpu: 200m
103+
memory: 256Mi
104+
limits:
105+
memory: 512Mi
106+
107+
queryScheduler:
108+
replicas: 2
109+
maxUnavailable: 1
110+
resources:
111+
requests:
112+
cpu: 200m
113+
memory: 256Mi
114+
limits:
115+
memory: 512Mi
116+
117+
distributor:
118+
replicas: 3
119+
autoscaling:
120+
enabled: true
121+
maxUnavailable: 1
122+
resources:
123+
requests:
124+
cpu: 300m
125+
memory: 512Mi
126+
limits:
127+
memory: 1Gi
128+
affinity: {}
129+
130+
compactor:
131+
replicas: 1
132+
retention_enabled: true
133+
retention_delete_delay: 2h
134+
retention_delete_worker_count: 150
135+
resources:
136+
requests:
137+
cpu: 200m
138+
memory: 512Mi
139+
limits:
140+
memory: 1Gi
141+
142+
indexGateway:
143+
replicas: 2
144+
maxUnavailable: 0
145+
resources:
146+
requests:
147+
cpu: 300m
148+
memory: 512Mi
149+
limits:
150+
memory: 1Gi
151+
affinity: {}
152+
153+
# Enable Memcached caches for performance
154+
chunksCache:
155+
enabled: true
156+
replicas: 1
157+
158+
resultsCache:
159+
enabled: true
160+
replicas: 1
161+
162+
memcached:
163+
enabled: true
164+
165+
memcachedResults:
166+
enabled: true
167+
168+
memcachedChunks:
169+
enabled: true
170+
171+
memcachedFrontend:
172+
enabled: true
173+
174+
memcachedIndexQueries:
175+
enabled: true
176+
177+
memcachedIndexWrites:
178+
enabled: true
179+
180+
# Disable Minio - object storage is S3, accessed with an IAM role
181+
minio:
182+
enabled: false
183+
184+
# Resources for memcached exporter to satisfy linter
185+
memcachedExporter:
186+
resources:
187+
requests:
188+
cpu: 50m
189+
memory: 64Mi
190+
limits:
191+
memory: 128Mi

0 commit comments

Comments
 (0)