diff --git a/argo-cd-apps/base/member/infra-deployments/vector-kubearchive-log-collector/vector-kubearchive-log-collector.yaml b/argo-cd-apps/base/member/infra-deployments/vector-kubearchive-log-collector/vector-kubearchive-log-collector.yaml index e0b7309fd2f..80fc2ca9b44 100644 --- a/argo-cd-apps/base/member/infra-deployments/vector-kubearchive-log-collector/vector-kubearchive-log-collector.yaml +++ b/argo-cd-apps/base/member/infra-deployments/vector-kubearchive-log-collector/vector-kubearchive-log-collector.yaml @@ -32,12 +32,10 @@ spec: # Public # - nameNormalized: stone-prd-rh01 # values.clusterDir: stone-prd-rh01 - # - nameNormalized: kflux-prd-rh02 - # values.clusterDir: kflux-prd-rh02 - # - nameNormalized: kflux-prd-rh03 - # values.clusterDir: kflux-prd-rh03 - nameNormalized: kflux-rhel-p01 values.clusterDir: kflux-rhel-p01 + - nameNormalized: kflux-prd-rh03 + values.clusterDir: kflux-prd-rh03 template: metadata: name: vector-kubearchive-log-collector-{{nameNormalized}} diff --git a/components/kubearchive/production/kflux-prd-rh03/external-secret.yaml b/components/kubearchive/production/kflux-prd-rh03/external-secret.yaml new file mode 100644 index 00000000000..e44eb9db470 --- /dev/null +++ b/components/kubearchive/production/kflux-prd-rh03/external-secret.yaml @@ -0,0 +1,26 @@ +--- +apiVersion: external-secrets.io/v1beta1 +kind: ExternalSecret +metadata: + name: kubearchive-logging + namespace: product-kubearchive + annotations: + argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true + argocd.argoproj.io/sync-wave: "-1" +spec: + dataFrom: + - extract: + key: production/kubearchive/logging + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: appsre-stonesoup-vault + target: + creationPolicy: Owner + deletionPolicy: Delete + name: kubearchive-logging + template: + metadata: + annotations: + argocd.argoproj.io/sync-options: Prune=false + argocd.argoproj.io/compare-options: IgnoreExtraneous diff --git 
a/components/kubearchive/production/kflux-prd-rh03/kustomization.yaml b/components/kubearchive/production/kflux-prd-rh03/kustomization.yaml index 62068f16684..82432585a19 100644 --- a/components/kubearchive/production/kflux-prd-rh03/kustomization.yaml +++ b/components/kubearchive/production/kflux-prd-rh03/kustomization.yaml @@ -4,11 +4,47 @@ kind: Kustomization resources: - ../../base - ../base + - external-secret.yaml - kubearchive.yaml namespace: product-kubearchive +# Generate kubearchive-logging ConfigMap with hash for automatic restarts +# Due to quoting limitations of generators we need to introduce the values with the | +# See https://github.com/kubernetes-sigs/kustomize/issues/4845#issuecomment-1671570428 +configMapGenerator: + - name: kubearchive-logging + literals: + - | + POD_ID=cel:metadata.uid + - | + NAMESPACE=cel:metadata.namespace + - | + START=cel:status.?startTime == optional.none() ? int(now()-duration('1h'))*1000000000: status.startTime + - | + END=cel:status.?startTime == optional.none() ? 
int(now()+duration('1h'))*1000000000: int(timestamp(status.startTime)+duration('6h'))*1000000000 + - | + LOG_URL=http://loki-gateway.product-kubearchive-logging.svc.cluster.local:80/loki/api/v1/query_range?query=%7Bstream%3D%22{NAMESPACE}%22%7D%20%7C%20pod_id%20%3D%20%60{POD_ID}%60%20%7C%20container%20%3D%20%60{CONTAINER_NAME}%60&start={START}&end={END}&direction=forward + - | + LOG_URL_JSONPATH=$.data.result[*].values[*][1] + patches: + - patch: |- + $patch: delete + apiVersion: v1 + kind: ConfigMap + metadata: + name: kubearchive-logging + namespace: kubearchive + + - patch: |- + $patch: delete + apiVersion: v1 + kind: Secret + metadata: + name: kubearchive-logging + namespace: kubearchive + - patch: |- apiVersion: batch/v1 kind: Job diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/kustomization.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/kustomization.yaml new file mode 100644 index 00000000000..8a676aa13a0 --- /dev/null +++ b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/kustomization.yaml @@ -0,0 +1,19 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +commonAnnotations: + ignore-check.kube-linter.io/drop-net-raw-capability: | + "Vector runs requires access to socket." + ignore-check.kube-linter.io/run-as-non-root: | + "Vector runs as Root and attach host Path." + ignore-check.kube-linter.io/sensitive-host-mounts: | + "Vector runs requires certain host mounts to watch files being created by pods." + ignore-check.kube-linter.io/pdb-unhealthy-pod-eviction-policy: | + "Managed by upstream Loki chart (no value exposed for unhealthyPodEvictionPolicy)." 
+ +resources: +- ../base + +generators: +- vector-helm-generator.yaml +- loki-helm-generator.yaml diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-generator.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-generator.yaml new file mode 100644 index 00000000000..01749fd3dee --- /dev/null +++ b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-generator.yaml @@ -0,0 +1,27 @@ +apiVersion: builtin +kind: HelmChartInflationGenerator +metadata: + name: loki +name: loki +repo: https://grafana.github.io/helm-charts +version: 6.30.1 +releaseName: loki +namespace: product-kubearchive-logging +valuesFile: loki-helm-values.yaml +additionalValuesFiles: + - loki-helm-prod-values.yaml +valuesInline: + # Cluster-specific overrides + serviceAccount: + create: true + name: loki-sa + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::593793029194:role/kflux-prd-rh03-loki-storage-role" + loki: + storage: + bucketNames: + chunks: kflux-prd-rh03-loki-storage + admin: kflux-prd-rh03-loki-storage + storage_config: + aws: + bucketnames: kflux-prd-rh03-loki-storage diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-prod-values.yaml new file mode 100644 index 00000000000..6e847976b18 --- /dev/null +++ b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-prod-values.yaml @@ -0,0 +1,191 @@ +--- +gateway: + service: + type: LoadBalancer + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + memory: 256Mi + +# Basic Loki configuration with S3 storage +loki: + commonConfig: + replication_factor: 3 + # Required storage configuration for Helm chart + storage: + type: s3 + # bucketNames: Fill it on the generator for each cluster + s3: + region: us-east-1 + storage_config: + aws: + # bucketnames: Fill it on 
the generator for each cluster + region: us-east-1 + s3forcepathstyle: false + # Configure ingestion limits to handle Vector's data volume + limits_config: + retention_period: 744h # 31 days retention + ingestion_rate_mb: 50 + ingestion_burst_size_mb: 100 + ingestion_rate_strategy: "local" + max_streams_per_user: 0 + max_line_size: 2097152 + per_stream_rate_limit: 50M + per_stream_rate_limit_burst: 200M + reject_old_samples: false + reject_old_samples_max_age: 168h + discover_service_name: [] + discover_log_levels: false + volume_enabled: true + max_global_streams_per_user: 75000 + max_entries_limit_per_query: 100000 + increment_duplicate_timestamp: true + allow_structured_metadata: true + ingester: + chunk_target_size: 8388608 # 8MB + chunk_idle_period: 5m + max_chunk_age: 2h + chunk_encoding: snappy # Compress data (reduces S3 transfer size) + chunk_retain_period: 1h # Keep chunks in memory after flush + flush_op_timeout: 10m # Add timeout for S3 operations + + # Tuning for high-load queries + querier: + max_concurrent: 8 + query_range: + # split_queries_by_interval deprecated in Loki 3.x - removed + parallelise_shardable_queries: true + +# Distributed components configuration +ingester: + replicas: 3 + autoscaling: + enabled: true + zoneAwareReplication: + enabled: true + maxUnavailable: 1 + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + memory: 2Gi + persistence: + enabled: true + size: 10Gi + affinity: {} + podAntiAffinity: + soft: {} + hard: {} + +querier: + replicas: 3 + autoscaling: + enabled: true + maxUnavailable: 1 + resources: + requests: + cpu: 300m + memory: 512Mi + limits: + memory: 1Gi + affinity: {} + +queryFrontend: + replicas: 2 + maxUnavailable: 1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + memory: 512Mi + +queryScheduler: + replicas: 2 + maxUnavailable: 1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + memory: 512Mi + +distributor: + replicas: 3 + autoscaling: + enabled: true + 
 maxUnavailable: 1
+  resources:
+    requests:
+      cpu: 300m
+      memory: 512Mi
+    limits:
+      memory: 1Gi
+  affinity: {}
+
+compactor:
+  replicas: 1
+  retention_enabled: true
+  retention_delete_delay: 2h
+  retention_delete_worker_count: 150
+  resources:
+    requests:
+      cpu: 200m
+      memory: 512Mi
+    limits:
+      memory: 1Gi
+
+indexGateway:
+  replicas: 2
+  maxUnavailable: 0
+  resources:
+    requests:
+      cpu: 300m
+      memory: 512Mi
+    limits:
+      memory: 1Gi
+  affinity: {}
+
+# Enable Memcached caches for performance
+chunksCache:
+  enabled: true
+  replicas: 1
+
+resultsCache:
+  enabled: true
+  replicas: 1
+
+memcached:
+  enabled: true
+
+memcachedResults:
+  enabled: true
+
+memcachedChunks:
+  enabled: true
+
+memcachedFrontend:
+  enabled: true
+
+memcachedIndexQueries:
+  enabled: true
+
+memcachedIndexWrites:
+  enabled: true
+
+# Disable Minio - production uses S3 with IAM role
+minio:
+  enabled: false
+
+# Resources for memcached exporter to satisfy linter
+memcachedExporter:
+  resources:
+    requests:
+      cpu: 50m
+      memory: 64Mi
+    limits:
+      memory: 128Mi
diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-values.yaml
new file mode 100644
index 00000000000..4f6ff72bec7
--- /dev/null
+++ b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/loki-helm-values.yaml
@@ -0,0 +1,83 @@
+---
+# simplified Loki configuration for production
+deploymentMode: Distributed
+
+# This exposes the Loki gateway so it can be written to and queried externally
+gateway:
+  image:
+    registry: quay.io # Use Quay.io registry to prevent docker hub rate limit
+    repository: nginx/nginx-unprivileged
+    tag: 1.24-alpine
+  nginxConfig:
+    resolver: "dns-default.openshift-dns.svc.cluster.local."
+ +# Basic Loki configuration +loki: + # Enable multi-tenancy to handle X-Scope-OrgID headers + auth_enabled: true + commonConfig: + path_prefix: /var/loki # This directory will be writable via volume mount + storage: + type: s3 + schemaConfig: + configs: + - from: "2024-04-01" + store: tsdb + object_store: s3 + schema: v13 + index: + prefix: loki_index_ + period: 24h + # Configure compactor to use writable volumes + compactor: + working_directory: /var/loki/compactor + +# Security contexts for OpenShift +podSecurityContext: + runAsNonRoot: false + allowPrivilegeEscalation: false + +containerSecurityContext: + runAsNonRoot: false + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true # Keep read-only root filesystem for security + +# Disable test pods +test: + enabled: false + +# Disable sidecar completely to avoid loki-sc-rules container +sidecar: + rules: + enabled: false + datasources: + enabled: false + +# Zero out replica counts of other deployment modes + +singleBinary: + replicas: 0 +backend: + replicas: 0 +read: + replicas: 0 +write: + replicas: 0 + +bloomPlanner: + replicas: 0 +bloomBuilder: + replicas: 0 +bloomGateway: + replicas: 0 + +# Disable lokiCanary - not essential for core functionality +lokiCanary: + enabled: false + +# Disable the ruler - not needed as we aren't using metrics +ruler: + enabled: false diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-generator.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-generator.yaml new file mode 100644 index 00000000000..fd1d1d4e3b9 --- /dev/null +++ b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-generator.yaml @@ -0,0 +1,12 @@ +apiVersion: builtin +kind: HelmChartInflationGenerator +metadata: + name: vector +name: vector +repo: https://helm.vector.dev +version: 0.43.0 +releaseName: vector +namespace: product-kubearchive-logging +valuesFile: 
vector-helm-values.yaml +additionalValuesFiles: + - vector-helm-prod-values.yaml diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-prod-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-prod-values.yaml new file mode 100644 index 00000000000..d6698dada2e --- /dev/null +++ b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-prod-values.yaml @@ -0,0 +1,17 @@ +--- +resources: + requests: + cpu: 512m + memory: 4096Mi + limits: + cpu: 2000m + memory: 4096Mi + +customConfig: + sources: + k8s_logs: + extra_label_selector: "app.kubernetes.io/managed-by in (tekton-pipelines,pipelinesascode.tekton.dev)" + extra_field_selector: "metadata.namespace!=product-kubearchive-logging" + +podLabels: + vector.dev/exclude: "false" diff --git a/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-values.yaml b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-values.yaml new file mode 100644 index 00000000000..674d36ea29c --- /dev/null +++ b/components/vector-kubearchive-log-collector/production/kflux-prd-rh03/vector-helm-values.yaml @@ -0,0 +1,163 @@ +--- +role: Agent + +customConfig: + data_dir: /vector-data-dir + api: + enabled: true + address: 127.0.0.1:8686 + playground: false + sources: + k8s_logs: + type: kubernetes_logs + rotate_wait_secs: 5 + glob_minimum_cooldown_ms: 500 + max_line_bytes: 3145728 + auto_partial_merge: true + transforms: + reduce_events: + type: reduce + inputs: + - k8s_logs + group_by: + - file + max_events: 100 + expire_after_ms: 10000 + merge_strategies: + message: concat_newline + remap_app_logs: + type: remap + inputs: + - reduce_events + source: |- + .tmp = del(.) 
+ # Preserve original kubernetes fields for Loki labels + if exists(.tmp.kubernetes.pod_uid) { + .pod_id = del(.tmp.kubernetes.pod_uid) + } else { + .pod_id = "unknown_pod_id" + } + if exists(.tmp.kubernetes.container_name) { + .container = del(.tmp.kubernetes.container_name) + } else { + .container = "unknown_container" + } + # Extract namespace for low cardinality labeling + if exists(.tmp.kubernetes.pod_namespace) { + .namespace = del(.tmp.kubernetes.pod_namespace) + } else { + .namespace = "unknown_namespace" + } + # Preserve the actual log message + if exists(.tmp.message) { + .message = to_string(del(.tmp.message)) ?? "no_message" + } else { + .message = "no_message" + } + if length(.message) > 1048576 { + .message = slice!(.message, 0, 1048576) + "...[TRUNCATED]" + } + # Clean up temporary fields + del(.tmp) + sinks: + loki: + type: loki + inputs: ["remap_app_logs"] + # Send to Loki gateway + endpoint: "http://loki-gateway.product-kubearchive-logging.svc.cluster.local:80" + encoding: + codec: "text" + except_fields: ["tmp"] + only_fields: + - message + structured_metadata: + pod_id: "{{`{{ pod_id }}`}}" + container: "{{`{{ container }}`}}" + auth: + strategy: "basic" + user: "${LOKI_USERNAME}" + password: "${LOKI_PASSWORD}" + tenant_id: "kubearchive" + request: + headers: + X-Scope-OrgID: kubearchive + timeout_secs: 60 + batch: + max_bytes: 10485760 # 10MB batches + max_events: 10000 + timeout_secs: 30 + compression: "gzip" + labels: + stream: "{{`{{ namespace }}`}}" + buffer: + type: "memory" + max_events: 10000 + when_full: "drop_newest" +env: + - name: LOKI_USERNAME + valueFrom: + secretKeyRef: + name: kubearchive-loki + key: USERNAME + - name: LOKI_PASSWORD + valueFrom: + secretKeyRef: + name: kubearchive-loki + key: PASSWORD +nodeSelector: + konflux-ci.dev/workload: konflux-tenants +tolerations: + - effect: NoSchedule + key: konflux-ci.dev/workload + operator: Equal + value: konflux-tenants +image: + repository: quay.io/kubearchive/vector + tag: 
0.46.1-distroless-libc +serviceAccount: + create: true + name: vector +securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + capabilities: + drop: + - CHOWN + - DAC_OVERRIDE + - FOWNER + - FSETID + - KILL + - NET_BIND_SERVICE + - SETGID + - SETPCAP + - SETUID + readOnlyRootFilesystem: true + seLinuxOptions: + type: spc_t + seccompProfile: + type: RuntimeDefault + +# Override default volumes to be more specific and secure +extraVolumes: + - name: varlog + hostPath: + path: /var/log/pods + type: Directory + - name: varlibdockercontainers + hostPath: + path: /var/lib/containers + type: DirectoryOrCreate + +extraVolumeMounts: + - name: varlog + mountPath: /var/log/pods + readOnly: true + - name: varlibdockercontainers + mountPath: /var/lib/containers + readOnly: true + +# Configure Vector to use emptyDir for its default data volume instead of hostPath +persistence: + enabled: false + +