Parth prajapati
06/16/2025, 7:48 AM
Abdulmalik Salawu
06/16/2025, 8:55 AM
Abdulmalik Salawu
06/16/2025, 8:59 AM
Parth prajapati
06/16/2025, 9:54 AM
Abdulmalik Salawu
06/16/2025, 5:29 PM
Srikanth Chekuri
06/17/2025, 9:24 AM
Parth prajapati
06/18/2025, 12:01 PM
Srikanth Chekuri
06/19/2025, 2:16 AM
Parth prajapati
06/19/2025, 6:16 AM
Abdulmalik Salawu
06/19/2025, 7:29 AM
Parth prajapati
06/19/2025, 7:30 AM
Abdulmalik Salawu
06/19/2025, 7:33 AM
otelCollector:
# Identity and replica count of the collector workload.
name: "otel-collector"
replicaCount: 1

# Pin the collector to the dedicated arm64 observability nodes.
# Label keys are plain strings; chat-client link markup (<http://…|…>) must
# not appear in them or the selector will never match a node.
nodeSelector:
  role: observability
  kubernetes.io/arch: arm64
  node.kubernetes.io/instance-type: t4g.xlarge

affinity:
  # Hard requirement: all three expressions of the single term must match
  # (expressions inside one nodeSelectorTerm are ANDed).
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: role
              operator: In
              values: ["observability"]
            - key: kubernetes.io/arch
              operator: In
              values: ["arm64"]
            - key: node.kubernetes.io/instance-type
              operator: In
              values: ["t4g.xlarge"]
  # Soft preference: avoid sharing a host with other pods labelled
  # role=observability.
  # NOTE(review): this selects on a *pod* label; confirm the collector pods
  # actually carry role=observability, otherwise the rule has no effect.
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchExpressions:
              - key: role
                operator: In
                values: ["observability"]
          topologyKey: kubernetes.io/hostname

# Allow scheduling onto nodes tainted with
# node.kubernetes.io/instance-type=t4g.xlarge:NoSchedule.
tolerations:
  - key: "node.kubernetes.io/instance-type"
    operator: "Equal"
    value: "t4g.xlarge"
    effect: "NoSchedule"
# Container resource requests/limits for the collector pod.
resources:
  requests:
    memory: "3Gi"
    cpu: "1000m"
  limits:
    # 8Gi: the memory_limiter processor in this config uses limit_mib 6144
    # (soft limit 6144 - 1024 = 5120 MiB). The previous value of 5Gi
    # (= 5120 MiB) contradicted its own "8GB" comment and let the kernel
    # OOM-kill the container before the limiter could start refusing data.
    memory: "8Gi"
    cpu: "1000m"

# Annotations on the workload object (Helm hook ordering).
annotations:
  "helm.sh/hook-weight": "3"

# Pod annotations advertising the collector's own metrics endpoint.
# Values are quoted so they stay strings (annotation values must be strings).
podAnnotations:
  signoz.io/scrape: 'true'
  signoz.io/port: '8888'
config:
# Ingest endpoints exposed by the collector.
receivers:
  # OTLP over gRPC (4317) and HTTP (4318).
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
        max_recv_msg_size_mib: 64
      http:
        # NOTE(review): wildcard CORS accepts browser requests from any
        # origin; confirm this endpoint is not reachable from untrusted
        # networks.
        cors:
          allowed_headers:
            - '*'
          allowed_origins:
            - '*'
        endpoint: 0.0.0.0:4318
        max_request_body_size: 2097152  # bytes (2 MiB)

  # Tails pod log files from the node and enriches them with k8s metadata
  # parsed out of the file path.
  filelog:
    # NOTE(review): nothing is excluded, so the glob below also matches this
    # collector's own pod log. The logs pipeline exports to `debug`, so the
    # collector's output can be re-ingested as new log records; consider
    # excluding the collector pod's path here.
    exclude: []
    include:
      - /var/log/pods/**/*.log
    include_file_name: false
    include_file_path: true
    operators:
      # 1. Split "<timestamp> | <message>" lines into attributes.time / .log.
      - id: parser-containerd
        type: regex_parser
        regex: '^(?P<time>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) \| (?P<log>.*)'
        output: containerd-recombine
      # 2. Stitch continuation lines (e.g. stack traces) onto the entry that
      #    started with a timestamp. The millisecond separator is matched as
      #    a literal dot ([.]) so it agrees with the parser regex above.
      - id: containerd-recombine
        type: recombine
        combine_field: attributes.log
        is_first_entry: "body matches '^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}[.][0-9]{3} [|] '"
        max_log_size: 102400
        source_identifier: attributes["log.file.path"]
        output: extract_metadata_from_filepath
      # 3. Pull namespace / pod / uid / container / restart count from
      #    /var/log/pods/<ns>_<pod>_<uid>/<container>/<n>.log
      - id: extract_metadata_from_filepath
        type: regex_parser
        parse_from: attributes["log.file.path"]
        regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
      # 4. Promote the parsed fields to resource attributes.
      - type: move
        from: attributes.container_name
        to: resource["k8s.container.name"]
      - type: move
        from: attributes.namespace
        to: resource["k8s.namespace.name"]
      - type: move
        from: attributes.pod_name
        to: resource["k8s.pod.name"]
      - type: move
        from: attributes.restart_count
        to: resource["k8s.container.restart_count"]
      - type: move
        from: attributes.uid
        to: resource["k8s.pod.uid"]
      # 5. The parsed message becomes the log body.
      - type: move
        from: attributes.log
        to: body
    preserve_leading_whitespaces: false
    preserve_trailing_whitespaces: true
    start_at: beginning

  # HTTP log drains, one listener per source format.
  httplogreceiver/heroku:
    endpoint: 0.0.0.0:8081
    source: heroku
  httplogreceiver/json:
    endpoint: 0.0.0.0:8082
    source: json

  # Legacy Jaeger clients.
  jaeger:
    protocols:
      grpc:
        endpoint: 0.0.0.0:14250
      thrift_http:
        endpoint: 0.0.0.0:14268
# Processing stages referenced by the service pipelines.
processors:
  # Back-pressure guard. Hard limit is limit_mib; the soft limit at which
  # data starts being refused is limit_mib - spike_limit_mib (5120 MiB here).
  # NOTE(review): both values must sit below the pod's
  # resources.limits.memory, otherwise the container is OOM-killed before
  # the limiter can act.
  memory_limiter:
    limit_mib: 6144
    spike_limit_mib: 1024
    check_interval: 2s

  # Small, frequent batches to keep end-to-end latency low.
  # NOTE(review): very small batches mean many small writes to the backend;
  # confirm the storage side tolerates this insert rate.
  batch:
    metadata_keys: []
    send_batch_size: 100
    send_batch_max_size: 200
    timeout: 200ms

  # Configured at 100 %, so this stage currently drops nothing.
  probabilistic_sampler:
    fail_closed: false
    hash_seed: 42
    mode: hash_seed
    sampling_percentage: 100
    sampling_precision: 4

  resourcedetection:
    detectors:
      - env
      - system
    override: false
    timeout: 500ms

  # Derives delta-temporality span metrics and hands them to the
  # signozclickhousemetrics exporter.
  signozspanmetrics/delta:
    aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
    dimensions:
      - name: service.namespace
        default: default
      - name: deployment.environment
        default: default
      - name: signoz.collector.id
    dimensions_cache_size: 500
    latency_histogram_buckets:
      - 100us
      - 1ms
      - 10ms
      - 100ms
      - 1000ms
      - 10s
    metrics_exporter: signozclickhousemetrics

  tail_sampling:
    decision_wait: 500ms
    expected_new_traces_per_sec: 200
    # NOTE(review): num_traces bounds how many traces are held in memory
    # while awaiting a decision; at 200 new traces/s and a 500 ms wait this
    # leaves little headroom — confirm under peak load.
    num_traces: 200
    policies:
      # Keep any trace containing a span with ERROR status.
      - name: error_traces
        type: status_code
        status_code:
          status_codes:
            - ERROR
      # Inverted match on http.target: traces hitting these noisy endpoints
      # are the ones excluded. Values are regular expressions, so the
      # actuator entry uses `.*` rather than a glob-style `*` (which in a
      # regex would only repeat the preceding "r").
      - name: drop_noisy_traces_url
        type: string_attribute
        string_attribute:
          key: http.target
          enabled_regex_matching: true
          invert_match: true
          values:
            - '\/metrics'
            - '\/actuator.*'
            - 'opentelemetry\.proto'
            - 'favicon\.ico'
            - '\/api\/[^/]+\/(?:svc|live)'
# Destinations referenced by the service pipelines.
exporters:
  # Console summary of exported batches.
  debug:
    verbosity: basic

  # Traces.
  clickhousetraces:
    timeout: 10s
    use_new_schema: true

  # Metrics (two writers are configured side by side).
  clickhousemetricswrite:
    disable_v2: true
    resource_to_telemetry_conversion:
      enabled: true
    timeout: 20s
  signozclickhousemetrics:
    timeout: 20s

  # Logs.
  clickhouselogsexporter:
    timeout: 15s
    use_new_schema: true

  # Metadata exporter backed by an in-memory cache.
  metadataexporter:
    cache:
      provider: in_memory
    timeout: 5s
# Auxiliary endpoints. health_check listens on all interfaces;
# pprof and zpages are bound to localhost only.
extensions:
  health_check:
    endpoint: 0.0.0.0:13133
  pprof:
    endpoint: localhost:1777
  zpages:
    endpoint: localhost:55679
# Wires receivers, processors and exporters into pipelines.
# Processor order is significant: entries run top to bottom.
service:
  extensions:
    - health_check
    - zpages
    - pprof
  pipelines:
    traces:
      receivers:
        - otlp
        - jaeger
      processors:
        - memory_limiter
        - resourcedetection
        - probabilistic_sampler
        - tail_sampling
        - signozspanmetrics/delta
        - batch
      exporters:
        - clickhousetraces
        - metadataexporter
        - debug
    metrics:
      receivers:
        - otlp
      processors:
        - memory_limiter
        - resourcedetection
        - batch
      exporters:
        - clickhousemetricswrite
        - metadataexporter
        - signozclickhousemetrics
        - debug
    logs:
      receivers:
        - otlp
        - httplogreceiver/heroku
        - httplogreceiver/json
        - filelog
      processors:
        - memory_limiter
        - resourcedetection
        - probabilistic_sampler
        - batch
      exporters:
        - clickhouselogsexporter
        - metadataexporter
        - debug
  # The collector's own logs and metrics; the metrics port matches the
  # signoz.io/port pod annotation ('8888').
  telemetry:
    logs:
      encoding: json
      level: info
    metrics:
      address: 0.0.0.0:8888
      level: detailed
Abdulmalik Salawu
06/19/2025, 7:34 AM
# 🚀 KEEP FAST BATCHING for real-time traces
# Quoted excerpt of the batch processor settings under discussion:
# small batches flushed on a short timer.
batch:
  metadata_keys: []
  send_batch_size: 100
  send_batch_max_size: 200
  timeout: 200ms
Srikanth Chekuri
06/19/2025, 10:10 AM
Parth prajapati
06/19/2025, 11:11 AM
Parth prajapati
06/19/2025, 11:13 AM
Parth prajapati
06/23/2025, 10:56 AM
Parth prajapati
06/23/2025, 10:56 AM
Abdulmalik Salawu
06/23/2025, 6:05 PM
Srikanth Chekuri
06/24/2025, 12:56 AM
Parth prajapati
06/26/2025, 5:46 AM
Abdulmalik Salawu
06/26/2025, 4:39 PM
Srikanth Chekuri
06/27/2025, 2:50 AM
Parth prajapati
06/27/2025, 5:50 AM