data:
adot-collector-config: |
receivers:
prometheus:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
metrics_path: "/metrics/executors/prometheus"
spec:
ports:
- name: executor-metrics
port: 4040
---
# ConfigMap holding the ADOT collector configuration under the single key
# `adot-collector-config` (a literal block scalar containing collector YAML).
apiVersion: v1
kind: ConfigMap
metadata:
  name: adot-collector-conf
  namespace: adot-col
  labels:
    app: aws-adot
    component: adot-collector-conf
data:
  adot-collector-config: |
    # NOTE: the collector applies `$` environment-variable expansion to this
    # file, so regex capture-group references in relabel `replacement` values
    # must be written `$$1`, `$$2`, ... to reach Prometheus as `$1`, `$2`.
    receivers:
      prometheus:
        config:
          global:
            evaluation_interval: 1m
            scrape_interval: 1m
            scrape_timeout: 30s
          scrape_configs:
            # Service endpoints annotated with prometheus.io/scrape=true.
            - job_name: 'kubernetes-service-endpoints-spark'
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              metrics_path: "/metrics/executors/prometheus"
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - action: keep
                  regex: true
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
                # A prometheus.io/path annotation overrides metrics_path.
                - action: replace
                  regex: (.+)
                  source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
                  target_label: __metrics_path__
                # Rewrite the target address to use the annotated port.
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $$1:$$2
                  source_labels: [__address__,__meta_kubernetes_service_annotation_prometheus_io_port]
                  target_label: __address__
            # Pods annotated with prometheus.io/scrape=true.
            - job_name: 'kubernetes-executor-pods-spark'
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              metrics_path: "/metrics/executors/prometheus"
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  regex: true
                  source_labels:
                    - __meta_kubernetes_pod_annotation_prometheus_io_scrape
                - action: replace
                  regex: (https?)
                  source_labels:
                    - __meta_kubernetes_pod_annotation_prometheus_io_scheme
                  target_label: __scheme__
                - action: replace
                  regex: (.+)
                  source_labels:
                    - __meta_kubernetes_pod_annotation_prometheus_io_path
                  target_label: __metrics_path__
                # `$$` escapes the collector's env-var expansion (was `$1:$2`).
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $$1:$$2
                  source_labels:
                    - __address__
                    - __meta_kubernetes_pod_annotation_prometheus_io_port
                  target_label: __address__
                - action: labelmap
                  regex: __meta_kubernetes_pod_label_(.+)
                - action: replace
                  source_labels:
                    - __meta_kubernetes_namespace
                  target_label: kubernetes_namespace
                - action: replace
                  source_labels:
                    - __meta_kubernetes_pod_name
                  target_label: kubernetes_pod_name
            # Pods annotated with prometheus.io/scrape-slow=true; scraped on a
            # longer interval and timeout than the global defaults.
            - job_name: 'kubernetes-executor-pods-spark-slow'
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              metrics_path: "/metrics/executors/prometheus"
              scrape_interval: 2m
              scrape_timeout: 40s
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                - action: keep
                  regex: true
                  source_labels:
                    - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
                - action: replace
                  regex: (https?)
                  source_labels:
                    - __meta_kubernetes_pod_annotation_prometheus_io_scheme
                  target_label: __scheme__
                - action: replace
                  regex: (.+)
                  source_labels:
                    - __meta_kubernetes_pod_annotation_prometheus_io_path
                  target_label: __metrics_path__
                # `$$` escapes the collector's env-var expansion (was `$1:$2`).
                - action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $$1:$$2
                  source_labels:
                    - __address__
                    - __meta_kubernetes_pod_annotation_prometheus_io_port
                  target_label: __address__
                - action: labelmap
                  regex: __meta_kubernetes_pod_label_(.+)
                - action: replace
                  source_labels:
                    - __meta_kubernetes_namespace
                  target_label: kubernetes_namespace
                - action: replace
                  source_labels:
                    - __meta_kubernetes_pod_name
                  target_label: kubernetes_pod_name
            # Driver metrics endpoint. This job has no `keep` rule, so every
            # discovered pod is a candidate target.
            - job_name: 'kubernetes-driver-pods-spark'
              tls_config:
                ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              metrics_path: "/metrics/driver/prometheus" # spark.metrics.conf.*.sink.prometheusServlet.path
              kubernetes_sd_configs:
                - role: pod
              relabel_configs:
                # `$$` escapes the collector's env-var expansion (was `$1:$2`).
                - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
                  action: replace
                  regex: ([^:]+)(?::\d+)?;(\d+)
                  replacement: $$1:$$2
                  target_label: __address__
                - action: labelmap
                  regex: __meta_kubernetes_pod_label_(.+)
                - source_labels: [__meta_kubernetes_namespace]
                  action: replace
                  target_label: kubernetes_namespace
                - source_labels: [__meta_kubernetes_pod_name]
                  action: replace
                  target_label: kubernetes_pod_name
    exporters:
      awsprometheusremotewrite:
        # replace this with your endpoint
        endpoint: <removing this for privacy>
        # replace this with your region
        aws_auth:
          region: <removing this for privacy>
          service: "aps"
        namespace: "adot"
      logging:
        loglevel: debug
    extensions:
      health_check:
        endpoint: :13133
      pprof:
        endpoint: :1777
      zpages:
        endpoint: :55679
    service:
      extensions: [pprof, zpages, health_check]
      pipelines:
        metrics:
          receivers: [prometheus]
          exporters: [logging, awsprometheusremotewrite]
---
# Service account used by the collector pods in the adot-col namespace.
# The ClusterRoleBinding defined elsewhere in this file lists it as a subject.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: adot-col
  name: amp-iamproxy-ingest-service-account
  annotations:
    # ARN of the IAM role associated with this service account.
    eks.amazonaws.com/role-arn: arn:aws:iam::<removing this for privacy>:role/amp-iamproxy-ingest-role
---
# Cluster-wide, read-only (get/list/watch) permissions for the collector.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: adotcol-admin-role
rules:
  # Core API group objects.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: [get, list, watch]
  # Ingress objects in the `extensions` API group.
  - apiGroups: [extensions]
    resources: [ingresses]
    verbs: [get, list, watch]
  # Non-resource access to the /metrics URL.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Grants the adotcol-admin-role ClusterRole to the collector's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: adotcol-admin-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: adotcol-admin-role
subjects:
  - kind: ServiceAccount
    namespace: adot-col
    name: amp-iamproxy-ingest-service-account
---
# NodePort Service selecting pods labelled component=adot-collector.
kind: Service
apiVersion: v1
metadata:
  namespace: adot-col
  name: adot-collector
  labels:
    component: adot-collector
    app: aws-adot
spec:
  type: NodePort
  selector:
    component: adot-collector
  ports:
    # NOTE(review): the collector container in this file only declares
    # containerPort 8888 - confirm something listens on 4040.
    - port: 4040
      name: executor-metrics
    # Default endpoint for querying metrics.
    - port: 8888
      name: metrics
---
# DaemonSet running the ADOT collector, with its config mounted from the
# adot-collector-conf ConfigMap at /conf.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  namespace: adot-col
  name: adot-collector
  labels:
    component: adot-collector
    app: aws-adot
spec:
  minReadySeconds: 5
  selector:
    matchLabels:
      component: adot-collector
      app: aws-adot
  template:
    metadata:
      labels:
        component: adot-collector
        app: aws-adot
    spec:
      serviceAccountName: amp-iamproxy-ingest-service-account
      volumes:
        # Projects the ConfigMap key as the file adot-collector-config.yaml.
        - name: adot-collector-config-vol
          configMap:
            name: adot-collector-conf
            items:
              - path: adot-collector-config.yaml
                key: adot-collector-config
      containers:
        - name: adot-collector
          # NOTE(review): unpinned `latest` tag - confirm the pulled release
          # still accepts the exporters/extensions named in the ConfigMap.
          image: public.ecr.aws/aws-observability/aws-otel-collector:latest
          command:
            - "/awscollector"
            - "--config=/conf/adot-collector-config.yaml"
          ports:
            # Default endpoint for querying metrics.
            - containerPort: 8888
          volumeMounts:
            - mountPath: /conf
              name: adot-collector-config-vol
          resources:
            requests:
              cpu: 200m
              memory: 400Mi
            limits:
              cpu: 1
              memory: 2Gi
          # Both probes hit the Health Check extension port.
          livenessProbe:
            httpGet:
              port: 13133
              path: /
          readinessProbe:
            httpGet:
              port: 13133
              path: /
---
spark_info{version="3.1.1", revision=""} 1.0
metrics_executor_rddBlocks{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_memoryUsed_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 1816
metrics_executor_diskUsed_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_totalCores{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_maxTasks{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_activeTasks{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_failedTasks_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_completedTasks_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_totalTasks_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_totalDuration_seconds_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0.0
metrics_executor_totalGCTime_seconds_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0.0
metrics_executor_totalInputBytes_bytes_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_totalShuffleRead_bytes_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_totalShuffleWrite_bytes_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_maxMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 1078827417
metrics_executor_usedOnHeapStorageMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 1816
metrics_executor_usedOffHeapStorageMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_totalOnHeapStorageMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 1078827417
metrics_executor_totalOffHeapStorageMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_JVMHeapMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 1224267120
metrics_executor_JVMOffHeapMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 120520560
metrics_executor_OnHeapExecutionMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_OffHeapExecutionMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_OnHeapStorageMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 5000
metrics_executor_OffHeapStorageMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_OnHeapUnifiedMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 5000
metrics_executor_OffHeapUnifiedMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_DirectPoolMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 320589
metrics_executor_MappedPoolMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_ProcessTreeJVMVMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 5970055168
metrics_executor_ProcessTreeJVMRSSMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 1603010560
metrics_executor_ProcessTreePythonVMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_ProcessTreePythonRSSMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_ProcessTreeOtherVMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_ProcessTreeOtherRSSMemory_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0
metrics_executor_MinorGCCount_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 18366
metrics_executor_MajorGCCount_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 18
metrics_executor_MinorGCTime_seconds_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 147.54
metrics_executor_MajorGCTime_seconds_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="driver"} 0.45
metrics_executor_rddBlocks{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 0
metrics_executor_memoryUsed_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 1816
metrics_executor_diskUsed_bytes{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 0
metrics_executor_totalCores{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 2
metrics_executor_maxTasks{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 2
metrics_executor_activeTasks{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 2
metrics_executor_failedTasks_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 0
metrics_executor_completedTasks_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 450466
metrics_executor_totalTasks_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 450468
metrics_executor_totalDuration_seconds_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 2356.288
metrics_executor_totalGCTime_seconds_total{application_id="spark-af748c06812c416e946c4aedfd4b4c4b", application_name="Spark Pi", executor_id="2"} 11.77
...