docs: Describe Otel log gateway PoC
chrkl committed Mar 1, 2024
1 parent 463d3b4 commit 475327b
Showing 6 changed files with 315 additions and 6 deletions.
11 changes: 11 additions & 0 deletions docs/contributor/pocs/assets/otel-gateway-pvc.yaml
@@ -0,0 +1,11 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: otel-queue
spec:
storageClassName: default
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
150 changes: 150 additions & 0 deletions docs/contributor/pocs/assets/otel-log-agent-values.yaml
@@ -0,0 +1,150 @@
mode: daemonset

presets:
logsCollection:
enabled: true
storeCheckpoints: true
kubernetesAttributes:
enabled: false
extractAllPodLabels: false

config:
receivers:
filelog:
include: [ /var/log/pods/*/*/*.log ]
exclude: []
# Exclude collector container's logs. The file format is /var/log/pods/<namespace_name>_<pod_name>_<pod_uid>/<container_name>/<run_id>.log
start_at: beginning
retry_on_failure:
enabled: true
include_file_path: true
include_file_name: false
operators:
# Find out which format is used by kubernetes
- type: router
id: get-format
routes:
- output: parser-docker
expr: 'body matches "^\\{"'
- output: parser-crio
expr: 'body matches "^[^ Z]+ "'
- output: parser-containerd
expr: 'body matches "^[^ Z]+Z"'
# Parse CRI-O format
- type: regex_parser
id: parser-crio
regex: '^(?P<time>[^ Z]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
timestamp:
parse_from: attributes.time
layout_type: gotime
layout: '2006-01-02T15:04:05.999999999Z07:00'
- type: recombine
id: crio-recombine
output: extract_metadata_from_filepath
combine_field: attributes.log
source_identifier: attributes["log.file.path"]
is_last_entry: "attributes.logtag == 'F'"
combine_with: ""
# Parse CRI-Containerd format
- type: regex_parser
id: parser-containerd
regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
- type: recombine
id: containerd-recombine
output: extract_metadata_from_filepath
combine_field: attributes.log
source_identifier: attributes["log.file.path"]
is_last_entry: "attributes.logtag == 'F'"
combine_with: ""
# Parse Docker format
- type: json_parser
id: parser-docker
output: extract_metadata_from_filepath
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# Extract metadata from file path
- type: regex_parser
id: extract_metadata_from_filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
parse_from: attributes["log.file.path"]
# Rename attributes
- type: move
from: attributes.stream
to: attributes["log.iostream"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.restart_count
to: resource["k8s.container.restart_count"]
- type: move
from: attributes.uid
to: resource["k8s.pod.uid"]
# Clean up log body
- type: move
from: attributes.log
to: body
# Extract JSON attributes
- type: json_parser
if: 'body matches "^{.*}$"'
parse_from: body
parse_to: attributes
- type: copy
from: body
to: attributes.original
- type: move
from: attributes.message
to: body
if: 'attributes.message != nil'
- type: move
from: attributes.msg
to: body
if: 'attributes.msg != nil'
- type: severity_parser
parse_from: attributes.level
if: 'attributes.level != nil'

exporters:
otlp:
endpoint: log-gateway-opentelemetry-collector:4317
tls:
insecure: true
service:
telemetry:
metrics:
address: ${env:MY_POD_IP}:8888
pipelines:
logs:
processors: {}
exporters:
- otlp

service:
# Enable the creation of a Service.
# By default, it's enabled on mode != daemonset.
# However, to enable it on mode = daemonset, its creation must be explicitly enabled
enabled: true

ports:
metrics:
# The metrics port is disabled by default. However you need to enable the port
# in order to use the ServiceMonitor (serviceMonitor.enabled) or PodMonitor (podMonitor.enabled).
enabled: true

serviceMonitor:
enabled: true
metricsEndpoints:
- port: metrics

image:
pullPolicy: Always
83 changes: 83 additions & 0 deletions docs/contributor/pocs/assets/otel-log-gateway-values.yaml
@@ -0,0 +1,83 @@
mode: deployment

presets:
logsCollection:
enabled: false
kubernetesAttributes:
enabled: true
extractAllPodLabels: true

config:
processors:
batch:
send_batch_size: 512
timeout: 10s
send_batch_max_size: 512
receivers:
otlp: {}
extensions:
health_check:
endpoint: ${env:MY_POD_IP}:13133
file_storage/queue:
exporters:
otlp:
endpoint: ${ingest-otlp-endpoint}
sending_queue:
storage: file_storage/queue
tls:
insecure: false
cert_pem: ${ingest-otlp-cert}
key_pem: ${ingest-otlp-key}
service:
telemetry:
metrics:
address: ${env:MY_POD_IP}:8888
extensions:
- file_storage/queue
- health_check
pipelines:
logs:
processors:
- batch
receivers:
- otlp
exporters:
- otlp

ports:
metrics:
# The metrics port is disabled by default. However you need to enable the port
# in order to use the ServiceMonitor (serviceMonitor.enabled) or PodMonitor (podMonitor.enabled).
enabled: true

extraEnvsFrom:
- secretRef:
name: sap-cloud-logging

extraVolumes:
- name: persistent-queue
persistentVolumeClaim:
claimName: otel-queue

extraVolumeMounts:
- name: persistent-queue
mountPath: /var/lib/otelcol/file_storage

serviceMonitor:
enabled: true
metricsEndpoints:
- port: metrics

securityContext:
runAsUser: 0

image:
pullPolicy: Always

rollout:
rollingUpdate: {}
# When 'mode: daemonset', maxSurge cannot be used when hostPort is set for any of the ports
# maxSurge: 25%
# maxUnavailable: 0
strategy: Recreate

4 changes: 4 additions & 0 deletions docs/contributor/pocs/assets/otlp-logs.drawio.svg
73 changes: 67 additions & 6 deletions docs/contributor/pocs/otlp-logs.md
@@ -1,9 +1,11 @@
# OpenTelemetry Logs PoC

## Scope and Goals
## 1. Log Record Parsing

### Scope and Goals

When integrating an OTLP-compliant logging backend, applications can either ingest their logs directly or emit them to STDOUT and use a log collector to process and forward the logs.
With this PoC, we evaluated how the OpenTelemetry Collector's [filelog receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) can be configured to transform structured JSON logs emitted by Kubernetes workloads to STDOUT, and subsequently to the [OTLP logs data model](https://opentelemetry.io/docs/specs/otel/logs/data-model/).
With the first part of this PoC, we evaluated how the OpenTelemetry Collector's [filelog receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) can be configured to transform structured JSON logs, which Kubernetes workloads emit to STDOUT, into the [OTLP logs data model](https://opentelemetry.io/docs/specs/otel/logs/data-model/).
The OpenTelemetry Collector should move JSON attributes to the **attributes** map of the log record, extract fields like **severity** or **timestamp**, write the actual log message to the **body** field, and add any missing information to ensure that the **attributes** and **resource** attributes comply with the semantic conventions.

This PoC does not cover logs ingested by the application using the OTLP protocol. We assume that the application already fills the log record fields with the intended values.
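
For illustration only (a hand-written sketch based on the operator pipeline used in this PoC, not actual output): a workload writing the JSON line `{"level":"warn","msg":"connection lost","component":"fetcher"}` to STDOUT would roughly result in a log record like the following:

```yaml
# Hypothetical mapping of the JSON log line above to the OTLP log data model.
body: connection lost                    # moved from the "msg" attribute
severity: warn                           # parsed by the severity_parser from "level"
attributes:
  component: fetcher                     # remaining JSON fields stay in attributes
  original: '{"level":"warn","msg":"connection lost","component":"fetcher"}'
  log.iostream: stdout
resource:
  k8s.namespace.name: example-namespace  # extracted from the log file path
  k8s.pod.name: example-pod
  k8s.container.name: example-container
```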
@@ -14,15 +16,14 @@ We created a Helm values file for the `open-telemetry/opentelemetry-collector` chart

1. Create an SAP Cloud Logging instance. Store the endpoint, client certificate, and key under the keys `ingest-otlp-endpoint`, `ingest-otlp-cert`, and `ingest-otlp-key` respectively, in a Kubernetes Secret within the `otel-logging` namespace (see the example Secret manifest after this list).

2. Deploy the OpenTelemetry Collector Helm chart with the values file [otlp-logs.yaml](../assets/otel-logs-values.yaml):
2. Deploy the OpenTelemetry Collector Helm chart with the values file [otel-logs-values.yaml](assets/otel-logs-values.yaml):

```bash
helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
helm install -n otel-logging logging open-telemetry/opentelemetry-collector \
-f ../assets/otel-logs-values.yaml
helm install -n otel-logging logging open-telemetry/opentelemetry-collector -f ./assets/otel-logs-values.yaml
```
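
For step 1, the Secret could look like the following sketch. The Secret name `sap-cloud-logging` matches the `extraEnvsFrom` reference in the gateway values file used later in this document; the endpoint, certificate, and key values are placeholders:

```yaml
# Sketch of the Secret that holds the SAP Cloud Logging ingestion credentials.
# Endpoint, certificate, and key values are placeholders.
apiVersion: v1
kind: Secret
metadata:
  name: sap-cloud-logging
  namespace: otel-logging
type: Opaque
stringData:
  ingest-otlp-endpoint: ingest-otlp.example.cloud-logging-instance:443
  ingest-otlp-cert: |
    -----BEGIN CERTIFICATE-----
    <client certificate>
    -----END CERTIFICATE-----
  ingest-otlp-key: |
    -----BEGIN PRIVATE KEY-----
    <client key>
    -----END PRIVATE KEY-----
```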

## Results
### Results

We tested different log formats to evaluate the filelog receiver configuration. The following example of a log record emitted by telemetry-metric-agent demonstrates the transformation. The original log record looks as follows:

@@ -104,3 +105,63 @@ In the used configuration, we move the original log record to the **original** attribute
The OpenTelemetry Collector setup is able to extract the log message from different attributes, depending on their presence. This means that it is possible to support different logging libraries.
Non-JSON logs are preserved in the **body** field until the enrichment with resource attributes is completed.

## 2. Buffering and Backpressure

### Scope and Goals

After evaluating the filelog receiver configuration in the first part of the PoC, we tested the buffering and backpressure capabilities of the OpenTelemetry Collector. The OpenTelemetry-based logging solution should give resilience and log-delivery guarantees similar to those of the current logging solution.

### Setup

We split the OpenTelemetry Collector for log processing into an agent (DaemonSet) and a gateway (Deployment). The agent uses the same configuration as shown in the first part of the PoC to read logs from the host file system and convert them to the OTLP format, while the gateway adds Kubernetes metadata and ensures that no logs are lost in case of a backend failure.

The following figure shows the plugins configured in the processing pipeline. Notably, the batch processor in the gateway introduces asynchronicity to the pipeline, which prevents backpressure from being propagated back to the agent. To minimize the risk of log loss caused by this property of the batch processor, the gateway uses a persistent exporter queue, which buffers logs on a persistent volume in case of a backend failure.

![Otel Collector Setup](./assets/otlp-logs.drawio.svg)
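
The following excerpt from [otel-log-gateway-values.yaml](assets/otel-log-gateway-values.yaml) highlights the parts that implement this behavior (comments added for illustration):

```yaml
config:
  processors:
    batch:                            # decouples receivers from the exporter
      send_batch_size: 512
      timeout: 10s
      send_batch_max_size: 512
  extensions:
    file_storage/queue:               # storage extension backing the exporter queue
  exporters:
    otlp:
      endpoint: ${ingest-otlp-endpoint}
      sending_queue:
        storage: file_storage/queue   # persist queued batches instead of keeping them in memory

extraVolumes:
  - name: persistent-queue
    persistentVolumeClaim:
      claimName: otel-queue           # PVC created in the setup steps below

extraVolumeMounts:
  - name: persistent-queue
    mountPath: /var/lib/otelcol/file_storage   # directory used by the file_storage extension
```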

To deploy the OpenTelemetry Collector agent and gateway, perform the following steps:

1. Create an SAP Cloud Logging instance and the corresponding Secret, as described above.

1. Create a persistent volume claim (PVC):

```bash
kubectl apply -n otel-logging -f ./assets/otel-gateway-pvc.yaml
```

1. Deploy the gateway:

```bash
helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
helm install -n otel-logging log-gateway open-telemetry/opentelemetry-collector -f ./assets/otel-log-gateway-values.yaml
```

1. Deploy the agent:

```bash
helm install -n otel-logging log-agent open-telemetry/opentelemetry-collector -f ./assets/otel-log-agent-values.yaml
```

### Evaluation

To evaluate the buffering and backpressure capabilities of the described OpenTelemetry Collector setup, we tested the following scenarios and observed the described behavior:

* **Outage of the OTLP backend**

Log records cannot be shipped from the gateway to the OTLP backend (SAP Cloud Logging). Once the configured queue limit is reached, further log records are dropped. The enqueue errors are not propagated back to other pipeline elements because of the asynchronicity introduced by the batch processor.

* **Broken connectivity between the agent and the gateway**

Log records cannot be exported by the agent to the gateway using the OTLP protocol. The exporter queue on the agent fills up to its maximum size and then starts rejecting new records. This enqueue error is propagated to the filelog receiver, which eventually stops reading new logs. Log loss is avoided until the kubelet's log retention removes old logs.

### Conclusions

The evaluation of the two failure scenarios showed that the OpenTelemetry Collector can give similar guarantees about the prevention of log loss as the current Fluent Bit setup. When a batch processor is used, a persistent exporter queue with an accordingly increased capacity helps to prevent data loss. Splitting the processing pipeline into an agent and a gateway makes it possible to back the exporter queue with a PVC and thus give it a large capacity without the risk of filling up the node file system.

During the evaluation, the following potential problems and risks have been identified:

* The persistent queue of the OpenTelemetry Collector is still in alpha state and might not be suitable yet for production use.
* The queue capacity is configured as a number of batches; a limit based on storage capacity is not possible. This makes it hard to give exact guarantees about how many logs can be stored before data loss occurs (see the configuration sketch after this list).
* Once allocated, the utilized storage space of the persistent queue never shrinks again. This is not a problem as long as a dedicated PVC is used for the queue, but it makes the queue less suitable for storage on the node's host file system.
* Not using a batch processor in the agent might have a negative performance impact.
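
For illustration of the capacity limitation mentioned above, the persistent exporter queue is sized with the `sending_queue` settings of the exporter. The values below are hypothetical and not part of the PoC configuration:

```yaml
# Sketch: sizing the persistent exporter queue of the gateway.
exporters:
  otlp:
    sending_queue:
      enabled: true
      queue_size: 1000              # capacity counted in queue elements (batches), not bytes
      storage: file_storage/queue   # persist queued batches on the mounted PVC
```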
