From fbc1b4f06b8c450a31440cc749e5634ed2fa2b5b Mon Sep 17 00:00:00 2001 From: Vinicius Fortuna Date: Thu, 20 Oct 2022 15:09:38 -0500 Subject: [PATCH] Reduce metrics resources (#123) This PR: - Removes `location` from the `shadowsocks_tcp_probes` time series, which is the largest one. - Removes 0s from `shadowsocks_data_bytes`. We were creating 4 entries (each proxy leg) for every country that attempted to connect, even if they were unsuccessful connections. The metric was dominated by zeros. - Reduces the cardinality of `shadowsocks_data_bytes` by removing the status and location. We don't need that for data limits. To enable country usage tracking, I introduce `shadowsocks_data_bytes_per_location`. To enable status tracking for UDP, I introduce `shadowsocks_udp_packets_from_client_per_location`. TCP already had `shadowsocks_connections_closed` for status tracking. --- .github/CODEOWNERS | 3 -- service/PROBES.md | 4 -- service/metrics/metrics.go | 87 +++++++++++++++++++++------------ service/metrics/metrics_test.go | 5 +- service/tcp.go | 2 +- service/tcp_test.go | 2 +- service/udp_test.go | 2 +- 7 files changed, 61 insertions(+), 44 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fa84ab13..acf2e595 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1 @@ * @Jigsaw-Code/outline-networking-owners - -*.md @Jigsaw-Code/outline-strings-owners -LICENSE @Jigsaw-Code/outline-strings-owners diff --git a/service/PROBES.md b/service/PROBES.md index c34bf614..befbcbe7 100644 --- a/service/PROBES.md +++ b/service/PROBES.md @@ -29,7 +29,3 @@ This feature is on by default in Outline. Admins who are using outline-ss-serve Shadowsocks uses the same Key Derivation Function for both upstream and downstream flows, so in principle an attacker could record data sent from the server to the client, and use it in a "reflected replay" attack as simulated client->server data. The data would appear to be valid and authenticated to the server, but the connection would most likely fail when attempting to parse the destination address header, perhaps leading to a distinctive failure behavior. To avoid this class of attacks, outline-ss-server uses an [HMAC](https://en.wikipedia.org/wiki/HMAC) with a 32-bit tag to mark all server handshakes, and checks for the presence of this tag in all incoming handshakes. If the tag is present, the connection is a reflected replay, with a false positive probability of 1 in 4 billion. - -## Metrics - -Outline provides server operators with metrics on a variety of aspects of server activity, including any detected attacks. To observe attacks detected by your server, look at the `tcp_probes` histogram vector in Prometheus. The `status` field will be `"ERR_CIPHER"` (indicating invalid probe data), `"ERR_REPLAY_CLIENT"`, or `"ERR_REPLAY_SERVER"`, depending on the kind of attack your server observed. You can also see what country each probe appeared to originate from, and approximately how many bytes were sent before giving up. \ No newline at end of file diff --git a/service/metrics/metrics.go b/service/metrics/metrics.go index e9a3905b..bda7b0b7 100644 --- a/service/metrics/metrics.go +++ b/service/metrics/metrics.go @@ -38,7 +38,7 @@ type ShadowsocksMetrics interface { // TCP metrics AddOpenTCPConnection(clientLocation string) AddClosedTCPConnection(clientLocation, accessKey, status string, data ProxyMetrics, timeToCipher, duration time.Duration) - AddTCPProbe(clientLocation, status, drainResult string, port int, data ProxyMetrics) + AddTCPProbe(status, drainResult string, port int, data ProxyMetrics) // UDP metrics AddUDPPacketFromClient(clientLocation, accessKey, status string, clientProxyBytes, proxyTargetBytes int, timeToCipher time.Duration) @@ -50,11 +50,12 @@ type ShadowsocksMetrics interface { type shadowsocksMetrics struct { ipCountryDB *geoip2.Reader - buildInfo *prometheus.GaugeVec - accessKeys prometheus.Gauge - ports prometheus.Gauge - dataBytes *prometheus.CounterVec - timeToCipherMs *prometheus.HistogramVec + buildInfo *prometheus.GaugeVec + accessKeys prometheus.Gauge + ports prometheus.Gauge + dataBytes *prometheus.CounterVec + dataBytesPerLocation *prometheus.CounterVec + timeToCipherMs *prometheus.HistogramVec // TODO: Add time to first byte. tcpProbes *prometheus.HistogramVec @@ -62,11 +63,13 @@ type shadowsocksMetrics struct { tcpClosedConnections *prometheus.CounterVec tcpConnectionDurationMs *prometheus.HistogramVec - udpAddedNatEntries prometheus.Counter - udpRemovedNatEntries prometheus.Counter + udpPacketsFromClientPerLocation *prometheus.CounterVec + udpAddedNatEntries prometheus.Counter + udpRemovedNatEntries prometheus.Counter } func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics { + // Don't forget to pass the counters to the registerer.MustRegister call in NewPrometheusShadowsocksMetrics. return &shadowsocksMetrics{ ipCountryDB: ipCountryDB, buildInfo: prometheus.NewGaugeVec(prometheus.GaugeOpts{ @@ -84,6 +87,12 @@ func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics { Name: "ports", Help: "Count of open Shadowsocks ports", }), + tcpProbes: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "shadowsocks", + Name: "tcp_probes", + Buckets: []float64{0, 49, 50, 51, 73, 91}, + Help: "Histogram of number of bytes from client to proxy, for detecting possible probes", + }, []string{"port", "status", "error"}), tcpOpenConnections: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "shadowsocks", Subsystem: "tcp", @@ -115,14 +124,14 @@ func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics { prometheus.CounterOpts{ Namespace: "shadowsocks", Name: "data_bytes", - Help: "Bytes transferred by the proxy", - }, []string{"dir", "proto", "location", "status", "access_key"}), - tcpProbes: prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Namespace: "shadowsocks", - Name: "tcp_probes", - Buckets: []float64{0, 49, 50, 51, 73, 91}, - Help: "Histogram of number of bytes from client to proxy, for detecting possible probes", - }, []string{"location", "port", "status", "error"}), + Help: "Bytes transferred by the proxy, per access key", + }, []string{"dir", "proto", "access_key"}), + dataBytesPerLocation: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "shadowsocks", + Name: "data_bytes_per_location", + Help: "Bytes transferred by the proxy, per location", + }, []string{"dir", "proto", "location"}), timeToCipherMs: prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: "shadowsocks", @@ -130,6 +139,13 @@ func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics { Help: "Time needed to find the cipher", Buckets: []float64{0.1, 1, 10, 100, 1000}, }, []string{"proto", "found_key"}), + udpPacketsFromClientPerLocation: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "shadowsocks", + Subsystem: "udp", + Name: "packets_from_client_per_location", + Help: "Packets received from the client, per location and status", + }, []string{"location", "status"}), udpAddedNatEntries: prometheus.NewCounter( prometheus.CounterOpts{ Namespace: "shadowsocks", @@ -154,8 +170,8 @@ func newShadowsocksMetrics(ipCountryDB *geoip2.Reader) *shadowsocksMetrics { func NewPrometheusShadowsocksMetrics(ipCountryDB *geoip2.Reader, registerer prometheus.Registerer) ShadowsocksMetrics { m := newShadowsocksMetrics(ipCountryDB) // TODO: Is it possible to pass where to register the collectors? - registerer.MustRegister(m.buildInfo, m.accessKeys, m.ports, m.tcpOpenConnections, m.tcpProbes, m.tcpClosedConnections, m.tcpConnectionDurationMs, - m.dataBytes, m.timeToCipherMs, m.udpAddedNatEntries, m.udpRemovedNatEntries) + registerer.MustRegister(m.buildInfo, m.accessKeys, m.ports, m.tcpProbes, m.tcpOpenConnections, m.tcpClosedConnections, m.tcpConnectionDurationMs, + m.dataBytes, m.dataBytesPerLocation, m.timeToCipherMs, m.udpPacketsFromClientPerLocation, m.udpAddedNatEntries, m.udpRemovedNatEntries) return m } @@ -216,9 +232,9 @@ func isFound(accessKey string) string { } // addIfNonZero helps avoid the creation of series that are always zero. -func addIfNonZero(counter prometheus.Counter, value int64) { +func addIfNonZero(value int64, counterVec *prometheus.CounterVec, lvs ...string) { if value > 0 { - counter.Add(float64(value)) + counterVec.WithLabelValues(lvs...).Add(float64(value)) } } @@ -226,25 +242,34 @@ func (m *shadowsocksMetrics) AddClosedTCPConnection(clientLocation, accessKey, s m.tcpClosedConnections.WithLabelValues(clientLocation, status, accessKey).Inc() m.tcpConnectionDurationMs.WithLabelValues(status).Observe(duration.Seconds() * 1000) m.timeToCipherMs.WithLabelValues("tcp", isFound(accessKey)).Observe(timeToCipher.Seconds() * 1000) - addIfNonZero(m.dataBytes.WithLabelValues("c>p", "tcp", clientLocation, status, accessKey), data.ClientProxy) - addIfNonZero(m.dataBytes.WithLabelValues("p>t", "tcp", clientLocation, status, accessKey), data.ProxyTarget) - addIfNonZero(m.dataBytes.WithLabelValues("pp", "tcp", accessKey) + addIfNonZero(data.ClientProxy, m.dataBytesPerLocation, "c>p", "tcp", clientLocation) + addIfNonZero(data.ProxyTarget, m.dataBytes, "p>t", "tcp", accessKey) + addIfNonZero(data.ProxyTarget, m.dataBytesPerLocation, "p>t", "tcp", clientLocation) + addIfNonZero(data.TargetProxy, m.dataBytes, "pp", "udp", clientLocation, status, accessKey), int64(clientProxyBytes)) - addIfNonZero(m.dataBytes.WithLabelValues("p>t", "udp", clientLocation, status, accessKey), int64(proxyTargetBytes)) + m.udpPacketsFromClientPerLocation.WithLabelValues(clientLocation, status).Inc() + addIfNonZero(int64(clientProxyBytes), m.dataBytes, "c>p", "udp", accessKey) + addIfNonZero(int64(clientProxyBytes), m.dataBytesPerLocation, "c>p", "udp", clientLocation) + addIfNonZero(int64(proxyTargetBytes), m.dataBytes, "p>t", "udp", accessKey) + addIfNonZero(int64(proxyTargetBytes), m.dataBytesPerLocation, "p>t", "udp", clientLocation) } func (m *shadowsocksMetrics) AddUDPPacketFromTarget(clientLocation, accessKey, status string, targetProxyBytes, proxyClientBytes int) { - addIfNonZero(m.dataBytes.WithLabelValues("p