y-scope · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/components/clp-py-utils/clp_py_utils/initialize-results-cache.py b/components/clp-py-utils/clp_py_utils/initialize-results-cache.py
@@ -1,6 +1,7 @@
 import argparse
 import logging
 import sys
+from typing import Tuple
 from urllib.parse import urlparse
 
 from pymongo import IndexModel, MongoClient
@@ -17,7 +18,7 @@
 logger.addHandler(logging_console_handler)
 
 
-def check_replica_set_status(client: MongoClient, netloc: str) -> tuple[bool, bool]:
+def check_replica_set_status(client: MongoClient, netloc: str) -> Tuple[bool, bool]:
     """
     Checks the current replica set status of the MongoDB server and determines whether it needs to
     be configured (or reconfigured).

diff --git a/components/core/src/clp/ffi/ir_stream/Serializer.cpp b/components/core/src/clp/ffi/ir_stream/Serializer.cpp
@@ -550,6 +550,9 @@ auto Serializer<encoded_variable_t>::create(
             cVariableEncodingMethodsVersion
     );
     if (optional_user_defined_metadata.has_value()) {
+        if (false == optional_user_defined_metadata.value().is_object()) {
+            return std::errc::protocol_not_supported;
+        }
         metadata.emplace(
                 string{cProtocol::Metadata::UserDefinedMetadataKey},
                 std::move(optional_user_defined_metadata.value())

diff --git a/components/core/src/clp/ffi/ir_stream/Serializer.hpp b/components/core/src/clp/ffi/ir_stream/Serializer.hpp
@@ -41,9 +41,11 @@ class Serializer {
     // Factory functions
     /**
      * Creates an IR serializer and serializes the stream's preamble.
-     * @param optional_user_defined_metadata Stream-level user-defined metadata.
+     * @param optional_user_defined_metadata Stream-level user-defined metadata, given as a JSON
+     * object.
      * @return A result containing the serializer or an error code indicating the failure:
-     * - std::errc::protocol_error on failure to serialize the preamble.
+     * - std::errc::protocol_error if the stream's metadata couldn't be serialized.
+     * - std::errc::protocol_not_supported if the given user-defined metadata is not a JSON object.
      */
     [[nodiscard]] static auto create(
             std::optional<nlohmann::json> optional_user_defined_metadata = std::nullopt

diff --git a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp
@@ -1494,3 +1494,23 @@ TEMPLATE_TEST_CASE(
     };
     REQUIRE(assert_invalid_serialization(array_with_invalid_submap));
 }
+
+// NOLINTNEXTLINE(readability-function-cognitive-complexity)
+TEMPLATE_TEST_CASE(
+        "ffi_ir_stream_Serializer_serialize_invalid_user_defined_metadata",
+        "[clp][ffi][ir_stream][Serializer]",
+        four_byte_encoded_variable_t,
+        eight_byte_encoded_variable_t
+) {
+    auto invalid_user_defined_metadata = GENERATE(
+            nlohmann::json(std::string{"str"}),
+            nlohmann::json(int{0}),
+            nlohmann::json(double{0.0}),
+            nlohmann::json(true),
+            nlohmann::json(nullptr),
+            nlohmann::json(vector<int>{0, 1, 2})
+    );
+    auto const serializer_result{Serializer<TestType>::create(invalid_user_defined_metadata)};
+    REQUIRE(serializer_result.has_error());
+    REQUIRE((std::errc::protocol_not_supported == serializer_result.error()));
+}
diff --git a/docs/src/user-guide/guides-overview.md b/docs/src/user-guide/guides-overview.md
@@ -0,0 +1,14 @@
+# Overview
+
+The guides below describe how to use CLP for different use cases.
+
+::::{grid} 1 1 2 2
+:gutter: 2
+
+:::{grid-item-card}
+:link: guides-using-object-storage/index
+Using object storage
+^^^
+Using CLP to ingest logs from object storage and store archives on object storage.
+:::
+::::
diff --git a/docs/src/user-guide/guides-using-object-storage/clp-config.md b/docs/src/user-guide/guides-using-object-storage/clp-config.md
@@ -0,0 +1,78 @@
+# Configuring CLP
+
+To use object storage with CLP, follow the steps below to configure each use case you require.
+
+:::{note}
+If CLP is already running, shut it down, update its configuration, and then start it again.
+:::
+
+## Configuration for archive storage
+
+To configure CLP to store archives on S3, update the `archive_output.storage` key in
+`<package>/etc/clp-config.yml` with the values in the code block below, replacing the fields in
+angle brackets (`<>`) with the appropriate values:
+
+```yaml
+archive_output:
+  storage:
+    type: "s3"
+    staging_directory: "var/data/staged-archives"  # Or a path of your choosing
+    s3_config:
+      region_code: "<region-code>"
+      bucket: "<bucket-name>"
+      key_prefix: "<key-prefix>"
+      credentials:
+        access_key_id: "<aws-access-key-id>"
+        secret_access_key: "<aws-secret-access-key>"
+
+  # archive_output's other config keys
+```
+
+* `staging_directory` is the local filesystem directory where archives will be temporarily stored
+  before being uploaded to S3.
+* `s3_config` configures both the S3 bucket where archives should be stored and the credentials
+  for accessing it.
+  * `<region-code>` is the AWS region [code][aws-region-codes] for the bucket.
+  * `<bucket-name>` is the bucket's name.
+  * `<key-prefix>` is the "directory" where all archives will be stored within the bucket and
+    must end with a trailing forward slash (e.g., `archives/`).
+  * `credentials` contains the CLP IAM user's credentials.
+
+## Configuration for stream storage
+
+To configure CLP to cache stream files on S3, update the `stream_output.storage` key in
+`<package>/etc/clp-config.yml` with the values in the code block below, replacing the fields in
+angle brackets (`<>`) with the appropriate values:
+
+```yaml
+stream_output:
+  storage:
+    type: "s3"
+    staging_directory: "var/data/staged-streams"  # Or a path of your choosing
+    s3_config:
+      region_code: "<region-code>"
+      bucket: "<bucket-name>"
+      key_prefix: "<key-prefix>"
+      credentials:
+        access_key_id: "<aws-access-key-id>"
+        secret_access_key: "<aws-secret-access-key>"
+
+  # stream_output's other config keys
+```
+
+* `staging_directory` is the local filesystem directory where streams will be temporarily stored
+  before being uploaded to S3.
+* `s3_config` configures both the S3 bucket where streams should be stored and the credentials
+  for accessing it.
+  * `<region-code>` is the AWS region [code][aws-region-codes] for the bucket.
+  * `<bucket-name>` is the bucket's name.
+  * `<key-prefix>` is the "directory" where all streams will be stored within the bucket and
+    must end with a trailing forward slash (e.g., `streams/`).
+  * `credentials` contains the CLP IAM user's credentials.
+
+:::{note}
+CLP currently doesn't explicitly delete the cached streams. This limitation will be addressed in a
+future release.
+:::
+
+[aws-region-codes]: https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.RegionsAndAvailabilityZones.html#Concepts.RegionsAndAvailabilityZones.Availability
diff --git a/docs/src/user-guide/guides-using-object-storage/clp-usage.md b/docs/src/user-guide/guides-using-object-storage/clp-usage.md
@@ -0,0 +1,52 @@
+# Using CLP with object storage
+
+To compress logs from S3, follow the steps in the section below. For all other operations, you
+should be able to use CLP as described in the [quick-start guide](../quick-start-overview.md).
+
+## Compressing logs from S3
+
+To compress logs from S3, use the `s3` subcommand as follows, replacing the fields in angle brackets
+(`<>`) with the appropriate values:
+
+```bash
+sbin/compress.sh \
+  s3 \
+  --aws-credentials-file <credentials-file> \
+  --timestamp-key <timestamp-key> \
+  https://<bucket-name>.s3.<region-code>.amazonaws.com/<prefix>
+```
+
+* `<credentials-file>` is the path to an AWS credentials file like the following:
+
+  ```ini
+  [default]
+  aws_access_key_id = <aws-access-key-id>
+  aws_secret_access_key = <aws-secret-access-key>
+  ```
+
+  * CLP expects the credentials to be in the `default` section.
+  * `<aws-access-key-id>` and `<aws-secret-access-key>` are the access key ID and secret access
+    key of the CLP IAM user.
+  * If you don't want to use a credentials file, you can specify the credentials on the command
+    line using the `--aws-access-key-id` and `--aws-secret-access-key` flags (note that this may
+    expose your credentials to other users running on the system).
+
+* `<timestamp-key>` is the field path of the kv-pair that contains the timestamp in each log event.
+* `<bucket-name>` is the name of the S3 bucket containing your logs.
+* `<region-code>` is the AWS region [code][aws-region-codes] for the S3 bucket containing your logs.
+* `<prefix>` is the prefix of all logs you wish to compress and must begin with the
+  `<all-logs-prefix>` value from the [compression IAM policy][compression-iam-policy].
+
+:::{note}
+The `s3` subcommand only supports a single URL but will compress any logs that have the given
+prefix.
+
+If you wish to compress a single log file, specify the entire path to the log file. However, if that
+log file's path is a prefix of another log file's path, then both log files will be compressed
+(e.g., with two files "logs/syslog" and "logs/syslog.1", a prefix like "logs/syslog" will cause
+both logs to be compressed). This limitation will be addressed in a future release.
+:::
+
+[add-iam-policy]: https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies_manage-attach-detach.html#embed-inline-policy-console
+[aws-region-codes]: https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.RegionsAndAvailabilityZones.html#Concepts.RegionsAndAvailabilityZones.Availability
+[compression-iam-policy]: ./object-storage-config.md#configuration-for-compression
diff --git a/docs/src/user-guide/guides-using-object-storage/index.md b/docs/src/user-guide/guides-using-object-storage/index.md
@@ -0,0 +1,95 @@
+# Using object storage
+
+CLP can:
+
+* compress logs from object storage (e.g., S3);
+* store archives on object storage; and
+* cache stream files (used for viewing compressed logs) on object storage.
+
+This guide explains how to configure and use CLP for all three use cases. Note that you can choose
+to use object storage for any combination of the three use cases (e.g., compress logs from S3 and
+cache the stream files on S3, but store archives on the local filesystem).
+
+:::{note}
+Currently, only the [clp-json][release-choices] release supports object storage. Support for
+`clp-text` will be added in a future release.
+:::
+
+:::{note}
+Currently, CLP only supports using S3 as object storage. Support for other object storage services
+will be added in a future release.
+:::
+
+## Prerequisites
+
+1. This guide assumes you're able to configure, start, stop, and use a CLP cluster as described in
+   the [quick-start guide](../quick-start-overview.md).
+2. An S3 bucket and [key prefix][aws-key-prefixes] containing the logs you wish to compress.
+3. An S3 bucket and key prefix where you wish to store compressed archives.
+4. An S3 bucket and key prefix where you wish to cache stream files.
+5. An AWS IAM user with the necessary permissions to access the S3 bucket(s) and prefixes mentioned
+   above.
+    * To create a user, follow [this guide][aws-create-iam-user].
+      * You don't need to assign any groups or policies to the user at this stage since we will
+        attach policies in later steps, depending on which object storage use cases you require.
+    * You may use a single IAM user for all use cases, or a separate one for each.
+    * For brevity, we'll refer to this user as the "CLP IAM user" in the rest of this guide.
+6. IAM user (long-term) credentials for the IAM user(s) created in step (5) above.
+    * To create these credentials, follow [this guide][aws-create-access-keys].
+      * Choose the "Other" use case to generate long-term credentials.
+
+    :::{note}
+    CLP currently requires IAM user (long-term) credentials to access the relevant S3 buckets.
+    Support for other authentication methods (e.g., temporary credentials) will be added in a future
+    release.
+    :::
+
+## Configuration
+
+The subsections below explain how to configure your object storage bucket and CLP for each use case:
+
+::::{grid} 1 1 1 1
+:gutter: 2
+
+:::{grid-item-card}
+:link: object-storage-config
+Configuring object storage
+^^^
+Configuring your object storage bucket for each use case.
+:::
+
+:::{grid-item-card}
+:link: clp-config
+Configuring CLP
+^^^
+Configuring CLP to use object storage for each use case.
+:::
+::::
+
+## Using CLP with object storage
+
+The subsection below explains how to use CLP with object storage for each use case:
+
+::::{grid} 1 1 1 1
+:gutter: 2
+
+:::{grid-item-card}
+:link: clp-usage
+Using CLP with object storage
+^^^
+Using CLP to compress, search, and view log files from object storage.
+:::
+::::
+
+:::{toctree}
+:hidden:
+
+object-storage-config
+clp-config
+clp-usage
+:::
+
+[aws-create-access-keys]: https://docs.aws.amazon.com/keyspaces/latest/devguide/create.keypair.html
+[aws-create-iam-user]: https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html
+[aws-key-prefixes]: https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-prefixes.html
+[release-choices]: ../quick-start-cluster-setup/index.md#choosing-a-release