From d0967f565b521f0301b6bf792f74446d429c439e Mon Sep 17 00:00:00 2001
From: Jim Cipar
Date: Tue, 2 Jul 2024 10:06:18 -0400
Subject: [PATCH 1/4] Modify CMakeLists to fix build errors.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change bumps our C++ version to 17 and disables specific warnings
that cause build errors.

I was hitting three build errors, possibly due to the newer compiler:

- lang/c++/impl/Compiler.cc:304:18: error: possibly dangling reference
  to a temporary [-Werror=dangling-reference]
- /usr/include/boost/math/tools/config.hpp:23:6: error: #warning "The
  minimum language standard to use Boost.Math will be C++14 starting in
  July 2023 (Boost 1.82 release)" [-Werror=cpp]
- avro/lang/c++/include/avro/Reader.hh:89:17: error:
  ‘v.avro::ReaderImpl::readValue(double&)::::d’ may be used
  uninitialized [-Werror=maybe-uninitialized]

The last one only occurred during `./build.sh install`, not during
`./build.sh test`.
---
 lang/c++/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lang/c++/CMakeLists.txt b/lang/c++/CMakeLists.txt
index 6f7b2caef7a..5617096a8cd 100644
--- a/lang/c++/CMakeLists.txt
+++ b/lang/c++/CMakeLists.txt
@@ -21,7 +21,7 @@ cmake_minimum_required (VERSION 3.1)
 set (CMAKE_LEGACY_CYGWIN_WIN32 0)
 
 if (NOT DEFINED CMAKE_CXX_STANDARD)
-  set(CMAKE_CXX_STANDARD 11)
+  set(CMAKE_CXX_STANDARD 17)
 endif()
 
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -64,7 +64,7 @@ if (WIN32 AND NOT CYGWIN AND NOT MSYS)
 endif()
 
 if (CMAKE_COMPILER_IS_GNUCXX)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Werror")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Werror -Wno-dangling-reference -Wno-maybe-uninitialized")
     if (AVRO_ADD_PROTECTOR_FLAGS)
         set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fstack-protector-all -D_GLIBCXX_DEBUG")
         # Unset _GLIBCXX_DEBUG for avrogencpp.cc because using Boost Program Options

From 98d18c398cc9736645de302052e66fd91f20b7ce Mon Sep 17 00:00:00 2001
From: Jim Cipar
Date: Tue, 2 Jul 2024 10:12:45 -0400
Subject: [PATCH 2/4] Allow custom metadata in data files.

The Iceberg spec requires custom metadata fields for Avro files. It
expects information such as the table schema, the partition information,
and the manifest type to be in the Manifest File's metadata.

This change makes the `setMetadata` method of `DataFileWriterBase` public
instead of private, and adds a similar public method to `DataFileWriter`.
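As an illustration only (not part of the diff below), here is a minimal
usage sketch of the new API. The constructor parameter and `getMetadata`
accessor are the ones added by this patch; the schema, file name, and
metadata keys are made up for the example:

    #include <avro/Compiler.hh>
    #include <avro/DataFile.hh>
    #include <cstdint>
    #include <iostream>

    int main() {
        // Hypothetical schema and file name, just for the sketch.
        avro::ValidSchema schema =
            avro::compileJsonSchemaFromString(R"({"type": "long"})");

        {
            // Key/value pairs are passed through the new customMetadata
            // argument and written into the data file header.
            avro::DataFileWriter<int64_t> writer(
                "manifest.avro", schema, 16 * 1024, avro::NULL_CODEC,
                {{"format-version", "2"}, {"content", "data"}});
            writer.write(42);
            writer.close();
        }

        {
            avro::DataFileReader<int64_t> reader("manifest.avro");
            // getMetadata returns std::optional<std::string>; it is empty
            // when the key is absent from the file header.
            if (auto content = reader.getMetadata("content")) {
                std::cout << "content = " << *content << std::endl;
            }
        }
        return 0;
    }

Existing callers are unaffected because the new customMetadata parameter
defaults to an empty map.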
---
 lang/c++/impl/DataFile.cc         | 23 ++++++++++++++++++-----
 lang/c++/include/avro/DataFile.hh | 25 +++++++++++++++++++------
 lang/c++/test/DataFileTests.cc    | 29 +++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/lang/c++/impl/DataFile.cc b/lang/c++/impl/DataFile.cc
index 45fdbb649d7..10136e6ee98 100644
--- a/lang/c++/impl/DataFile.cc
+++ b/lang/c++/impl/DataFile.cc
@@ -20,6 +20,8 @@
 #include "Compiler.hh"
 #include "Exception.hh"
 
+#include <map>
+#include <optional>
 #include <sstream>
 
 #include <boost/crc.hpp> // for boost::crc_32_type
@@ -64,7 +66,7 @@ boost::iostreams::zlib_params get_zlib_params() {
 } // namespace
 
 DataFileWriterBase::DataFileWriterBase(const char *filename, const ValidSchema &schema, size_t syncInterval,
-                                       Codec codec) : filename_(filename),
+                                       Codec codec, const std::map<std::string, std::string> &customMetadata) : filename_(filename),
                                                       schema_(schema),
                                                       encoderPtr_(binaryEncoder()),
                                                       syncInterval_(syncInterval),
@@ -74,11 +76,11 @@ DataFileWriterBase::DataFileWriterBase(const char *filename, const ValidSchema &
                                                       sync_(makeSync()),
                                                       objectCount_(0),
                                                       lastSync_(0) {
-    init(schema, syncInterval, codec);
+    init(schema, syncInterval, codec, customMetadata);
 }
 
 DataFileWriterBase::DataFileWriterBase(std::unique_ptr<OutputStream> outputStream,
-                                       const ValidSchema &schema, size_t syncInterval, Codec codec) : filename_(),
+                                       const ValidSchema &schema, size_t syncInterval, Codec codec, const std::map<std::string, std::string> &customMetadata) : filename_(),
                                                       schema_(schema),
                                                       encoderPtr_(binaryEncoder()),
                                                       syncInterval_(syncInterval),
@@ -88,10 +90,10 @@ DataFileWriterBase::DataFileWriterBase(std::unique_ptr<OutputStream> outputStrea
                                                       sync_(makeSync()),
                                                       objectCount_(0),
                                                       lastSync_(0) {
-    init(schema, syncInterval, codec);
+    init(schema, syncInterval, codec, customMetadata);
 }
 
-void DataFileWriterBase::init(const ValidSchema &schema, size_t syncInterval, const Codec &codec) {
+void DataFileWriterBase::init(const ValidSchema &schema, size_t syncInterval, const Codec &codec, const std::map<std::string, std::string> &customMetadata) {
     if (syncInterval < minSyncInterval || syncInterval > maxSyncInterval) {
         throw Exception(boost::format("Invalid sync interval: %1%. "
                                       "Should be between %2% and %3%")
@@ -111,6 +113,9 @@ void DataFileWriterBase::init(const ValidSchema &schema, size_t syncInterval, co
         throw Exception(boost::format("Unknown codec: %1%") % codec);
     }
     setMetadata(AVRO_SCHEMA_KEY, schema.toJson(false));
+    for (const auto &kv : customMetadata) {
+        setMetadata(kv.first, kv.second);
+    }
 
     writeHeader();
     encoderPtr_->init(*buffer_);
@@ -562,4 +567,12 @@ int64_t DataFileReaderBase::previousSync() const {
     return blockStart_;
 }
 
+std::optional<std::string> DataFileReaderBase::getMetadata(const std::string& key) {
+    if (auto it = metadata_.find(key); it == metadata_.cend()) {
+        return std::nullopt;
+    } else {
+        return std::string(it->second.cbegin(), it->second.cend());
+    }
+}
+
 } // namespace avro
diff --git a/lang/c++/include/avro/DataFile.hh b/lang/c++/include/avro/DataFile.hh
index 94a1dab8e31..8b5a5552a64 100644
--- a/lang/c++/include/avro/DataFile.hh
+++ b/lang/c++/include/avro/DataFile.hh
@@ -27,6 +27,7 @@
 #include "buffer/Buffer.hh"
 
 #include <map>
+#include <optional>
 #include <string>
 #include <vector>
@@ -89,7 +90,7 @@ class AVRO_DECL DataFileWriterBase : boost::noncopyable {
     /**
      * Shared constructor portion since we aren't using C++11
      */
-    void init(const ValidSchema &schema, size_t syncInterval, const Codec &codec);
+    void init(const ValidSchema &schema, size_t syncInterval, const Codec &codec, const std::map<std::string, std::string> &customMetadata);
 
 public:
     /**
@@ -118,9 +119,9 @@ public:
      * Constructs a data file writer with the given sync interval and name.
      */
     DataFileWriterBase(const char *filename, const ValidSchema &schema,
-                       size_t syncInterval, Codec codec = NULL_CODEC);
+                       size_t syncInterval, Codec codec = NULL_CODEC, const std::map<std::string, std::string> &customMetadata = {});
     DataFileWriterBase(std::unique_ptr<OutputStream> outputStream,
-                       const ValidSchema &schema, size_t syncInterval, Codec codec);
+                       const ValidSchema &schema, size_t syncInterval, Codec codec, const std::map<std::string, std::string> &customMetadata = {});
     ~DataFileWriterBase();
 
     /**
@@ -152,10 +153,10 @@ public:
     * Constructs a new data file.
     */
     DataFileWriter(const char *filename, const ValidSchema &schema,
-                   size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC) : base_(new DataFileWriterBase(filename, schema, syncInterval, codec)) {}
+                   size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC, const std::map<std::string, std::string> &customMetadata = {}) : base_(new DataFileWriterBase(filename, schema, syncInterval, codec, customMetadata)) {}
 
     DataFileWriter(std::unique_ptr<OutputStream> outputStream, const ValidSchema &schema,
-                   size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC) : base_(new DataFileWriterBase(std::move(outputStream), schema, syncInterval, codec)) {}
+                   size_t syncInterval = 16 * 1024, Codec codec = NULL_CODEC, const std::map<std::string, std::string> &customMetadata = {}) : base_(new DataFileWriterBase(std::move(outputStream), schema, syncInterval, codec, customMetadata)) {}
 
     /**
      * Writes the given piece of data into the file.
@@ -206,7 +207,6 @@ class AVRO_DECL DataFileReaderBase : boost::noncopyable {
     DecoderPtr dataDecoder_;
     std::unique_ptr<InputStream> dataStream_;
     typedef std::map<std::string, std::vector<uint8_t>> Metadata;
-
     Metadata metadata_;
     DataFileSync sync_{};
 
@@ -270,6 +270,12 @@ public:
      */
     const ValidSchema &dataSchema() { return dataSchema_; }
+
+    /**
+     * Gets a metadata key/value pair for the file.
+     */
+    std::optional<std::string> getMetadata(const std::string& key);
+
 
     /**
      * Closes the reader. No further operation is possible on this reader.
      */
@@ -382,6 +388,13 @@ public:
      */
     const ValidSchema &dataSchema() { return base_->dataSchema(); }
 
+    /**
+     * Gets a metadata key/value pair for the file.
+     */
+    std::optional<std::string> getMetadata(const std::string& key) {
+        return base_->getMetadata(key);
+    }
+
     /**
      * Closes the reader. No further operation is possible on this reader.
      */
diff --git a/lang/c++/test/DataFileTests.cc b/lang/c++/test/DataFileTests.cc
index a29a6f9c2a2..724a682b7fd 100644
--- a/lang/c++/test/DataFileTests.cc
+++ b/lang/c++/test/DataFileTests.cc
@@ -658,6 +658,26 @@ class DataFileTest {
             BOOST_CHECK_EQUAL(root->leafAt(5)->getDoc(), "extra slashes\\\\");
         }
     }
+
+    void testWriteCustomMetadata() {
+        uint32_t a = 42;
+        {
+            avro::DataFileWriter<uint32_t> df(filename, writerSchema, 16 * 1024, avro::NULL_CODEC, {{"test_key_1", "test_value_1"}, {"test_key_2", "test_value_2"}});
+            df.write(a);
+        }
+
+        {
+
+            avro::DataFileReader<uint32_t> df(filename);
+            auto val_1 = df.getMetadata("test_key_1");
+            BOOST_CHECK(val_1.has_value());
+            BOOST_CHECK_EQUAL(val_1.value(), "test_value_1");
+
+            auto val_2 = df.getMetadata("test_key_2");
+            BOOST_CHECK(val_2.has_value());
+            BOOST_CHECK_EQUAL(val_2.value(), "test_value_2");
+        }
+    }
 };
 
 void addReaderTests(test_suite *ts, const shared_ptr<DataFileTest> &t) {
@@ -1126,6 +1146,15 @@ init_unit_test_suite(int argc, char *argv[]) {
         boost::unit_test::framework::master_test_suite().add(ts);
     }
 
+    {
+        auto *ts = BOOST_TEST_SUITE("DataFile tests: test13.df");
+        shared_ptr<DataFileTest> t(new DataFileTest("test13.df", ischWithDoc, ischWithDoc));
+        ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testWrite, t));
+        ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testWriteCustomMetadata, t));
+        ts->add(BOOST_CLASS_TEST_CASE(&DataFileTest::testCleanup, t));
+        boost::unit_test::framework::master_test_suite().add(ts);
+    }
+
     boost::unit_test::framework::master_test_suite().add(BOOST_TEST_CASE(&testSkipStringNullCodec));
     boost::unit_test::framework::master_test_suite().add(BOOST_TEST_CASE(&testSkipStringDeflateCodec));
 #ifdef SNAPPY_CODEC_AVAILABLE

From f2110ac837717fa0e6a919d6a8e9ee0190610b88 Mon Sep 17 00:00:00 2001
From: Jim Cipar
Date: Tue, 2 Jul 2024 10:43:05 -0400
Subject: [PATCH 3/4] Add `map` logical type for arrays.

Maps can be represented by either the built-in map type or an array of
key-value pairs with the logical type `map`. The Iceberg "column_sizes"
field is an example of the latter. This change allows arrays to be
annotated with the `map` logical type, which was previously not
supported in C++ Avro.
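For illustration only (not part of the diff below), this sketch shows the
kind of schema that becomes valid with this change, modeled loosely on
Iceberg's "column_sizes" field; the record and field names are made up:

    #include <avro/Compiler.hh>
    #include <avro/LogicalType.hh>
    #include <avro/ValidSchema.hh>
    #include <iostream>

    int main() {
        // An array of {key, value} records annotated as a logical map.
        const char *schemaJson = R"({
            "type": "array",
            "logicalType": "map",
            "items": {
                "type": "record",
                "name": "k1_v2",
                "fields": [
                    {"name": "key", "type": "int"},
                    {"name": "value", "type": "long"}
                ]
            }
        })";

        avro::ValidSchema schema = avro::compileJsonSchemaFromString(schemaJson);
        if (schema.root()->logicalType().type() == avro::LogicalType::MAP) {
            std::cout << "array is annotated as a logical map" << std::endl;
        }
        return 0;
    }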
---
 lang/c++/impl/Compiler.cc            |  2 ++
 lang/c++/impl/LogicalType.cc         |  3 +++
 lang/c++/impl/Node.cc                |  6 ++++++
 lang/c++/impl/NodeImpl.cc            |  5 +++++
 lang/c++/include/avro/LogicalType.hh |  3 ++-
 lang/c++/test/SchemaTests.cc         | 29 ++++++++++++++++++++++++++++
 6 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/lang/c++/impl/Compiler.cc b/lang/c++/impl/Compiler.cc
index cde3fd38312..74050740169 100644
--- a/lang/c++/impl/Compiler.cc
+++ b/lang/c++/impl/Compiler.cc
@@ -362,6 +362,8 @@ static LogicalType makeLogicalType(const Entity &e, const Object &m) {
         t = LogicalType::DURATION;
     else if (typeField == "uuid")
         t = LogicalType::UUID;
+    else if (typeField == "map")
+        t = LogicalType::MAP;
 
     return LogicalType(t);
 }
diff --git a/lang/c++/impl/LogicalType.cc b/lang/c++/impl/LogicalType.cc
index 1aa24bf20de..90297aa6528 100644
--- a/lang/c++/impl/LogicalType.cc
+++ b/lang/c++/impl/LogicalType.cc
@@ -77,6 +77,9 @@ void LogicalType::printJson(std::ostream &os) const {
         case UUID:
             os << R"("logicalType": "uuid")";
             break;
+        case MAP:
+            os << R"("logicalType": "map")";
+            break;
     }
 }
diff --git a/lang/c++/impl/Node.cc b/lang/c++/impl/Node.cc
index 46310d0f9ef..f7a62bdb2fb 100644
--- a/lang/c++/impl/Node.cc
+++ b/lang/c++/impl/Node.cc
@@ -142,6 +142,12 @@ void Node::setLogicalType(LogicalType logicalType) {
                                 "STRING type");
             }
             break;
+        case LogicalType::MAP:
+            if (type_ != AVRO_ARRAY) {
+                throw Exception("MAP logical type can only annotate "
+                                "ARRAY type");
+            }
+            break;
     }
 
     logicalType_ = logicalType;
diff --git a/lang/c++/impl/NodeImpl.cc b/lang/c++/impl/NodeImpl.cc
index fc2909390b3..9b22b4de253 100644
--- a/lang/c++/impl/NodeImpl.cc
+++ b/lang/c++/impl/NodeImpl.cc
@@ -519,6 +519,11 @@ void NodeArray::printJson(std::ostream &os, size_t depth) const {
         os << indent(depth + 1) << R"("doc": ")" << escape(getDoc()) << "\",\n";
     }
+    if (logicalType().type() != LogicalType::NONE) {
+        os << indent(depth);
+        logicalType().printJson(os);
+        os << ",\n";
+    }
     os << indent(depth + 1) << "\"items\": ";
     leafAttributes_.get()->printJson(os, depth + 1);
     os << '\n';
diff --git a/lang/c++/include/avro/LogicalType.hh b/lang/c++/include/avro/LogicalType.hh
index 4d06e74f635..b28206c1c38 100644
--- a/lang/c++/include/avro/LogicalType.hh
+++ b/lang/c++/include/avro/LogicalType.hh
@@ -36,7 +36,8 @@ public:
         TIMESTAMP_MILLIS,
         TIMESTAMP_MICROS,
         DURATION,
-        UUID
+        UUID,
+        MAP
     };
 
     explicit LogicalType(Type type);
diff --git a/lang/c++/test/SchemaTests.cc b/lang/c++/test/SchemaTests.cc
index 4f30c8da15a..fd1a11f5f3a 100755
--- a/lang/c++/test/SchemaTests.cc
+++ b/lang/c++/test/SchemaTests.cc
@@ -380,6 +380,26 @@ static void testLogicalTypes() {
     const char* unionType = "[\n\
 {\"type\":\"string\", \"logicalType\":\"uuid\"},\"null\"\n\
 ]";
+    const char* arrayMapType = R"({
+        "type": "array",
+        "logicalType": "map",
+        "items": {
+            "type": "record",
+            "name": "k1_v2",
+            "fields": [
+                {
+                    "name": "key",
+                    "type": "int",
+                    "field-id": "1"
+                },
+                {
+                    "name": "value",
+                    "type": "long",
+                    "field-id": "2"
+                }
+            ]
+        }
+    })";
     {
         BOOST_TEST_CHECKPOINT(bytesDecimalType);
         ValidSchema schema1 = compileJsonSchemaFromString(bytesDecimalType);
@@ -474,6 +494,15 @@ static void testLogicalTypes() {
         GenericDatum datum(schema);
         BOOST_CHECK(datum.logicalType().type() == LogicalType::UUID);
     }
+    {
+        BOOST_TEST_CHECKPOINT(arrayMapType);
+        ValidSchema schema = compileJsonSchemaFromString(arrayMapType);
+        BOOST_CHECK(schema.root()->type() == AVRO_ARRAY);
+        LogicalType logicalType = schema.root()->logicalType();
+        BOOST_CHECK(logicalType.type() == LogicalType::MAP);
+        GenericDatum datum(schema);
+        BOOST_CHECK(datum.logicalType().type() == LogicalType::MAP);
+    }
 }
 
 static void testMalformedLogicalTypes(const char *schema) {

From 966d457e725b5decdafdd453934af85999d7ac57 Mon Sep 17 00:00:00 2001
From: Jim Cipar
Date: Tue, 2 Jul 2024 11:10:50 -0400
Subject: [PATCH 4/4] Add element-id support for Avro arrays.

`element-id` is an optional field used by Iceberg arrays and required by
some Iceberg implementations.
---
 lang/c++/impl/Compiler.cc         | 8 ++++++--
 lang/c++/impl/NodeImpl.cc         | 4 ++++
 lang/c++/include/avro/NodeImpl.hh | 2 ++
 lang/c++/test/SchemaTests.cc      | 1 +
 4 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/lang/c++/impl/Compiler.cc b/lang/c++/impl/Compiler.cc
index 74050740169..3e29d0792c8 100644
--- a/lang/c++/impl/Compiler.cc
+++ b/lang/c++/impl/Compiler.cc
@@ -423,8 +423,12 @@ static NodePtr makeFixedNode(const Entity &e,
 static NodePtr makeArrayNode(const Entity &e, const Object &m,
                              SymbolTable &st, const string &ns) {
     auto it = findField(e, m, "items");
-    NodePtr node = NodePtr(std::make_shared<NodeArray>(
-        asSingleAttribute(makeNode(it->second, st, ns))));
+    std::shared_ptr<NodeArray> nodePointer = std::make_shared<NodeArray>(
+        asSingleAttribute(makeNode(it->second, st, ns)));
+    NodePtr node = NodePtr(nodePointer);
+    if (containsField(m, "element-id")) {
+        nodePointer->elementId_ = getStringField(e, m, "element-id");
+    }
     if (containsField(m, "doc")) {
         node->setDoc(getDocField(e, m));
     }
diff --git a/lang/c++/impl/NodeImpl.cc b/lang/c++/impl/NodeImpl.cc
index 9b22b4de253..3a0673e1a09 100644
--- a/lang/c++/impl/NodeImpl.cc
+++ b/lang/c++/impl/NodeImpl.cc
@@ -524,6 +524,10 @@ void NodeArray::printJson(std::ostream &os, size_t depth) const {
         logicalType().printJson(os);
         os << ",\n";
     }
+    if (!elementId_.empty()) {
+        os << indent(depth + 1) << R"("element-id": ")"
+           << escape(elementId_) << "\",\n";
+    }
     os << indent(depth + 1) << "\"items\": ";
     leafAttributes_.get()->printJson(os, depth + 1);
     os << '\n';
diff --git a/lang/c++/include/avro/NodeImpl.hh b/lang/c++/include/avro/NodeImpl.hh
index 34875fb602e..3ee178351cb 100644
--- a/lang/c++/include/avro/NodeImpl.hh
+++ b/lang/c++/include/avro/NodeImpl.hh
@@ -416,6 +416,8 @@ public:
     }
 
     void printDefaultToJson(const GenericDatum &g, std::ostream &os, size_t depth) const override;
+
+    std::string elementId_;
 };
 
 class AVRO_DECL NodeMap : public NodeImplMap {
diff --git a/lang/c++/test/SchemaTests.cc b/lang/c++/test/SchemaTests.cc
index fd1a11f5f3a..a45d34880ad 100755
--- a/lang/c++/test/SchemaTests.cc
+++ b/lang/c++/test/SchemaTests.cc
@@ -199,6 +199,7 @@ const char *roundTripSchemas[] = {
     R"({"type":"array","items":"long"})",
     "{\"type\":\"array\",\"items\":{\"type\":\"enum\","
     "\"name\":\"Test\",\"symbols\":[\"A\",\"B\"]}}",
+    R"({"type":"array","element-id":"testElemId","items":"long"})",
 
     // Map
     R"({"type":"map","values":"long"})",
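
For reference, a standalone sketch (not part of the patches above; the
schema and names are illustrative) showing how the `element-id` attribute
and the `map` logical type round-trip through schema compilation and
printing once patches 3 and 4 are applied. Note that this patch reads
`element-id` with getStringField, so the value must be a JSON string:

    #include <avro/Compiler.hh>
    #include <avro/ValidSchema.hh>
    #include <iostream>

    int main() {
        // Array annotated with both attributes added by patches 3 and 4.
        const char *schemaJson = R"({
            "type": "array",
            "logicalType": "map",
            "element-id": "testElemId",
            "items": {
                "type": "record",
                "name": "k1_v2",
                "fields": [
                    {"name": "key", "type": "int"},
                    {"name": "value", "type": "long"}
                ]
            }
        })";

        avro::ValidSchema schema = avro::compileJsonSchemaFromString(schemaJson);

        // With the NodeArray::printJson changes, both "logicalType" and
        // "element-id" should appear in the printed schema.
        std::cout << schema.toJson(true) << std::endl;
        return 0;
    }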