Skip to content

Commit

Permalink
Add Schema.JSON, Schema.STRING and Schema.BYTES support to merge-key-value (#94)
Browse files Browse the repository at this point in the history

* Add Schema.JSON support to merge-key-value

* Add support for STRING and BYTES to merge-key-value
  • Loading branch information
cbornet authored Jul 4, 2023
1 parent 47a8338 commit 109f399
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 48 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ Output: `{name: value1} (AVRO)`

### Merge KeyValue

Merges the fields of KeyValue records where both the key and value are structured types of the same schema type. (Currently only AVRO is supported).
Merges the fields of KeyValue records where both the key and value are structured types of the same schema type. (Currently only AVRO and JSON are supported).

Step name: `merge-key-value`

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
package com.datastax.oss.pulsar.functions.transforms;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
Expand All @@ -25,7 +28,7 @@
import org.apache.pulsar.common.schema.SchemaType;

public class MergeKeyValueStep implements TransformStep {

public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private final Map<org.apache.avro.Schema, Map<org.apache.avro.Schema, org.apache.avro.Schema>>
schemaCache = new ConcurrentHashMap<>();

Expand All @@ -35,61 +38,77 @@ public void process(TransformContext transformContext) {
if (keySchema == null) {
return;
}
if (keySchema.getSchemaInfo().getType() == SchemaType.AVRO
Object keyObject = transformContext.getKeyObject();
Object valueObject = transformContext.getValueObject();
if (keyObject instanceof Map && valueObject instanceof Map) {
Map<Object, Object> value = (Map<Object, Object>) valueObject;
Map<String, Object> keyCopy =
OBJECT_MAPPER.convertValue(keyObject, new TypeReference<>() {});
keyCopy.forEach(value::putIfAbsent);
} else if (keySchema.getSchemaInfo().getType() == SchemaType.AVRO
&& transformContext.getValueSchema().getSchemaInfo().getType() == SchemaType.AVRO) {
GenericRecord avroKeyRecord = (GenericRecord) transformContext.getKeyObject();
GenericRecord avroKeyRecord = (GenericRecord) keyObject;
org.apache.avro.Schema avroKeySchema = avroKeyRecord.getSchema();

GenericRecord avroValueRecord = (GenericRecord) transformContext.getValueObject();
GenericRecord avroValueRecord = (GenericRecord) valueObject;
org.apache.avro.Schema avroValueSchema = avroValueRecord.getSchema();

List<String> valueSchemaFieldNames =
avroValueSchema
.getFields()
.stream()
.map(org.apache.avro.Schema.Field::name)
.collect(Collectors.toList());
List<org.apache.avro.Schema.Field> fields =
avroKeySchema
.getFields()
.stream()
.filter(field -> !valueSchemaFieldNames.contains(field.name()))
.map(
f ->
new org.apache.avro.Schema.Field(
f.name(), f.schema(), f.doc(), f.defaultVal(), f.order()))
.collect(Collectors.toList());
fields.addAll(
avroValueSchema
.getFields()
.stream()
.map(
f ->
new org.apache.avro.Schema.Field(
f.name(), f.schema(), f.doc(), f.defaultVal(), f.order()))
.collect(Collectors.toList()));

Map<org.apache.avro.Schema, org.apache.avro.Schema> schemaCacheKey =
schemaCache.computeIfAbsent(avroKeySchema, s -> new ConcurrentHashMap<>());
org.apache.avro.Schema modified =
schemaCacheKey.computeIfAbsent(
avroValueSchema,
schema ->
org.apache.avro.Schema.createRecord(
avroValueSchema.getName(),
null,
avroValueSchema.getNamespace(),
false,
fields));
GenericRecord newRecord = new GenericData.Record(modified);
for (String fieldName : valueSchemaFieldNames) {
newRecord.put(fieldName, avroValueRecord.get(fieldName));
}
org.apache.avro.Schema mergedSchema = getMergedSchema(avroKeySchema, avroValueSchema);
GenericRecord newRecord = new GenericData.Record(mergedSchema);
avroValueSchema
.getFields()
.forEach(field -> newRecord.put(field.name(), avroValueRecord.get(field.name())));
for (org.apache.avro.Schema.Field field : avroKeySchema.getFields()) {
newRecord.put(field.name(), avroKeyRecord.get(field.name()));
if (avroValueSchema.getField(field.name()) == null) {
newRecord.put(field.name(), avroKeyRecord.get(field.name()));
}
}
transformContext.setValueObject(newRecord);
transformContext.setValueModified(true);
} else if (keySchema.getSchemaInfo().getType() == SchemaType.JSON
&& transformContext.getValueSchema().getSchemaInfo().getType() == SchemaType.JSON) {
org.apache.avro.Schema avroKeySchema =
(org.apache.avro.Schema) keySchema.getNativeSchema().orElseThrow();
org.apache.avro.Schema avroValueSchema =
(org.apache.avro.Schema)
transformContext.getValueSchema().getNativeSchema().orElseThrow();
org.apache.avro.Schema mergedSchema = getMergedSchema(avroKeySchema, avroValueSchema);
transformContext.setValueSchema(new JsonNodeSchema(mergedSchema));
ObjectNode newValue = ((ObjectNode) keyObject).deepCopy();
newValue.setAll(((ObjectNode) valueObject).deepCopy());
transformContext.setValueObject(newValue);
transformContext.setValueModified(true);
}
}

/**
 * Returns the Avro record schema obtained by merging the key schema's fields into the value
 * schema's fields. On a name collision the value field wins: key fields whose name already
 * exists in the value schema are dropped, and every value field is kept as-is.
 *
 * <p>Results are memoized in {@code schemaCache}, keyed by the (key schema, value schema)
 * pair, so each distinct pair is merged at most once. The field list is built inside the
 * cache-miss lambda so a cache hit performs no stream or allocation work.
 *
 * @param avroKeySchema Avro schema of the KeyValue key
 * @param avroValueSchema Avro schema of the KeyValue value
 * @return a record schema named after the value schema, containing the merged fields
 */
private org.apache.avro.Schema getMergedSchema(
    org.apache.avro.Schema avroKeySchema, org.apache.avro.Schema avroValueSchema) {
  Map<org.apache.avro.Schema, org.apache.avro.Schema> schemaCacheKey =
      schemaCache.computeIfAbsent(avroKeySchema, s -> new ConcurrentHashMap<>());
  return schemaCacheKey.computeIfAbsent(
      avroValueSchema,
      schema -> {
        List<org.apache.avro.Schema.Field> fields =
            avroKeySchema
                .getFields()
                .stream()
                // value fields take precedence: skip key fields shadowed by the value schema
                .filter(field -> avroValueSchema.getField(field.name()) == null)
                .map(MergeKeyValueStep::copyField)
                .collect(Collectors.toList());
        avroValueSchema.getFields().stream().map(MergeKeyValueStep::copyField).forEach(fields::add);
        return org.apache.avro.Schema.createRecord(
            avroValueSchema.getName(), null, avroValueSchema.getNamespace(), false, fields);
      });
}

/** Copies a field so it can be attached to a new record schema (Avro fields are single-owner). */
private static org.apache.avro.Schema.Field copyField(org.apache.avro.Schema.Field f) {
  return new org.apache.avro.Schema.Field(
      f.name(), f.schema(), f.doc(), f.defaultVal(), f.order());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import static org.testng.Assert.assertNotSame;
import static org.testng.Assert.assertSame;

import com.fasterxml.jackson.databind.JsonNode;
import java.nio.charset.StandardCharsets;
import org.apache.avro.generic.GenericData;
import org.apache.pulsar.client.api.Schema;
import org.apache.pulsar.client.api.schema.GenericObject;
Expand Down Expand Up @@ -52,6 +54,81 @@ void testKeyValueAvro() throws Exception {
assertSame(messageValue.getKey(), recordValue.getKey());
}

@Test
void testKeyValueJson() throws Exception {
  // Run a JSON-schema KeyValue record through the merge step.
  Record<GenericObject> inputRecord = Utils.createTestJsonKeyValueRecord();
  Record<?> outputRecord = Utils.process(inputRecord, new MergeKeyValueStep());
  KeyValueSchema<?, ?> outputSchema = (KeyValueSchema<?, ?>) outputRecord.getSchema();
  KeyValue<?, ?> outputValue = (KeyValue<?, ?>) outputRecord.getValue();

  // The merged value must expose the fields of both the key and the original value.
  JsonNode merged = (JsonNode) outputValue.getValue();
  assertEquals(merged.get("keyField1").asText(), "key1");
  assertEquals(merged.get("valueField1").asText(), "value1");
  assertEquals(merged.get("keyField2").asText(), "key2");
  assertEquals(merged.get("valueField2").asText(), "value2");
  assertEquals(merged.get("keyField3").asText(), "key3");
  assertEquals(merged.get("valueField3").asText(), "value3");

  // The key side (both schema and object) must be passed through untouched.
  KeyValueSchema<?, ?> inputSchema = (KeyValueSchema) inputRecord.getSchema();
  KeyValue<?, ?> inputValue = (KeyValue<?, ?>) inputRecord.getValue().getNativeObject();
  assertSame(outputSchema.getKeySchema(), inputSchema.getKeySchema());
  assertSame(outputValue.getKey(), inputValue.getKey());
}

@Test
void testKeyValueStringJson() throws Exception {
  // STRING key and value carrying JSON text should be merged as JSON objects.
  String jsonKey = "{\"keyField1\": \"key1\", \"keyField2\": \"key2\", \"keyField3\": \"key3\"}";
  String jsonValue =
      "{\"valueField1\": \"value1\", \"valueField2\": \"value2\", \"valueField3\": \"value3\"}";

  Schema<KeyValue<String, String>> schema =
      Schema.KeyValue(Schema.STRING, Schema.STRING, KeyValueEncodingType.SEPARATED);
  Record<GenericObject> inputRecord =
      new Utils.TestRecord<>(
          schema,
          AutoConsumeSchema.wrapPrimitiveObject(
              new KeyValue<>(jsonKey, jsonValue), SchemaType.KEY_VALUE, new byte[] {}),
          null);

  Record<?> outputRecord = Utils.process(inputRecord, new MergeKeyValueStep());
  KeyValue<?, ?> outputValue = (KeyValue<?, ?>) outputRecord.getValue();

  // Value fields come first in the merged object, then the key-only fields.
  assertEquals(
      outputValue.getValue(),
      "{\"valueField1\":\"value1\",\"valueField2\":\"value2\","
          + "\"valueField3\":\"value3\",\"keyField1\":\"key1\",\"keyField2\":\"key2\",\"keyField3\":\"key3\"}");
}

@Test
void testKeyValueBytesJson() throws Exception {
  // BYTES key and value carrying UTF-8 JSON text should be merged as JSON objects.
  String jsonKey = "{\"keyField1\": \"key1\", \"keyField2\": \"key2\", \"keyField3\": \"key3\"}";
  String jsonValue =
      "{\"valueField1\": \"value1\", \"valueField2\": \"value2\", \"valueField3\": \"value3\"}";
  KeyValue<byte[], byte[]> input =
      new KeyValue<>(
          jsonKey.getBytes(StandardCharsets.UTF_8), jsonValue.getBytes(StandardCharsets.UTF_8));

  Schema<KeyValue<byte[], byte[]>> schema =
      Schema.KeyValue(Schema.BYTES, Schema.BYTES, KeyValueEncodingType.SEPARATED);
  Record<GenericObject> inputRecord =
      new Utils.TestRecord<>(
          schema,
          AutoConsumeSchema.wrapPrimitiveObject(input, SchemaType.KEY_VALUE, new byte[] {}),
          null);

  Record<?> outputRecord = Utils.process(inputRecord, new MergeKeyValueStep());
  KeyValue<?, ?> outputValue = (KeyValue<?, ?>) outputRecord.getValue();

  // The merged value is re-encoded as UTF-8 bytes: value fields first, then key-only fields.
  assertEquals(
      new String((byte[]) outputValue.getValue(), StandardCharsets.UTF_8),
      "{\"valueField1\":\"value1\",\"valueField2\":\"value2\",\"valueField3\":\"value3\","
          + "\"keyField1\":\"key1\",\"keyField2\":\"key2\",\"keyField3\":\"key3\"}");
}

@Test
void testPrimitive() throws Exception {
Record<GenericObject> record =
Expand Down

0 comments on commit 109f399

Please sign in to comment.