From 4d4e5ddbf2e41ad2e7add8e180398d8b9b156886 Mon Sep 17 00:00:00 2001 From: tushengxia Date: Thu, 14 Apr 2022 15:29:15 +0800 Subject: [PATCH 1/3] init repo directories --- hbase-tries-index/README.md | 1 + omnidata/README.md | 1 + omnidata/omnidata-hive-connector/README.md | 1 + omnidata/omnidata-hiveudf-loader/README.md | 1 + omnidata/omnidata-openlookeng-connector/README.md | 1 + omnidata/omnidata-spark-connector/README.md | 1 + omnioperator/omniop-openlookeng-extension/README.md | 1 + omnioperator/omniop-spark-extension/README.md | 1 + 8 files changed, 8 insertions(+) create mode 100644 hbase-tries-index/README.md create mode 100644 omnidata/README.md create mode 100644 omnidata/omnidata-hive-connector/README.md create mode 100644 omnidata/omnidata-hiveudf-loader/README.md create mode 100644 omnidata/omnidata-openlookeng-connector/README.md create mode 100644 omnidata/omnidata-spark-connector/README.md create mode 100644 omnioperator/omniop-openlookeng-extension/README.md create mode 100644 omnioperator/omniop-spark-extension/README.md diff --git a/hbase-tries-index/README.md b/hbase-tries-index/README.md new file mode 100644 index 00000000..c972155d --- /dev/null +++ b/hbase-tries-index/README.md @@ -0,0 +1 @@ +# hbase-tries-index \ No newline at end of file diff --git a/omnidata/README.md b/omnidata/README.md new file mode 100644 index 00000000..5c2f9d42 --- /dev/null +++ b/omnidata/README.md @@ -0,0 +1 @@ +# omnidata \ No newline at end of file diff --git a/omnidata/omnidata-hive-connector/README.md b/omnidata/omnidata-hive-connector/README.md new file mode 100644 index 00000000..c5dc6069 --- /dev/null +++ b/omnidata/omnidata-hive-connector/README.md @@ -0,0 +1 @@ +# omnidata-hive-connector \ No newline at end of file diff --git a/omnidata/omnidata-hiveudf-loader/README.md b/omnidata/omnidata-hiveudf-loader/README.md new file mode 100644 index 00000000..ce7e150c --- /dev/null +++ b/omnidata/omnidata-hiveudf-loader/README.md @@ -0,0 +1 @@ +# omnidata-hiveudf-loader \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/README.md b/omnidata/omnidata-openlookeng-connector/README.md new file mode 100644 index 00000000..193fc2b5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/README.md @@ -0,0 +1 @@ +# omnidata-openlookeng-connector \ No newline at end of file diff --git a/omnidata/omnidata-spark-connector/README.md b/omnidata/omnidata-spark-connector/README.md new file mode 100644 index 00000000..adad9fcd --- /dev/null +++ b/omnidata/omnidata-spark-connector/README.md @@ -0,0 +1 @@ +# omnidata-spark-connector \ No newline at end of file diff --git a/omnioperator/omniop-openlookeng-extension/README.md b/omnioperator/omniop-openlookeng-extension/README.md new file mode 100644 index 00000000..5cad1f0b --- /dev/null +++ b/omnioperator/omniop-openlookeng-extension/README.md @@ -0,0 +1 @@ +# omniop-openlookeng-extension \ No newline at end of file diff --git a/omnioperator/omniop-spark-extension/README.md b/omnioperator/omniop-spark-extension/README.md new file mode 100644 index 00000000..047f57aa --- /dev/null +++ b/omnioperator/omniop-spark-extension/README.md @@ -0,0 +1 @@ +# omniop-spark-extension \ No newline at end of file From 552764ebab39adb372f8e226021fb66ba121b99c Mon Sep 17 00:00:00 2001 From: tushengxia Date: Thu, 14 Apr 2022 15:34:29 +0800 Subject: [PATCH 2/3] move hbase repo to new repo --- hbase-tries-index/README.md | 29 +- .../fake-tries-index-pack/pom.xml | 6 + .../boostkit/hbase/index/AllocatedMemory.java | 29 + 
.../index/IllegalMemoryRequestException.java | 14 + .../index/InsufficientMemoryException.java | 14 + .../hbase/index/LoudsTriesService.java | 24 + .../hbase/index/NativeMaxLimitAllocator.java | 29 + .../boostkit/hbase/index/OffheapLruCache.java | 29 + .../hbase/index/OffheapReplaceableCache.java | 8 + .../boostkit/hbase/index/ValueInfo.java | 19 + .../patch/HBase-Louds-Tries-Index.patch | 754 ++++++++++++++++++ 11 files changed, 954 insertions(+), 1 deletion(-) create mode 100644 hbase-tries-index/fake-tries-index-pack/pom.xml create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/AllocatedMemory.java create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/IllegalMemoryRequestException.java create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/InsufficientMemoryException.java create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/LoudsTriesService.java create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/NativeMaxLimitAllocator.java create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapLruCache.java create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapReplaceableCache.java create mode 100644 hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/ValueInfo.java create mode 100644 hbase-tries-index/patch/HBase-Louds-Tries-Index.patch diff --git a/hbase-tries-index/README.md b/hbase-tries-index/README.md index c972155d..0063e019 100644 --- a/hbase-tries-index/README.md +++ b/hbase-tries-index/README.md @@ -1 +1,28 @@ -# hbase-tries-index \ No newline at end of file +# hbase-trie-index + + + +Introduction +============ +The trie index library is used to accelerate HBase Data Block selection. By using Succinct Data Structure instead of ArrayList, trie index costs less memory and performs more efficiently. The trie index library is based on the original APIs of Apache [HBase 2.2.3](https://github.com/apache/hbase/tree/rel/2.2.3). + + + + +Building And Packageing +==================== + +(1) Build the project: + + mvn clean package + + +Contribution Guidelines +======== + +Track the bugs and feature requests via GitHub [issues]. + +More Information +======== + +For further assistance, send an email to kunpengcompute@huawei.com. diff --git a/hbase-tries-index/fake-tries-index-pack/pom.xml b/hbase-tries-index/fake-tries-index-pack/pom.xml new file mode 100644 index 00000000..58763fe6 --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/pom.xml @@ -0,0 +1,6 @@ + + 4.0.0 + com.huawei + fake-tries-index-pack + 2.2.3 + diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/AllocatedMemory.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/AllocatedMemory.java new file mode 100644 index 00000000..7ac4fcb9 --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/AllocatedMemory.java @@ -0,0 +1,29 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+ */ +package com.huawei.boostkit.hbase.index; + +import java.nio.ByteBuffer; + +/** + * Allocated memory by unsafe + * + * @since 2021.09 + */ +public class AllocatedMemory { + public ByteBuffer getBuf() { + return null; + } + + public void retain() { + } + + public void release() { + } + + public int refCnt() { + return 0; + } + + public int size() {return 0;} +} diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/IllegalMemoryRequestException.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/IllegalMemoryRequestException.java new file mode 100644 index 00000000..d6af80ec --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/IllegalMemoryRequestException.java @@ -0,0 +1,14 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. + */ +package com.huawei.boostkit.hbase.index; + +/** + * Illegal Memory Request Exception + * + * @since 2021.09 + */ +public class IllegalMemoryRequestException extends Exception { + public IllegalMemoryRequestException(long requestMemorySize) { + } +} diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/InsufficientMemoryException.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/InsufficientMemoryException.java new file mode 100644 index 00000000..6f862bbd --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/InsufficientMemoryException.java @@ -0,0 +1,14 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. + */ +package com.huawei.boostkit.hbase.index; + +/** + * Insufficient memory exception + * + * @since 2021.09 + */ +public class InsufficientMemoryException extends Exception { + public InsufficientMemoryException(long memoryGap) { + } +} diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/LoudsTriesService.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/LoudsTriesService.java new file mode 100644 index 00000000..0cae40df --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/LoudsTriesService.java @@ -0,0 +1,24 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. + */ +package com.huawei.boostkit.hbase.index; + +import java.io.DataOutput; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; + +/** + * Louds Tries Map service + * + * @since 2021.09 + */ +public class LoudsTriesService { + + public static void build(List keys, long[] offsets, int[] lengths, DataOutput out) throws IOException { + } + + public static ValueInfo get(ByteBuffer buff, byte[] key, int offset, int length) { + return null; + } +} diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/NativeMaxLimitAllocator.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/NativeMaxLimitAllocator.java new file mode 100644 index 00000000..a59df916 --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/NativeMaxLimitAllocator.java @@ -0,0 +1,29 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+ */ +package com.huawei.boostkit.hbase.index; + +/** + * Direct memory allocator with maximum memory + * memory is free manually + * + * @since 2021.09 + */ +public class NativeMaxLimitAllocator { + + + public NativeMaxLimitAllocator(long maxMemorySize) { + } + + public AllocatedMemory allocate(int size) throws InsufficientMemoryException, IllegalMemoryRequestException { + return null; + } + + public long getMaxMemory() { + return 0; + } + + public long getMemoryUsed() { + return 0; + } +} diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapLruCache.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapLruCache.java new file mode 100644 index 00000000..90501554 --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapLruCache.java @@ -0,0 +1,29 @@ +package com.huawei.boostkit.hbase.index; + +public class OffheapLruCache { + public OffheapLruCache(long maxMemorySize, float reserveRatio, long evaluatedBlockSize) { + } + + public void putCache(KEYTYPE cacheKey, OffheapReplaceableCache cacheable) + throws InsufficientMemoryException, IllegalMemoryRequestException { + } + + public OffheapReplaceableCache getCache(KEYTYPE cacheKey) { + return null; + } + + public boolean containsKey(KEYTYPE cacheKey) { + return false; + } + + public void returnCache(OffheapReplaceableCache value) { + } + + public long getMaxMemory() { + return 0; + } + + public void shutdown() { + } + public void activateEvictThread(){} +} diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapReplaceableCache.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapReplaceableCache.java new file mode 100644 index 00000000..f4d41ba7 --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/OffheapReplaceableCache.java @@ -0,0 +1,8 @@ +package com.huawei.boostkit.hbase.index; + +public interface OffheapReplaceableCache { + void retain(); + void release(); + int dataSize(); + OffheapReplaceableCache replace(AllocatedMemory allocate); +} diff --git a/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/ValueInfo.java b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/ValueInfo.java new file mode 100644 index 00000000..f0a0c1ab --- /dev/null +++ b/hbase-tries-index/fake-tries-index-pack/src/main/java/com/huawei/boostkit/hbase/index/ValueInfo.java @@ -0,0 +1,19 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. 
+ */ +package com.huawei.boostkit.hbase.index; + +/** + * Value info + * + * @since 2021.09 + */ +public class ValueInfo { + public final long blockOffset; + public final int blockLength; + + public ValueInfo(long blockOffset, int blockLength) { + this.blockOffset = blockOffset; + this.blockLength = blockLength; + } +} diff --git a/hbase-tries-index/patch/HBase-Louds-Tries-Index.patch b/hbase-tries-index/patch/HBase-Louds-Tries-Index.patch new file mode 100644 index 00000000..40987dc2 --- /dev/null +++ b/hbase-tries-index/patch/HBase-Louds-Tries-Index.patch @@ -0,0 +1,754 @@ +From fb7f4d913bb8764b824c143d46bddd5fdcf1a511 Mon Sep 17 00:00:00 2001 +From: asd +Date: Tue, 16 Nov 2021 15:35:58 +0800 +Subject: [PATCH] patch + +--- + hbase-common/pom.xml | 6 ++ + .../apache/hadoop/hbase/io/hfile/BlockType.java | 3 + + .../apache/hadoop/hbase/nio/SingleByteBuff.java | 2 +- + hbase-server/pom.xml | 6 ++ + .../hadoop/hbase/io/hfile/BlockCacheFactory.java | 6 ++ + .../apache/hadoop/hbase/io/hfile/CacheStats.java | 2 + + .../hadoop/hbase/io/hfile/CombinedBlockCache.java | 8 +- + .../apache/hadoop/hbase/io/hfile/HFileBlock.java | 49 +++++++++++- + .../hadoop/hbase/io/hfile/HFileBlockIndex.java | 52 ++++++++++-- + .../hadoop/hbase/io/hfile/HFileReaderImpl.java | 3 + + .../hadoop/hbase/io/hfile/HFileWriterImpl.java | 4 +- + .../hbase/io/hfile/LoudsTriesLruBlockCache.java | 93 ++++++++++++++++++++++ + .../io/hfile/LoudsTriesLruBlockCacheMBean.java | 13 +++ + .../hadoop/hbase/io/hfile/LruBlockCache.java | 45 +++++++++-- + .../hadoop/hbase/io/hfile/NativeByteBuff.java | 52 ++++++++++++ + .../hadoop/hbase/regionserver/HRegionServer.java | 5 ++ + 16 files changed, 330 insertions(+), 19 deletions(-) + create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCache.java + create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCacheMBean.java + create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/NativeByteBuff.java + +diff --git a/hbase-common/pom.xml b/hbase-common/pom.xml +index c096318..af27cac 100644 +--- a/hbase-common/pom.xml ++++ b/hbase-common/pom.xml +@@ -183,6 +183,12 @@ + + + ++ com.huawei ++ fake-tries-index-pack ++ 2.2.3 ++ provided ++ ++ + org.apache.hbase + hbase-annotations + test-jar +diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java +index 4753813..4fa293e 100644 +--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java ++++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockType.java +@@ -51,6 +51,9 @@ public enum BlockType { + + /** Version 2 leaf index block. Appears in the data block section */ + LEAF_INDEX("IDXLEAF2", BlockCategory.INDEX), ++ ++ /** Tries leaf index block. 
Appears in the data block section */ ++ LEAF_INDEX_TRIES("TRIELEAF", BlockCategory.INDEX), + + /** Bloom filter block, version 2 */ + BLOOM_CHUNK("BLMFBLK2", BlockCategory.BLOOM), +diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/SingleByteBuff.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/SingleByteBuff.java +index 6d64d7b..e3aaa3f 100644 +--- a/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/SingleByteBuff.java ++++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/nio/SingleByteBuff.java +@@ -41,7 +41,7 @@ public class SingleByteBuff extends ByteBuff { + private static final boolean UNSAFE_UNALIGNED = UnsafeAvailChecker.unaligned(); + + // Underlying BB +- private final ByteBuffer buf; ++ protected final ByteBuffer buf; + + // To access primitive values from underlying ByteBuffer using Unsafe + private long unsafeOffset; +diff --git a/hbase-server/pom.xml b/hbase-server/pom.xml +index a660845..fd55fe3 100644 +--- a/hbase-server/pom.xml ++++ b/hbase-server/pom.xml +@@ -314,6 +314,12 @@ + + + ++ com.huawei ++ fake-tries-index-pack ++ 2.2.3 ++ provided ++ ++ + org.apache.hbase.thirdparty + hbase-shaded-protobuf + +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheFactory.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheFactory.java +index 01fb130..21d6a99 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheFactory.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheFactory.java +@@ -21,11 +21,13 @@ import static org.apache.hadoop.hbase.HConstants.BUCKET_CACHE_IOENGINE_KEY; + import static org.apache.hadoop.hbase.HConstants.BUCKET_CACHE_SIZE_KEY; + + import java.io.IOException; ++import java.lang.reflect.InvocationTargetException; + + import org.apache.hadoop.conf.Configuration; + import org.apache.hadoop.hbase.HConstants; + import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache; + import org.apache.hadoop.hbase.io.util.MemorySizeUtil; ++import org.apache.hadoop.hbase.regionserver.HRegionServer; + import org.apache.hadoop.hbase.util.ReflectionUtils; + import org.apache.hadoop.util.StringUtils; + import org.apache.yetus.audience.InterfaceAudience; +@@ -134,6 +136,10 @@ public final class BlockCacheFactory { + LOG.info( + "Allocating onheap LruBlockCache size=" + StringUtils.byteDesc(cacheSize) + ", blockSize=" + + StringUtils.byteDesc(blockSize)); ++ ++ if (c.getBoolean(LruBlockCache.TRIES_USE_OFFHEAP_KEY, LruBlockCache.DEF_TRIES_USE_OFFHEAP)) { ++ return new LoudsTriesLruBlockCache(cacheSize, blockSize, true, c); ++ } + return new LruBlockCache(cacheSize, blockSize, true, c); + } + +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheStats.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheStats.java +index 7c5b563..93f5cd7 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheStats.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CacheStats.java +@@ -161,6 +161,7 @@ public class CacheStats { + dataMissCount.increment(); + break; + case LEAF_INDEX: ++ case LEAF_INDEX_TRIES: + leafIndexMissCount.increment(); + break; + case BLOOM_CHUNK: +@@ -209,6 +210,7 @@ public class CacheStats { + dataHitCount.increment(); + break; + case LEAF_INDEX: ++ case LEAF_INDEX_TRIES: + leafIndexHitCount.increment(); + break; + case BLOOM_CHUNK: +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java +index b7b9c77..bcd7f17 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/CombinedBlockCache.java +@@ -380,8 +380,12 @@ public class CombinedBlockCache implements ResizableBlockCache, HeapSize { + + @Override + public void returnBlock(BlockCacheKey cacheKey, Cacheable block) { +- // returnBlock is meaningful for L2 cache alone. +- this.l2Cache.returnBlock(cacheKey, block); ++ boolean metaBlock = block.getBlockType().getCategory() != BlockCategory.DATA; ++ if (metaBlock) { ++ onHeapCache.returnBlock(cacheKey, block); ++ } else { ++ l2Cache.returnBlock(cacheKey, block); ++ } + } + + @VisibleForTesting +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java +index ebc4564..1406e08 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlock.java +@@ -35,6 +35,10 @@ import org.apache.hadoop.hbase.HConstants; + import org.apache.yetus.audience.InterfaceAudience; + import org.slf4j.Logger; + import org.slf4j.LoggerFactory; ++ ++import com.huawei.boostkit.hbase.index.AllocatedMemory; ++import com.huawei.boostkit.hbase.index.OffheapReplaceableCache; ++ + import org.apache.hadoop.hbase.fs.HFileSystem; + import org.apache.hadoop.hbase.io.ByteArrayOutputStream; + import org.apache.hadoop.hbase.io.ByteBuffInputStream; +@@ -111,7 +115,7 @@ import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; + * IOEngine is in the bucket subpackage. Pull it up? Then this class knows about bucketcache. Ugh. + */ + @InterfaceAudience.Private +-public class HFileBlock implements Cacheable { ++public class HFileBlock implements Cacheable, OffheapReplaceableCache { + private static final Logger LOG = LoggerFactory.getLogger(HFileBlock.class); + + // Block Header fields. +@@ -179,7 +183,7 @@ public class HFileBlock implements Cacheable { + * So, we have this ByteBuff type. Unfortunately, it is spread all about HFileBlock. Would be + * good if could be confined to cache-use only but hard-to-do. + */ +- private ByteBuff buf; ++ protected ByteBuff buf; + + /** Meta data that holds meta information on the hfileblock. + */ +@@ -400,6 +404,20 @@ public class HFileBlock implements Cacheable { + this.buf = buf; + this.buf.rewind(); + } ++ ++ public HFileBlock (HFileBlock original, ByteBuff buff, MemoryType memType) { ++ this.blockType = original.blockType; ++ this.buf = buff; ++ buff.put(0, original.buf, 0, original.buf.limit()); ++ this.onDiskDataSizeWithHeader = original.onDiskDataSizeWithHeader; ++ this.uncompressedSizeWithoutHeader = original.uncompressedSizeWithoutHeader; ++ this.prevBlockOffset = original.prevBlockOffset; ++ this.onDiskSizeWithoutHeader = original.onDiskSizeWithoutHeader; ++ this.fileContext = original.fileContext; ++ this.offset = original.offset; ++ this.memType = memType; ++ this.nextBlockOnDiskSize = original.nextBlockOnDiskSize; ++ } + + /** + * Called from constructors. 
+@@ -2121,4 +2139,31 @@ public class HFileBlock implements Cacheable { + public HFileBlock deepClone() { + return new HFileBlock(this, true); + } ++ ++ @Override ++ public void retain() { ++ if (buf instanceof NativeByteBuff) { ++ ((NativeByteBuff) buf).retain(); ++ } ++ } ++ ++ @Override ++ public void release() { ++ if (buf instanceof NativeByteBuff) { ++ ((NativeByteBuff) buf).release(); ++ } ++ } ++ ++ @Override ++ public int dataSize() { ++ return buf.limit(); ++ } ++ ++ @Override ++ public OffheapReplaceableCache replace(AllocatedMemory allocate) { ++ ByteBuff newBuff = new NativeByteBuff(allocate) ++ .position(allocate.size() - dataSize()).slice(); ++ newBuff.put(0, buf, 0, dataSize()); ++ return new HFileBlock(this, newBuff, MemoryType.SHARED); ++ } + } +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java +index 90d11ac..0452d24 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileBlockIndex.java +@@ -47,6 +47,7 @@ import org.apache.hadoop.hbase.io.HeapSize; + import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; + import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader; + import org.apache.hadoop.hbase.nio.ByteBuff; ++import org.apache.hadoop.hbase.regionserver.HRegionServer; + import org.apache.hadoop.hbase.regionserver.KeyValueScanner; + import org.apache.hadoop.hbase.util.Bytes; + import org.apache.hadoop.hbase.util.ClassSize; +@@ -54,6 +55,8 @@ import org.apache.hadoop.hbase.util.ObjectIntPair; + import org.apache.hadoop.io.WritableUtils; + import org.apache.hadoop.util.StringUtils; + ++import com.huawei.boostkit.hbase.index.*; ++ + /** + * Provides functionality to write ({@link BlockIndexWriter}) and read + * BlockIndexReader +@@ -362,6 +365,20 @@ public class HFileBlockIndex { + // Locate the entry corresponding to the given key in the non-root + // (leaf or intermediate-level) index block. + ByteBuff buffer = block.getBufferWithoutHeader(); ++ if (block.getBlockType() == BlockType.LEAF_INDEX_TRIES) { ++ ValueInfo blockInfo = LoudsTriesService.get(buffer.asSubByteBuffer(block.getUncompressedSizeWithoutHeader()), key.getRowArray(), ++ key.getRowOffset(), key.getRowLength()); ++ if (blockInfo != null) { ++ currentOffset = blockInfo.blockOffset; ++ currentOnDiskSize = blockInfo.blockLength; ++ } else { ++ // This has to be changed ++ // For now change this to key value ++ throw new IOException("The key " + CellUtil.getCellKeyAsString(key) + " is before the" ++ + " first key of the non-root index block " + block); ++ } ++ continue; ++ } + index = locateNonRootIndexEntry(buffer, key, comparator); + if (index == -1) { + // This has to be changed +@@ -374,7 +391,8 @@ public class HFileBlockIndex { + currentOffset = buffer.getLong(); + currentOnDiskSize = buffer.getInt(); + +- // Only update next indexed key if there is a next indexed key in the current level ++ // Only update next indexed key if there is a next indexed key in ++ // the current level + byte[] nonRootIndexedKey = getNonRootIndexedKey(buffer, index + 1); + if (nonRootIndexedKey != null) { + tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length); +@@ -1307,7 +1325,13 @@ public class HFileBlockIndex { + + // Write the inline block index to the output stream in the non-root + // index block format. 
+- curInlineChunk.writeNonRoot(out); ++ if (HRegionServer.WRITE_TRIES_INDEX) { ++ LoudsTriesService.build(curInlineChunk.getBlockKeyList(), ++ curInlineChunk.getBlockOffsetList().stream().mapToLong(l -> l).toArray(), ++ curInlineChunk.getOnDiskDataSizeList().stream().mapToInt(i -> i).toArray(), out); ++ } else { ++ curInlineChunk.writeNonRoot(out); ++ } + + // Save the first key of the inline block so that we can add it to the + // parent-level index. +@@ -1350,7 +1374,7 @@ public class HFileBlockIndex { + + @Override + public BlockType getInlineBlockType() { +- return BlockType.LEAF_INDEX; ++ return HRegionServer.WRITE_TRIES_INDEX ? BlockType.LEAF_INDEX_TRIES : BlockType.LEAF_INDEX; + } + + /** +@@ -1554,8 +1578,13 @@ public class HFileBlockIndex { + long midKeySubEntry = (totalNumSubEntries - 1) / 2; + int midKeyEntry = getEntryBySubEntry(midKeySubEntry); + +- baosDos.writeLong(blockOffsets.get(midKeyEntry)); +- baosDos.writeInt(onDiskDataSizes.get(midKeyEntry)); ++ if (HRegionServer.WRITE_TRIES_INDEX) { ++ baosDos.writeLong((long) -1); ++ baosDos.writeInt((int) -1); ++ } else { ++ baosDos.writeLong(blockOffsets.get(midKeyEntry)); ++ baosDos.writeInt(onDiskDataSizes.get(midKeyEntry)); ++ } + + long numSubEntriesBefore = midKeyEntry > 0 + ? numSubEntriesAt.get(midKeyEntry - 1) : 0; +@@ -1672,6 +1701,19 @@ public class HFileBlockIndex { + public int getOnDiskDataSize(int i) { + return onDiskDataSizes.get(i); + } ++ ++ public List getBlockKeyList() { ++ return blockKeys; ++ } ++ ++ public List getBlockOffsetList() { ++ return blockOffsets; ++ } ++ ++ public List getOnDiskDataSizeList() { ++ return onDiskDataSizes; ++ } ++ + + public long getCumulativeNumKV(int i) { + if (i < 0) +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java +index d1b3a89..8ee4c95 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileReaderImpl.java +@@ -1577,6 +1577,9 @@ public class HFileReaderImpl implements HFile.Reader, Configurable { + // verification. 
+ return; + } ++ if (actualBlockType == expectedBlockType) return; ++ if (expectedBlockType == BlockType.LEAF_INDEX) ++ expectedBlockType = BlockType.LEAF_INDEX_TRIES; + if (actualBlockType != expectedBlockType) { + throw new IOException("Expected block type " + expectedBlockType + ", " + + "but got " + actualBlockType + ": " + block + ", path=" + path); +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java +index fa5f1f1..d8e801c 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java +@@ -47,6 +47,7 @@ import org.apache.hadoop.hbase.io.crypto.Encryption; + import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; + import org.apache.hadoop.hbase.io.hfile.HFile.FileInfo; + import org.apache.hadoop.hbase.io.hfile.HFileBlock.BlockWritable; ++import org.apache.hadoop.hbase.regionserver.HRegionServer; + import org.apache.hadoop.hbase.security.EncryptionUtil; + import org.apache.hadoop.hbase.security.User; + import org.apache.hadoop.hbase.util.BloomFilterWriter; +@@ -241,7 +242,8 @@ public class HFileWriterImpl implements HFile.Writer { + throw new IOException("Key cannot be null or empty"); + } + if (lastCell != null) { +- int keyComp = PrivateCellUtil.compareKeyIgnoresMvcc(comparator, lastCell, cell); ++ int keyComp = HRegionServer.WRITE_TRIES_INDEX ? comparator.compareRows(lastCell, cell) : ++ PrivateCellUtil.compareKeyIgnoresMvcc(comparator, lastCell, cell); + + if (keyComp > 0) { + throw new IOException("Added a key not lexically larger than" +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCache.java +new file mode 100644 +index 0000000..e4cba9f +--- /dev/null ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCache.java +@@ -0,0 +1,93 @@ ++package org.apache.hadoop.hbase.io.hfile; ++ ++ ++import java.util.ArrayList; ++import java.util.HashSet; ++ ++import org.apache.hadoop.conf.Configuration; ++import org.apache.hadoop.hbase.io.hfile.Cacheable.MemoryType; ++import org.apache.hadoop.hbase.nio.ByteBuff; ++import org.apache.yetus.audience.InterfaceAudience; ++import org.slf4j.Logger; ++import org.slf4j.LoggerFactory; ++ ++import com.huawei.boostkit.hbase.index.IllegalMemoryRequestException; ++import com.huawei.boostkit.hbase.index.InsufficientMemoryException; ++import com.huawei.boostkit.hbase.index.OffheapLruCache; ++import com.huawei.boostkit.hbase.index.OffheapReplaceableCache; ++ ++@InterfaceAudience.Private ++public class LoudsTriesLruBlockCache extends LruBlockCache { ++ private static final Logger LOG = LoggerFactory.getLogger(LoudsTriesLruBlockCache.class); ++ private static final String SURF_MEMORY_SIZE_KEY = "hbase.tries.offheap.cache.size"; ++ private static final String SURF_RESERVE_RATIO_KEY = "hbase.tries.cache.reserve.ratio"; ++ private static final long DEFAULT_SURF_MEMORY_SIZE_KEY = 1024 * 1024 * 1024; ++ private static final float DEFAULT_SURF_RESERVE_RATIO = 0.05f; ++ static { ++ String arch = System.getProperty("os.arch", ""); ++ if (!arch.equals("aarch64")) { ++ throw new UnsupportedOperationException("os arch type required aarch64 but actually " + arch); ++ } ++ } ++ ++ private final OffheapLruCache offHeapCache; ++ ++ public 
LoudsTriesLruBlockCache(long maxSize, long blockSize, boolean evictionThread, Configuration conf) { ++ this(maxSize, blockSize, evictionThread, conf, conf.getLong(SURF_MEMORY_SIZE_KEY, DEFAULT_SURF_MEMORY_SIZE_KEY), ++ conf.getFloat(SURF_RESERVE_RATIO_KEY, DEFAULT_SURF_RESERVE_RATIO)); ++ } ++ ++ public LoudsTriesLruBlockCache(long maxSize, long blockSize, boolean evictionThread, Configuration conf, ++ long maxMemorySize, float reserveRatio) { ++ super(maxSize, blockSize, evictionThread, conf); ++ offHeapCache = new OffheapLruCache<>(maxMemorySize, reserveRatio, blockSize); ++ } ++ ++ @Override ++ public void cacheBlock(BlockCacheKey cacheKey, Cacheable buf, boolean inMemory) { ++ if (!(buf instanceof HFileBlock) || buf.getBlockType() != BlockType.LEAF_INDEX_TRIES) { ++ super.cacheBlock(cacheKey, buf, inMemory); ++ return; ++ } ++ if (!BlockCacheUtil.shouldReplaceExistingCacheBlock(this, cacheKey, buf)) { ++ return; ++ } ++ try { ++ offHeapCache.putCache(cacheKey, (HFileBlock)buf); ++ } catch (InsufficientMemoryException e) { ++ getStats().failInsert(); ++ offHeapCache.activateEvictThread(); ++ } catch (IllegalMemoryRequestException e) { ++ if (getStats().failInsert() % 50 == 0) { ++ LOG.warn("Trying to cache too large a block " + cacheKey.getHfileName() + " @ " + cacheKey.getOffset() + " is " ++ + buf.heapSize() + " which is larger than " + offHeapCache.getMaxMemory()); ++ } ++ return; ++ } ++ } ++ ++ public boolean containsBlock(BlockCacheKey cacheKey) { ++ return offHeapCache.containsKey(cacheKey) || super.containsBlock(cacheKey); ++ } ++ ++ public Cacheable getBlock(BlockCacheKey cacheKey, boolean caching, boolean repeat, ++ boolean updateCacheMetrics) { ++ if (offHeapCache.containsKey(cacheKey)) { ++ return (Cacheable) offHeapCache.getCache(cacheKey); ++ } ++ return super.getBlock(cacheKey, caching, repeat, updateCacheMetrics); ++ } ++ ++ @Override ++ public void shutdown() { ++ offHeapCache.shutdown(); ++ super.shutdown(); ++ } ++ ++ @Override ++ public void returnBlock(BlockCacheKey cacheKey, Cacheable cache) { ++ if (cache instanceof OffheapReplaceableCache) { ++ offHeapCache.returnCache((OffheapReplaceableCache) cache); ++ } ++ } ++} +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCacheMBean.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCacheMBean.java +new file mode 100644 +index 0000000..5281f2c +--- /dev/null ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LoudsTriesLruBlockCacheMBean.java +@@ -0,0 +1,13 @@ ++package org.apache.hadoop.hbase.io.hfile; ++ ++import java.util.ArrayList; ++import java.util.HashSet; ++ ++import org.apache.yetus.audience.InterfaceAudience; ++ ++@InterfaceAudience.Private ++public interface LoudsTriesLruBlockCacheMBean { ++ int[] getLeakRefCnts(); ++ ArrayList[] getLeakStack(); ++ HashSet getRetainStacks(); ++} +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LruBlockCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LruBlockCache.java +index f8b724c..3b189ff 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LruBlockCache.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/LruBlockCache.java +@@ -35,10 +35,12 @@ import java.util.concurrent.locks.ReentrantLock; + import org.apache.hadoop.conf.Configuration; + import org.apache.hadoop.hbase.io.HeapSize; + import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; ++import 
org.apache.hadoop.hbase.regionserver.HRegionServer; + import org.apache.hadoop.hbase.util.Bytes; + import org.apache.hadoop.hbase.util.ClassSize; + import org.apache.hadoop.hbase.util.HasThread; + import org.apache.hadoop.util.StringUtils; ++import org.apache.hadoop.yarn.webapp.hamlet.Hamlet.HR; + import org.apache.yetus.audience.InterfaceAudience; + import org.slf4j.Logger; + import org.slf4j.LoggerFactory; +@@ -126,6 +128,8 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + */ + private static final String LRU_IN_MEMORY_FORCE_MODE_CONFIG_NAME = + "hbase.lru.rs.inmemoryforcemode"; ++ ++ static final String TRIES_USE_OFFHEAP_KEY = "hbase.tries.use-offheap"; + + /* Default Configuration Parameters*/ + +@@ -146,13 +150,15 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + + private static final boolean DEFAULT_IN_MEMORY_FORCE_MODE = false; + ++ static final boolean DEF_TRIES_USE_OFFHEAP = false; ++ + /* Statistics thread */ + private static final int STAT_THREAD_PERIOD = 60 * 5; + private static final String LRU_MAX_BLOCK_SIZE = "hbase.lru.max.block.size"; + private static final long DEFAULT_MAX_BLOCK_SIZE = 16L * 1024L * 1024L; + + /** Concurrent map (the cache) */ +- private transient final Map map; ++ protected transient final Map map; + + /** Eviction lock (locked when eviction in process) */ + private transient final ReentrantLock evictionLock = new ReentrantLock(true); +@@ -177,13 +183,13 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + private final LongAdder dataBlockSize; + + /** Current number of cached elements */ +- private final AtomicLong elements; ++ protected final AtomicLong elements; + + /** Current number of cached data block elements */ + private final LongAdder dataBlockElements; + + /** Cache access count (sequential ID) */ +- private final AtomicLong count; ++ protected final AtomicLong count; + + /** hard capacity limit */ + private float hardCapacityLimitFactor; +@@ -218,6 +224,8 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + /** Whether in-memory hfile's data block has higher priority when evicting */ + private boolean forceInMemory; + ++ private final boolean triesUseOffheap; ++ + /** + * Where to send victims (blocks evicted/missing from the cache). This is used only when we use an + * external cache as L2. 
+@@ -252,7 +260,8 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + DEFAULT_MEMORY_FACTOR, + DEFAULT_HARD_CAPACITY_LIMIT_FACTOR, + false, +- DEFAULT_MAX_BLOCK_SIZE ++ DEFAULT_MAX_BLOCK_SIZE, ++ DEF_TRIES_USE_OFFHEAP + ); + } + +@@ -269,7 +278,8 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + conf.getFloat(LRU_HARD_CAPACITY_LIMIT_FACTOR_CONFIG_NAME, + DEFAULT_HARD_CAPACITY_LIMIT_FACTOR), + conf.getBoolean(LRU_IN_MEMORY_FORCE_MODE_CONFIG_NAME, DEFAULT_IN_MEMORY_FORCE_MODE), +- conf.getLong(LRU_MAX_BLOCK_SIZE, DEFAULT_MAX_BLOCK_SIZE) ++ conf.getLong(LRU_MAX_BLOCK_SIZE, DEFAULT_MAX_BLOCK_SIZE), ++ conf.getBoolean(TRIES_USE_OFFHEAP_KEY, DEF_TRIES_USE_OFFHEAP) + ); + } + +@@ -277,6 +287,16 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + this(maxSize, blockSize, true, conf); + } + ++ public LruBlockCache(long maxSize, long blockSize, boolean evictionThread, ++ int mapInitialSize, float mapLoadFactor, int mapConcurrencyLevel, ++ float minFactor, float acceptableFactor, float singleFactor, ++ float multiFactor, float memoryFactor, float hardLimitFactor, ++ boolean forceInMemory, long maxBlockSize) { ++ this(maxSize, blockSize, evictionThread, mapInitialSize, mapLoadFactor, mapConcurrencyLevel, minFactor, ++ acceptableFactor, singleFactor, multiFactor, memoryFactor, hardLimitFactor, forceInMemory, maxBlockSize, ++ DEF_TRIES_USE_OFFHEAP); ++ } ++ + /** + * Configurable constructor. Use this constructor if not using defaults. + * +@@ -296,7 +316,7 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + int mapInitialSize, float mapLoadFactor, int mapConcurrencyLevel, + float minFactor, float acceptableFactor, float singleFactor, + float multiFactor, float memoryFactor, float hardLimitFactor, +- boolean forceInMemory, long maxBlockSize) { ++ boolean forceInMemory, long maxBlockSize, boolean triesUseOffheap) { + this.maxBlockSize = maxBlockSize; + if(singleFactor + multiFactor + memoryFactor != 1 || + singleFactor < 0 || multiFactor < 0 || memoryFactor < 0) { +@@ -336,6 +356,7 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + // every five minutes. + this.scheduleThreadPool.scheduleAtFixedRate(new StatisticsThread(this), STAT_THREAD_PERIOD, + STAT_THREAD_PERIOD, TimeUnit.SECONDS); ++ this.triesUseOffheap = triesUseOffheap; + } + + @Override +@@ -455,7 +476,7 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + if (bt != null && bt.isData()) { + dataBlockSize.add(heapsize); + } +- return size.addAndGet(heapsize); ++ return bt == BlockType.LEAF_INDEX_TRIES && triesUseOffheap ? 
size.get() : size.addAndGet(heapsize); + } + + /** +@@ -473,7 +494,10 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + @Override + public Cacheable getBlock(BlockCacheKey cacheKey, boolean caching, boolean repeat, + boolean updateCacheMetrics) { +- LruCachedBlock cb = map.get(cacheKey); ++ LruCachedBlock cb = map.computeIfPresent(cacheKey, (key, val) -> { ++ handleExistCacheBlock(val); ++ return val; ++ }); + if (cb == null) { + if (!repeat && updateCacheMetrics) { + stats.miss(caching, cacheKey.isPrimary(), cacheKey.getBlockType()); +@@ -500,6 +524,9 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + return cb.getBuffer(); + } + ++ protected void handleExistCacheBlock(LruCachedBlock val) { ++ } ++ + /** + * Whether the cache contains block with specified cacheKey + * +@@ -622,6 +649,8 @@ public class LruBlockCache implements ResizableBlockCache, HeapSize { + + // Scan entire map putting into appropriate buckets + for (LruCachedBlock cachedBlock : map.values()) { ++ if (cachedBlock.getBuffer().getBlockType() == BlockType.LEAF_INDEX_TRIES && triesUseOffheap) ++ continue; + switch (cachedBlock.getPriority()) { + case SINGLE: { + bucketSingle.add(cachedBlock); +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/NativeByteBuff.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/NativeByteBuff.java +new file mode 100644 +index 0000000..9912895 +--- /dev/null ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/NativeByteBuff.java +@@ -0,0 +1,52 @@ ++package org.apache.hadoop.hbase.io.hfile; ++ ++import static org.apache.hadoop.hbase.util.UnsafeAccess.theUnsafe; ++ ++import java.lang.reflect.Constructor; ++import java.lang.reflect.InvocationTargetException; ++import java.nio.ByteBuffer; ++import java.nio.ByteOrder; ++ ++import org.apache.hadoop.hbase.nio.SingleByteBuff; ++import org.apache.hadoop.hbase.util.Pair; ++import org.apache.hbase.thirdparty.io.netty.util.AbstractReferenceCounted; ++import org.apache.hbase.thirdparty.io.netty.util.ReferenceCounted; ++import org.apache.yetus.audience.InterfaceAudience; ++ ++import com.huawei.boostkit.hbase.index.AllocatedMemory; ++ ++@InterfaceAudience.Private ++public class NativeByteBuff extends SingleByteBuff { ++ private AllocatedMemory memory; ++ ++ private NativeByteBuff(AllocatedMemory memory, ByteBuffer buffer) { ++ super(buffer); ++ this.memory = memory; ++ } ++ ++ public NativeByteBuff(AllocatedMemory memory) { ++ this(memory, memory.getBuf()); ++ } ++ ++ @Override ++ public NativeByteBuff slice() { ++ return new NativeByteBuff(memory, this.buf.slice()); ++ } ++ ++ @Override ++ public NativeByteBuff duplicate() { ++ return new NativeByteBuff(memory, this.buf.duplicate()); ++ } ++ ++ public void retain() { ++ memory.retain(); ++ } ++ ++ public void release() { ++ memory.release(); ++ } ++ ++ public int refCnt() { ++ return memory.refCnt(); ++ } ++} +diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +index d88aeef..233bb11 100644 +--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java ++++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +@@ -3077,6 +3077,10 @@ public class HRegionServer extends HasThread implements + } + } + ++ public static final String WRITE_TRIES_KEY = "hbase.write.tries"; ++ public static final boolean DEF_WRITE_TRIES = false; ++ 
public static boolean WRITE_TRIES_INDEX = DEF_WRITE_TRIES; ++ + /** + * @see org.apache.hadoop.hbase.regionserver.HRegionServerCommandLine + */ +@@ -3084,6 +3088,7 @@ public class HRegionServer extends HasThread implements + LOG.info("STARTING executorService " + HRegionServer.class.getSimpleName()); + VersionInfo.logVersion(); + Configuration conf = HBaseConfiguration.create(); ++ WRITE_TRIES_INDEX = conf.getBoolean(WRITE_TRIES_KEY, DEF_WRITE_TRIES); + @SuppressWarnings("unchecked") + Class regionServerClass = (Class) conf + .getClass(HConstants.REGION_SERVER_IMPL, HRegionServer.class); +-- +1.8.3.1 + From 1cde12786875ba3dcfd84fd1735ff02ee7434cac Mon Sep 17 00:00:00 2001 From: tushengxia Date: Thu, 14 Apr 2022 19:48:49 +0800 Subject: [PATCH 3/3] spark v1.0.0 and openLooKeng v1.0.0 tag --- .gitignore | 5 + .../omnidata-openlookeng-connector/.gitignore | 5 + .../omnidata-openlookeng-connector/README.md | 32 +- .../omnidata-openlookeng-connector/build.sh | 9 + .../connector/pom.xml | 791 ++ .../hive/BackgroundHiveSplitLoader.java | 909 +++ .../plugin/hive/BaseStorageFormat.java | 24 + .../plugin/hive/CachingDirectoryLister.java | 172 + .../prestosql/plugin/hive/CoercionPolicy.java | 19 + .../plugin/hive/ConcurrentLazyQueue.java | 42 + .../ConnectorObjectNameGeneratorModule.java | 117 + .../hive/CreateEmptyPartitionProcedure.java | 142 + .../plugin/hive/DeleteDeltaLocations.java | 128 + .../plugin/hive/DirectoryLister.java | 28 + .../hive/DynamicConfigurationProvider.java | 34 + .../hive/FileFormatDataSourceStats.java | 96 + .../plugin/hive/ForCachingHiveMetastore.java | 31 + .../ForCachingHiveMetastoreTableRefresh.java | 32 + .../io/prestosql/plugin/hive/ForHdfs.java | 31 + .../io/prestosql/plugin/hive/ForHive.java | 29 + .../plugin/hive/ForHiveMetastore.java | 31 + .../hive/ForHiveTransactionHeartbeats.java | 29 + .../prestosql/plugin/hive/ForHiveVacuum.java | 29 + .../hive/ForRecordingHiveMetastore.java | 31 + .../plugin/hive/GenericHiveRecordCursor.java | 518 ++ .../hive/GenericHiveRecordCursorProvider.java | 90 + .../plugin/hive/HdfsConfiguration.java | 24 + .../hive/HdfsConfigurationInitializer.java | 199 + .../plugin/hive/HdfsEnvironment.java | 164 + .../plugin/hive/HiveACIDWriteType.java | 53 + .../plugin/hive/HiveAnalyzeProperties.java | 88 + .../plugin/hive/HiveBasicStatistics.java | 119 + .../plugin/hive/HiveBooleanParser.java | 61 + .../hive/HiveBucketAdapterRecordCursor.java | 193 + .../plugin/hive/HiveBucketFunction.java | 84 + .../plugin/hive/HiveBucketHandle.java | 106 + .../plugin/hive/HiveBucketProperty.java | 131 + .../prestosql/plugin/hive/HiveBucketing.java | 325 + .../plugin/hive/HiveCatalogName.java | 32 + .../plugin/hive/HiveCoercionPolicy.java | 126 + .../plugin/hive/HiveCoercionRecordCursor.java | 696 ++ .../plugin/hive/HiveColumnHandle.java | 267 + .../plugin/hive/HiveCompressionCodec.java | 61 + .../io/prestosql/plugin/hive/HiveConfig.java | 2099 ++++++ .../prestosql/plugin/hive/HiveConnector.java | 240 + .../plugin/hive/HiveConnectorFactory.java | 178 + .../plugin/hive/HiveDecimalParser.java | 37 + .../hive/HiveDeleteAsInsertTableHandle.java | 51 + .../prestosql/plugin/hive/HiveErrorCode.java | 83 + .../plugin/hive/HiveEventClient.java | 40 + .../prestosql/plugin/hive/HiveFileWriter.java | 55 + .../plugin/hive/HiveFileWriterFactory.java | 37 + .../plugin/hive/HiveHandleResolver.java | 90 + .../plugin/hive/HiveHdfsConfiguration.java | 69 + .../prestosql/plugin/hive/HiveInputInfo.java | 48 + .../plugin/hive/HiveInsertTableHandle.java | 51 + 
.../plugin/hive/HiveLocationService.java | 149 + .../prestosql/plugin/hive/HiveMetadata.java | 3236 ++++++++ .../plugin/hive/HiveMetadataFactory.java | 216 + .../plugin/hive/HiveMetastoreClosure.java | 111 + .../io/prestosql/plugin/hive/HiveModule.java | 230 + .../hive/HiveNodePartitioningProvider.java | 73 + .../plugin/hive/HiveNotReadableException.java | 53 + .../plugin/hive/HiveOffloadExpression.java | 175 + .../plugin/hive/HiveOutputTableHandle.java | 84 + .../prestosql/plugin/hive/HivePageSink.java | 1076 +++ .../plugin/hive/HivePageSinkProvider.java | 214 + .../prestosql/plugin/hive/HivePageSource.java | 522 ++ .../plugin/hive/HivePageSourceFactory.java | 90 + .../plugin/hive/HivePageSourceProvider.java | 1028 +++ .../prestosql/plugin/hive/HivePartition.java | 88 + .../plugin/hive/HivePartitionHandle.java | 62 + .../plugin/hive/HivePartitionKey.java | 91 + .../plugin/hive/HivePartitionManager.java | 399 + .../plugin/hive/HivePartitionMetadata.java | 56 + .../plugin/hive/HivePartitionResult.java | 95 + .../plugin/hive/HivePartitioningHandle.java | 116 + .../io/prestosql/plugin/hive/HivePlugin.java | 73 + .../plugin/hive/HiveProcedureModule.java | 34 + .../hive/HivePushDownRecordPageSource.java | 109 + .../plugin/hive/HiveReadOnlyException.java | 53 + .../plugin/hive/HiveRecordCursor.java | 255 + .../plugin/hive/HiveRecordCursorProvider.java | 43 + .../plugin/hive/HiveSchemaProperties.java | 42 + .../hive/HiveSelectivePageSourceFactory.java | 51 + .../plugin/hive/HiveSessionProperties.java | 731 ++ .../io/prestosql/plugin/hive/HiveSplit.java | 353 + .../plugin/hive/HiveSplitLoader.java | 21 + .../plugin/hive/HiveSplitManager.java | 496 ++ .../plugin/hive/HiveSplitSource.java | 828 ++ .../plugin/hive/HiveSplitWrapper.java | 150 + .../plugin/hive/HiveStorageFormat.java | 173 + .../plugin/hive/HiveTableHandle.java | 472 ++ .../plugin/hive/HiveTableProperties.java | 307 + .../plugin/hive/HiveTransactionHandle.java | 71 + .../plugin/hive/HiveTransactionManager.java | 43 + .../io/prestosql/plugin/hive/HiveType.java | 291 + .../prestosql/plugin/hive/HiveTypeName.java | 68 + .../plugin/hive/HiveTypeTranslator.java | 150 + .../plugin/hive/HiveUpdateTableHandle.java | 51 + .../io/prestosql/plugin/hive/HiveUtil.java | 1215 +++ .../plugin/hive/HiveVacuumSplitSource.java | 272 + .../plugin/hive/HiveVacuumTableHandle.java | 241 + .../hive/HiveViewNotSupportedException.java | 41 + .../plugin/hive/HiveWritableTableHandle.java | 126 + .../prestosql/plugin/hive/HiveWriteUtils.java | 738 ++ .../io/prestosql/plugin/hive/HiveWriter.java | 161 + .../plugin/hive/HiveWriterFactory.java | 1013 +++ .../plugin/hive/HiveWriterStats.java | 35 + .../plugin/hive/HiveWrittenPartitions.java | 41 + .../plugin/hive/InternalHiveSplit.java | 305 + .../plugin/hive/IonSqlQueryBuilder.java | 267 + .../prestosql/plugin/hive/LocationHandle.java | 159 + .../plugin/hive/LocationService.java | 82 + .../prestosql/plugin/hive/NamenodeStats.java | 83 + .../io/prestosql/plugin/hive/NodeVersion.java | 32 + .../prestosql/plugin/hive/OrcFileWriter.java | 464 ++ .../plugin/hive/OrcFileWriterConfig.java | 113 + .../plugin/hive/OrcFileWriterFactory.java | 260 + .../plugin/hive/ParquetFileWriterConfig.java | 50 + .../hive/PartitionNotFoundException.java | 61 + .../hive/PartitionOfflineException.java | 59 + .../plugin/hive/PartitionStatistics.java | 120 + .../plugin/hive/PartitionUpdate.java | 221 + .../plugin/hive/RcFileFileWriter.java | 188 + .../plugin/hive/RcFileFileWriterFactory.java | 165 + .../plugin/hive/RecordFileWriter.java | 223 + 
.../hive/RoleAlreadyExistsException.java | 28 + .../plugin/hive/S3SelectCsvRecordReader.java | 112 + .../plugin/hive/S3SelectLineRecordReader.java | 227 + .../plugin/hive/S3SelectPushdown.java | 153 + .../plugin/hive/S3SelectRecordCursor.java | 231 + .../hive/S3SelectRecordCursorProvider.java | 95 + .../plugin/hive/SnapshotTempFileWriter.java | 103 + .../plugin/hive/SortingFileWriter.java | 332 + .../hive/SyncPartitionMetadataProcedure.java | 254 + .../plugin/hive/TableOfflineException.java | 51 + .../plugin/hive/TransactionalMetadata.java | 24 + .../prestosql/plugin/hive/TypeTranslator.java | 22 + .../prestosql/plugin/hive/VacuumCleaner.java | 208 + .../hive/VacuumEligibleTableCollector.java | 306 + .../hive/VacuumTableInfoForCleaner.java | 60 + .../hive/ViewAlreadyExistsException.java | 42 + .../plugin/hive/WriteCompletedEvent.java | 172 + .../io/prestosql/plugin/hive/WriteIdInfo.java | 90 + .../authentication/AuthenticationModules.java | 145 + .../CachingKerberosHadoopAuthentication.java | 73 + .../DirectHdfsAuthentication.java | 39 + .../GenericExceptionAction.java | 20 + .../authentication/HadoopAuthentication.java | 21 + .../authentication/HdfsAuthentication.java | 28 + .../authentication/HdfsKerberosConfig.java | 53 + .../HiveAuthenticationModule.java | 76 + .../hive/authentication/HiveIdentity.java | 85 + .../HiveMetastoreAuthentication.java | 21 + .../ImpersonatingHdfsAuthentication.java | 45 + .../KerberosAuthentication.java | 112 + .../KerberosHadoopAuthentication.java | 57 + .../KerberosHiveMetastoreAuthentication.java | 83 + .../authentication/KerberosTicketUtils.java | 75 + .../MetastoreKerberosConfig.java | 83 + .../authentication/NoHdfsAuthentication.java | 25 + .../NoHiveMetastoreAuthentication.java | 26 + .../SimpleHadoopAuthentication.java | 34 + .../UserGroupInformationUtils.java | 64 + .../plugin/hive/avro/PrestoAvroSerDe.java | 41 + .../hive/coercions/DecimalCoercers.java | 322 + .../hive/coercions/DoubleToFloatCoercer.java | 39 + .../hive/coercions/FloatToDoubleCoercer.java | 39 + .../plugin/hive/coercions/HiveCoercer.java | 246 + .../IntegerNumberToVarcharCoercer.java | 37 + .../IntegerNumberUpscaleCoercer.java | 34 + .../plugin/hive/coercions/TypeCoercer.java | 53 + .../VarcharToIntegerNumberCoercer.java | 77 + .../coercions/VarcharToVarcharCoercer.java | 39 + .../hive/gcs/GcsAccessTokenProvider.java | 49 + .../hive/gcs/GcsConfigurationProvider.java | 43 + .../GoogleGcsConfigurationInitializer.java | 56 + .../plugin/hive/gcs/HiveGcsConfig.java | 49 + .../plugin/hive/gcs/HiveGcsModule.java | 40 + .../hive/metastore/BooleanStatistics.java | 82 + .../hive/metastore/CachingHiveMetastore.java | 1353 ++++ .../plugin/hive/metastore/Column.java | 94 + .../plugin/hive/metastore/Database.java | 218 + .../plugin/hive/metastore/DateStatistics.java | 83 + .../hive/metastore/DecimalStatistics.java | 83 + .../hive/metastore/DoubleStatistics.java | 82 + .../hive/metastore/HiveColumnStatistics.java | 417 ++ .../plugin/hive/metastore/HiveMetastore.java | 200 + .../hive/metastore/HiveMetastoreModule.java | 57 + .../hive/metastore/HivePageSinkMetadata.java | 118 + .../HivePageSinkMetadataProvider.java | 63 + .../hive/metastore/HivePartitionName.java | 111 + .../plugin/hive/metastore/HivePrincipal.java | 104 + .../hive/metastore/HivePrivilegeInfo.java | 157 + .../plugin/hive/metastore/HiveTableName.java | 84 + .../hive/metastore/HiveTransaction.java | 166 + .../hive/metastore/IntegerStatistics.java | 82 + .../metastore/MetastoreClientFactory.java | 36 + 
.../hive/metastore/MetastoreConfig.java | 62 + .../plugin/hive/metastore/MetastoreUtil.java | 302 + .../plugin/hive/metastore/Partition.java | 217 + .../hive/metastore/PartitionFilter.java | 87 + .../metastore/PartitionWithStatistics.java | 62 + .../hive/metastore/PrincipalPrivileges.java | 64 + .../metastore/RecordingHiveMetastore.java | 682 ++ .../SemiTransactionalHiveMetastore.java | 3373 +++++++++ .../plugin/hive/metastore/SortingColumn.java | 127 + .../plugin/hive/metastore/Storage.java | 186 + .../plugin/hive/metastore/StorageFormat.java | 139 + .../plugin/hive/metastore/Table.java | 339 + .../hive/metastore/UserDatabaseKey.java | 79 + .../plugin/hive/metastore/UserTableKey.java | 113 + .../WriteHiveMetastoreRecordingProcedure.java | 66 + .../hive/metastore/file/DatabaseMetadata.java | 89 + .../metastore/file/FileHiveMetastore.java | 1357 ++++ .../file/FileHiveMetastoreConfig.java | 51 + .../metastore/file/FileMetastoreModule.java | 38 + .../metastore/file/PartitionMetadata.java | 166 + .../metastore/file/PermissionMetadata.java | 60 + .../hive/metastore/file/TableMetadata.java | 283 + .../metastore/glue/GlueExpressionUtil.java | 85 + .../metastore/glue/GlueHiveMetastore.java | 898 +++ .../glue/GlueHiveMetastoreConfig.java | 140 + .../metastore/glue/GlueMetastoreModule.java | 60 + .../glue/converter/GlueInputConverter.java | 117 + .../glue/converter/GlueToPrestoConverter.java | 153 + .../thrift/BridgingHiveMetastore.java | 486 ++ .../metastore/thrift/MetastoreLocator.java | 25 + .../thrift/StaticMetastoreConfig.java | 71 + .../thrift/StaticMetastoreLocator.java | 96 + .../metastore/thrift/ThriftConstants.java | 46 + .../metastore/thrift/ThriftHiveMetastore.java | 1897 +++++ .../thrift/ThriftHiveMetastoreClient.java | 557 ++ .../thrift/ThriftHiveMetastoreConfig.java | 142 + .../metastore/thrift/ThriftMetastore.java | 188 + .../thrift/ThriftMetastoreApiStats.java | 97 + .../thrift/ThriftMetastoreClient.java | 189 + .../thrift/ThriftMetastoreClientFactory.java | 103 + .../thrift/ThriftMetastoreModule.java | 120 + .../thrift/ThriftMetastoreStats.java | 317 + .../metastore/thrift/ThriftMetastoreUtil.java | 1000 +++ .../hive/metastore/thrift/Transport.java | 221 + .../hive/omnidata/CommunicationConfig.java | 34 + .../hive/omnidata/OmniDataNodeManager.java | 287 + .../hive/omnidata/OmniDataNodeStatus.java | 44 + .../plugin/hive/orc/HdfsOrcDataSource.java | 82 + .../plugin/hive/orc/OrcAcidRowId.java | 124 + .../plugin/hive/orc/OrcConcatPageSource.java | 133 + .../hive/orc/OrcDeleteDeltaPageSource.java | 228 + .../orc/OrcDeleteDeltaPageSourceFactory.java | 82 + .../plugin/hive/orc/OrcDeletedRows.java | 285 + .../plugin/hive/orc/OrcPageSource.java | 332 + .../plugin/hive/orc/OrcPageSourceFactory.java | 714 ++ .../hive/orc/OrcPushDownPageSource.java | 125 + .../hive/orc/OrcSelectivePageSource.java | 162 + .../orc/OrcSelectivePageSourceFactory.java | 717 ++ .../hive/parquet/HdfsParquetDataSource.java | 115 + .../parquet/ParquetColumnIOConverter.java | 97 + .../hive/parquet/ParquetPageSource.java | 251 + .../parquet/ParquetPageSourceFactory.java | 407 + .../parquet/ParquetPushDownPageSource.java | 120 + .../hive/parquet/ParquetRecordWriter.java | 108 + .../hive/rcfile/HdfsRcFileDataSource.java | 96 + .../plugin/hive/rcfile/RcFilePageSource.java | 246 + .../hive/rcfile/RcFilePageSourceFactory.java | 237 + .../plugin/hive/rule/HiveFilterPushdown.java | 500 ++ .../plugin/hive/rule/HiveLimitPushdown.java | 143 + .../rule/HivePartialAggregationPushdown.java | 397 + 
.../hive/rule/HivePlanOptimizerProvider.java | 72 + .../plugin/hive/rule/HiveProjectPushdown.java | 99 + .../plugin/hive/rule/HivePushdownUtil.java | 168 + .../hive/s3/ConfigurationInitializer.java | 21 + .../plugin/hive/s3/HiveS3Config.java | 419 ++ .../plugin/hive/s3/HiveS3Module.java | 76 + .../plugin/hive/s3/PrestoS3AclType.java | 42 + .../plugin/hive/s3/PrestoS3ClientFactory.java | 178 + .../s3/PrestoS3ConfigurationInitializer.java | 167 + .../plugin/hive/s3/PrestoS3Constants.java | 42 + .../plugin/hive/s3/PrestoS3FileSystem.java | 1285 ++++ .../s3/PrestoS3FileSystemMetricCollector.java | 73 + .../hive/s3/PrestoS3FileSystemStats.java | 319 + .../plugin/hive/s3/PrestoS3SelectClient.java | 87 + .../plugin/hive/s3/PrestoS3SignerType.java | 27 + .../plugin/hive/s3/PrestoS3SseType.java | 20 + .../plugin/hive/s3/S3FileSystemType.java | 20 + .../hive/security/AccessControlMetadata.java | 125 + .../AccessControlMetadataFactory.java | 21 + .../hive/security/HiveSecurityModule.java | 67 + .../hive/security/LegacyAccessControl.java | 310 + .../hive/security/LegacySecurityConfig.java | 105 + .../hive/security/LegacySecurityModule.java | 32 + .../plugin/hive/security/SecurityConfig.java | 50 + .../hive/security/SecurityConstants.java | 50 + .../security/SqlStandardAccessControl.java | 537 ++ .../SqlStandardAccessControlMetadata.java | 184 + .../security/SqlStandardSecurityModule.java | 74 + .../SystemTableAwareAccessControl.java | 108 + .../statistics/HiveStatisticsProvider.java | 41 + .../MetastoreHiveStatisticsProvider.java | 935 +++ .../statistics/TableColumnStatistics.java | 30 + .../plugin/hive/util/AsyncQueue.java | 242 + .../plugin/hive/util/ConfigurationUtils.java | 64 + .../hive/util/CustomSplitConversionUtils.java | 66 + .../hive/util/CustomSplitConverter.java | 37 + .../plugin/hive/util/DecimalUtils.java | 58 + .../plugin/hive/util/FieldSetterFactory.java | 488 ++ .../hive/util/FooterAwareRecordReader.java | 84 + .../plugin/hive/util/HiveBucketingV1.java | 204 + .../plugin/hive/util/HiveBucketingV2.java | 228 + .../plugin/hive/util/HiveFileIterator.java | 185 + .../hive/util/HudiRealtimeSplitConverter.java | 71 + .../plugin/hive/util/IndexCache.java | 233 + .../plugin/hive/util/IndexCacheLoader.java | 86 + .../hive/util/InternalHiveSplitFactory.java | 267 + .../hive/util/LoggingInvocationHandler.java | 161 + .../plugin/hive/util/MergingPageIterator.java | 142 + .../plugin/hive/util/PageSourceUtil.java | 250 + .../plugin/hive/util/ResumableTask.java | 60 + .../plugin/hive/util/ResumableTasks.java | 54 + .../plugin/hive/util/RetryDriver.java | 170 + .../plugin/hive/util/SerDeUtils.java | 293 + .../plugin/hive/util/SortBuffer.java | 138 + .../plugin/hive/util/Statistics.java | 456 ++ .../plugin/hive/util/TempFileReader.java | 100 + .../plugin/hive/util/TempFileWriter.java | 96 + .../plugin/hive/util/ThrottledAsyncQueue.java | 85 + .../license/license-header-alternate-2010.txt | 13 + .../license/license-header-alternate-2012.txt | 13 + .../license/license-header-alternate-2020.txt | 12 + .../license/license-header-alternate-2021.txt | 12 + .../resource/license/license-header-third.txt | 12 + .../main/resource/license/license-header.txt | 12 + .../connector/src/modernizer/violations.xml | 32 + .../plugin/hive/AbstractTestHive.java | 5321 +++++++++++++ .../hive/AbstractTestHiveFileFormats.java | 947 +++ .../hive/AbstractTestHiveFileSystem.java | 576 ++ .../plugin/hive/AbstractTestHiveLocal.java | 123 + .../plugin/hive/HiveBenchmarkQueryRunner.java | 96 + .../plugin/hive/HiveQueryRunner.java | 
350 + .../prestosql/plugin/hive/HiveTestUtils.java | 272 + .../hive/TestBackgroundHiveSplitLoader.java | 1119 +++ .../plugin/hive/TestColumnTypeCacheable.java | 779 ++ .../plugin/hive/TestFileSystemCache.java | 57 + .../plugin/hive/TestHiveBooleanParser.java | 51 + .../plugin/hive/TestHiveBucketing.java | 337 + .../plugin/hive/TestHiveColumnHandle.java | 72 + .../prestosql/plugin/hive/TestHiveConfig.java | 441 ++ .../plugin/hive/TestHiveConnectorFactory.java | 78 + .../plugin/hive/TestHiveDecimalParser.java | 59 + .../hive/TestHiveDistributedAggregations.java | 33 + .../hive/TestHiveDistributedJoinQueries.java | 28 + ...ibutedJoinQueriesWithDynamicFiltering.java | 242 + .../TestHiveDistributedOrderByQueries.java | 27 + .../hive/TestHiveDistributedQueries.java | 54 + .../TestHiveDistributedStarTreeQueries.java | 67 + .../TestHiveDistributedWindowQueries.java | 28 + .../hive/TestHiveFileBasedSecurity.java | 70 + .../plugin/hive/TestHiveFileFormats.java | 889 +++ .../plugin/hive/TestHiveFileMetastore.java | 68 + .../hive/TestHiveInMemoryMetastore.java | 44 + .../hive/TestHiveIntegrationSmokeTest.java | 6625 +++++++++++++++++ .../plugin/hive/TestHiveLocationService.java | 108 + .../plugin/hive/TestHiveMetadata.java | 88 + .../plugin/hive/TestHivePageSink.java | 431 ++ .../plugin/hive/TestHivePageSource.java | 98 + .../hive/TestHivePageSourceProvider.java | 180 + .../prestosql/plugin/hive/TestHiveRoles.java | 497 ++ .../prestosql/plugin/hive/TestHiveSplit.java | 102 + .../plugin/hive/TestHiveSplitSource.java | 853 +++ .../plugin/hive/TestHiveTableHandle.java | 53 + .../plugin/hive/TestHiveTypeTranslator.java | 103 + .../prestosql/plugin/hive/TestHiveUtil.java | 257 + .../hive/TestHiveVacuumTableHandle.java | 83 + .../prestosql/plugin/hive/TestHiveView.java | 151 + .../plugin/hive/TestHiveWriteUtils.java | 52 + .../plugin/hive/TestHiveWriterFactory.java | 193 + .../plugin/hive/TestIonSqlQueryBuilder.java | 129 + .../prestosql/plugin/hive/TestOrcCache.java | 328 + .../plugin/hive/TestOrcFileWriterConfig.java | 68 + .../hive/TestOrcPageSourceMemoryTracking.java | 819 ++ .../hive/TestParquetFileWriterConfig.java | 53 + .../hive/TestPartitionOfflineException.java | 39 + .../plugin/hive/TestPartitionUpdate.java | 54 + .../hive/TestS3SelectLineRecordReader.java | 29 + .../plugin/hive/TestS3SelectPushdown.java | 45 + .../plugin/hive/TestS3SelectRecordCursor.java | 240 + .../prestosql/plugin/hive/TestShowStats.java | 184 + .../hive/TestTableOfflineException.java | 39 + .../TestHdfsKerberosConfig.java | 38 + .../TestMetastoreKerberosConfig.java | 42 + .../benchmark/DynamicFilterBenchmark.java | 150 + .../plugin/hive/benchmark/FileFormat.java | 493 ++ .../plugin/hive/benchmark/FormatWriter.java | 26 + .../benchmark/HiveFileFormatBenchmark.java | 638 ++ .../TestHiveFileFormatBenchmark.java | 72 + .../plugin/hive/gcs/TestHiveGcsConfig.java | 49 + .../metastore/TestCachingHiveMetastore.java | 283 + .../hive/metastore/TestMetastoreConfig.java | 52 + .../hive/metastore/TestMetastoreUtil.java | 173 + .../metastore/TestPrincipalPrivileges.java | 42 + .../metastore/TestRecordingHiveMetastore.java | 291 + .../TestSemiTransactionalHiveMetastore.java | 185 + .../plugin/hive/metastore/TestStorage.java | 36 + .../metastore/UnimplementedHiveMetastore.java | 277 + .../glue/TestGlueExpressionUtil.java | 85 + .../glue/TestGlueHiveMetastoreConfig.java | 67 + .../glue/TestGlueInputConverter.java | 110 + .../glue/TestGlueToPrestoConverter.java | 169 + .../metastore/glue/TestHiveGlueMetastore.java | 99 + 
.../glue/TestingMetastoreObjects.java | 155 + .../thrift/InMemoryThriftMetastore.java | 705 ++ .../thrift/MockThriftMetastoreClient.java | 574 ++ .../MockThriftMetastoreClientFactory.java | 53 + .../thrift/TestStaticMetastoreConfig.java | 73 + .../thrift/TestStaticMetastoreLocator.java | 149 + .../thrift/TestThriftHiveMetastoreConfig.java | 70 + .../thrift/TestThriftMetastoreUtil.java | 414 + .../thrift/TestingMetastoreLocator.java | 41 + .../hive/orc/TestOrcAcidPageSource.java | 270 + .../orc/TestOrcDeleteDeltaPageSource.java | 60 + .../plugin/hive/orc/TestOrcDeletedRows.java | 141 + .../parquet/AbstractTestParquetReader.java | 1815 +++++ .../plugin/hive/parquet/ParquetTester.java | 652 ++ .../hive/parquet/TestFullParquetReader.java | 26 + .../parquet/TestParquetPageSourceFactory.java | 87 + .../hive/parquet/TestParquetReader.java | 26 + .../predicate/TestParquetPredicateUtils.java | 157 + .../write/MapKeyValuesSchemaConverter.java | 218 + ...LevelArrayMapKeyValuesSchemaConverter.java | 232 + .../SingleLevelArraySchemaConverter.java | 193 + .../write/TestDataWritableWriteSupport.java | 59 + .../parquet/write/TestDataWritableWriter.java | 410 + .../write/TestMapredParquetOutputFormat.java | 65 + .../hive/rule/TestHiveFilterPushdown.java | 81 + .../hive/rule/TestHiveLimitPushdown.java | 45 + .../TestHivePartialAggregationPushdown.java | 128 + .../rule/TestHivePlanOptimizerProvider.java | 67 + .../plugin/hive/rule/TestHivePushdown.java | 40 + .../hive/rule/TestHivePushdownUtil.java | 380 + .../plugin/hive/s3/MockAmazonS3.java | 130 + .../plugin/hive/s3/TestHiveS3Config.java | 130 + .../hive/s3/TestPrestoS3FileSystem.java | 651 ++ .../security/TestLegacyAccessControl.java | 28 + .../security/TestLegacySecurityConfig.java | 61 + .../TestSqlStandardAccessControl.java | 28 + .../BenchmarkGetPartitionsSample.java | 83 + .../TestMetastoreHiveStatisticsProvider.java | 870 +++ .../plugin/hive/util/TestAsyncQueue.java | 256 + .../util/TestCustomSplitConversionUtils.java | 58 + .../plugin/hive/util/TestIndexCache.java | 270 + .../hive/util/TestIndexCacheLoader.java | 123 + .../plugin/hive/util/TestLazyMap.java | 82 + .../util/TestLoggingInvocationHandler.java | 93 + .../hive/util/TestMergingPageIterator.java | 101 + .../plugin/hive/util/TestSerDeUtils.java | 334 + .../plugin/hive/util/TestStatistics.java | 238 + .../hive/util/TestThrottledAsyncQueue.java | 256 + .../src/test/resources/addressbook.parquet | Bin 0 -> 3956 bytes .../bucket_00000 | Bin 0 -> 646 bytes .../bucket_00000 | Bin 0 -> 655 bytes .../io/prestosql/plugin/hive/security.json | 17 + .../bucket_00000 | Bin 0 -> 11406 bytes .../bucket_00000 | Bin 0 -> 873 bytes .../bucket_00000 | Bin 0 -> 882 bytes .../connector/src/test/sql/create-test.sql | 340 + .../connector/src/test/sql/drop-test.sql | 34 + .../omnidata-openlookeng-connector/pom.xml | 20 + .../license/license-header-alternate-2010.txt | 13 + .../license/license-header-alternate-2012.txt | 13 + .../license/license-header-alternate-2020.txt | 12 + .../license/license-header-alternate-2021.txt | 12 + .../resource/license/license-header-third.txt | 12 + .../main/resource/license/license-header.txt | 12 + .../src/modernizer/violations.xml | 32 + .../stub/client/pom.xml | 27 + .../omnidata/block/BlockDeserializer.java | 20 + .../boostkit/omnidata/reader/DataReader.java | 21 + .../omnidata/reader/DataReaderFactory.java | 26 + .../stub/core/pom.xml | 57 + .../boostkit/omnidata/OmniDataProperty.java | 62 + .../omnidata/decode/Deserializer.java | 18 + 
.../omnidata/exception/OmniDataException.java | 23 + .../omnidata/exception/OmniErrorCode.java | 25 + .../expression/OmniExpressionChecker.java | 34 + .../omnidata/model/AggregationInfo.java | 89 + .../boostkit/omnidata/model/Column.java | 26 + .../boostkit/omnidata/model/Predicate.java | 53 + .../boostkit/omnidata/model/TaskSource.java | 36 + .../omnidata/model/datasource/DataSource.java | 18 + .../datasource/hdfs/HdfsOrcDataSource.java | 22 + .../hdfs/HdfsParquetDataSource.java | 22 + .../datasource/hdfs/HdfsRecordDataSource.java | 24 + .../stub/pom.xml | 83 + omnidata/omnidata-spark-connector/.gitignore | 5 + omnidata/omnidata-spark-connector/LICENSE | 201 + omnidata/omnidata-spark-connector/README.md | 29 +- .../connector/pom.xml | 144 + .../org/apache/spark/sql/DataIoAdapter.java | 883 +++ .../org/apache/spark/sql/NdpFilterUtils.java | 120 + .../java/org/apache/spark/sql/NdpUdfEnum.java | 45 + .../apache/spark/sql/NdpUdfExpressions.java | 292 + .../java/org/apache/spark/sql/NdpUtils.java | 361 + .../apache/spark/sql/OmniDataProperties.java | 49 + .../org/apache/spark/sql/PageCandidate.java | 62 + .../org/apache/spark/sql/PageToColumnar.java | 67 + .../spark/sql/PrestoExpressionInfo.java | 74 + .../org/apache/spark/sql/PushDownData.java | 55 + .../org/apache/spark/sql/PushDownManager.java | 112 + .../sql/execution/DataSourceScanExec.scala | 624 ++ .../spark/sql/execution/QueryExecution.scala | 350 + .../spark/sql/execution/SparkPlanner.scala | 113 + .../spark/sql/execution/SparkStrategies.scala | 787 ++ .../execution/basicPhysicalOperators.scala | 816 ++ .../datasources/DataSourceStrategy.scala | 715 ++ .../execution/datasources/FilePartition.scala | 97 + .../datasources/FileScanRDDPushDown.scala | 234 + .../datasources/FileSourceStrategy.scala | 235 + .../datasources/v2/DataSourceV2Strategy.scala | 310 + .../spark/sql/execution/ndp/NdpPushDown.scala | 436 ++ .../spark/sql/execution/ndp/NdpSupport.scala | 86 + .../spark/sql/hive/HiveStrategies.scala | 305 + omnidata/omnidata-spark-connector/pom.xml | 19 + .../stub/client/pom.xml | 32 + .../omnidata/reader/impl/DataReaderImpl.java | 30 + .../omnidata/spark/SparkDeserializer.java | 22 + .../stub/core/pom.xml | 29 + .../omnidata/decode/Deserializer.java | 19 + .../omnidata/exception/OmniDataException.java | 24 + .../omnidata/exception/OmniErrorCode.java | 26 + .../omnidata/model/AggregationInfo.java | 33 + .../boostkit/omnidata/model/Column.java | 27 + .../boostkit/omnidata/model/Predicate.java | 33 + .../boostkit/omnidata/model/TaskSource.java | 24 + .../omnidata/model/datasource/DataSource.java | 19 + .../datasource/hdfs/HdfsOrcDataSource.java | 23 + .../hdfs/HdfsParquetDataSource.java | 23 + .../omnidata/type/ArrayDecodeType.java | 16 + .../omnidata/type/BooleanDecodeType.java | 15 + .../omnidata/type/ByteDecodeType.java | 15 + .../omnidata/type/DateDecodeType.java | 15 + .../boostkit/omnidata/type/DecodeType.java | 15 + .../omnidata/type/DoubleDecodeType.java | 15 + .../omnidata/type/FloatDecodeType.java | 15 + .../boostkit/omnidata/type/IntDecodeType.java | 15 + .../omnidata/type/LongDecodeType.java | 15 + .../omnidata/type/LongToByteDecodeType.java | 13 + .../omnidata/type/LongToFloatDecodeType.java | 14 + .../omnidata/type/LongToIntDecodeType.java | 14 + .../omnidata/type/LongToShortDecodeType.java | 14 + .../boostkit/omnidata/type/MapDecodeType.java | 17 + .../boostkit/omnidata/type/RowDecodeType.java | 15 + .../omnidata/type/ShortDecodeType.java | 15 + .../omnidata/type/VarcharDecodeType.java | 15 + 
.../omnidata-spark-connector/stub/pom.xml | 68 + 555 files changed, 116824 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 omnidata/omnidata-openlookeng-connector/.gitignore create mode 100644 omnidata/omnidata-openlookeng-connector/build.sh create mode 100644 omnidata/omnidata-openlookeng-connector/connector/pom.xml create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BackgroundHiveSplitLoader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BaseStorageFormat.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CachingDirectoryLister.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CoercionPolicy.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConcurrentLazyQueue.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConnectorObjectNameGeneratorModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CreateEmptyPartitionProcedure.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DeleteDeltaLocations.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DirectoryLister.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DynamicConfigurationProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/FileFormatDataSourceStats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastoreTableRefresh.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHdfs.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHive.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveTransactionHeartbeats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveVacuum.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForRecordingHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursor.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursorProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfiguration.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfigurationInitializer.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsEnvironment.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveACIDWriteType.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveAnalyzeProperties.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBasicStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBooleanParser.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketAdapterRecordCursor.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketFunction.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketProperty.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketing.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCatalogName.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionPolicy.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionRecordCursor.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveColumnHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCompressionCodec.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnector.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnectorFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDecimalParser.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDeleteAsInsertTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveErrorCode.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveEventClient.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriterFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHandleResolver.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHdfsConfiguration.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInputInfo.java create mode 
100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInsertTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveLocationService.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadataFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetastoreClosure.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNodePartitioningProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNotReadableException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOffloadExpression.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOutputTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSink.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSinkProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartition.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionKey.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionManager.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionResult.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitioningHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePlugin.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveProcedureModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePushDownRecordPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveReadOnlyException.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursor.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursorProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSchemaProperties.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSelectivePageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSessionProperties.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplit.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitLoader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitManager.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitWrapper.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveStorageFormat.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableProperties.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionManager.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveType.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeName.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeTranslator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUpdateTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumSplitSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveViewNotSupportedException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWritableTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriteUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterFactory.java create mode 
100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterStats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWrittenPartitions.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/InternalHiveSplit.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/IonSqlQueryBuilder.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationService.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NamenodeStats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NodeVersion.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ParquetFileWriterConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionNotFoundException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionOfflineException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionUpdate.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriterFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RecordFileWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RoleAlreadyExistsException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectCsvRecordReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectLineRecordReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursor.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursorProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SnapshotTempFileWriter.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SortingFileWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SyncPartitionMetadataProcedure.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TableOfflineException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TransactionalMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TypeTranslator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumCleaner.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumEligibleTableCollector.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumTableInfoForCleaner.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ViewAlreadyExistsException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteCompletedEvent.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteIdInfo.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/AuthenticationModules.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/CachingKerberosHadoopAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/DirectHdfsAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/GenericExceptionAction.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HadoopAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsKerberosConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveAuthenticationModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveIdentity.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveMetastoreAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/ImpersonatingHdfsAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHadoopAuthentication.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHiveMetastoreAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosTicketUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/MetastoreKerberosConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHdfsAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHiveMetastoreAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/SimpleHadoopAuthentication.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/UserGroupInformationUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/avro/PrestoAvroSerDe.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DecimalCoercers.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DoubleToFloatCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/FloatToDoubleCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/HiveCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberToVarcharCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberUpscaleCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/TypeCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToIntegerNumberCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToVarcharCoercer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsAccessTokenProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsConfigurationProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GoogleGcsConfigurationInitializer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/BooleanStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/CachingHiveMetastore.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Column.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Database.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DateStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DecimalStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DoubleStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveColumnStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastoreModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadataProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePartitionName.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrincipal.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrivilegeInfo.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTableName.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTransaction.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/IntegerStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreClientFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Partition.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionFilter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionWithStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PrincipalPrivileges.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/RecordingHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SemiTransactionalHiveMetastore.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SortingColumn.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Storage.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/StorageFormat.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Table.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserDatabaseKey.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserTableKey.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/WriteHiveMetastoreRecordingProcedure.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/DatabaseMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileMetastoreModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PartitionMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PermissionMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/TableMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueExpressionUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueMetastoreModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueInputConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueToPrestoConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/MetastoreLocator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreLocator.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftConstants.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreClient.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreApiStats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClient.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClientFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreStats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/Transport.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/CommunicationConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeManager.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeStatus.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/HdfsOrcDataSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcAcidRowId.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcConcatPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeletedRows.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPushDownPageSource.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/HdfsParquetDataSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetColumnIOConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPushDownPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetRecordWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/HdfsRcFileDataSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveFilterPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveLimitPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePartialAggregationPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePlanOptimizerProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveProjectPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePushdownUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/ConfigurationInitializer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Config.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Module.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3AclType.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ClientFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ConfigurationInitializer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3Constants.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystem.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemMetricCollector.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemStats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SelectClient.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SignerType.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SseType.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/S3FileSystemType.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadataFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/HiveSecurityModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacyAccessControl.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConstants.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControl.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControlMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardSecurityModule.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SystemTableAwareAccessControl.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/HiveStatisticsProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/MetastoreHiveStatisticsProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/TableColumnStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/AsyncQueue.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ConfigurationUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConversionUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConverter.java 
create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/DecimalUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FieldSetterFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FooterAwareRecordReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV1.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV2.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveFileIterator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HudiRealtimeSplitConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCache.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCacheLoader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/InternalHiveSplitFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/LoggingInvocationHandler.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/MergingPageIterator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/PageSourceUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTask.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTasks.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/RetryDriver.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SerDeUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SortBuffer.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/Statistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ThrottledAsyncQueue.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2010.txt create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2012.txt create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2020.txt create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2021.txt create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-third.txt create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header.txt create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/modernizer/violations.xml create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHive.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileFormats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileSystem.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveLocal.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveBenchmarkQueryRunner.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveQueryRunner.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveTestUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestBackgroundHiveSplitLoader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestColumnTypeCacheable.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestFileSystemCache.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBooleanParser.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBucketing.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveColumnHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConnectorFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDecimalParser.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedAggregations.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueries.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueriesWithDynamicFiltering.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedOrderByQueries.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedQueries.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedStarTreeQueries.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedWindowQueries.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileBasedSecurity.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileFormats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveInMemoryMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveLocationService.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveMetadata.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSink.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSourceProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveRoles.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplit.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplitSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTypeTranslator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveVacuumTableHandle.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveView.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriteUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriterFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestIonSqlQueryBuilder.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcCache.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcFileWriterConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcPageSourceMemoryTracking.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestParquetFileWriterConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionOfflineException.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionUpdate.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectLineRecordReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectRecordCursor.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestShowStats.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestTableOfflineException.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestHdfsKerberosConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestMetastoreKerberosConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/DynamicFilterBenchmark.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FileFormat.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FormatWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/HiveFileFormatBenchmark.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/TestHiveFileFormatBenchmark.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/gcs/TestHiveGcsConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestCachingHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestPrincipalPrivileges.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestRecordingHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestSemiTransactionalHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestStorage.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/UnimplementedHiveMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueExpressionUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueHiveMetastoreConfig.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueInputConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueToPrestoConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestHiveGlueMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestingMetastoreObjects.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/InMemoryThriftMetastore.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClient.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClientFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreLocator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftHiveMetastoreConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftMetastoreUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestingMetastoreLocator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcAcidPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeleteDeltaPageSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeletedRows.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/AbstractTestParquetReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/ParquetTester.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestFullParquetReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetPageSourceFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/predicate/TestParquetPredicateUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/MapKeyValuesSchemaConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArrayMapKeyValuesSchemaConverter.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArraySchemaConverter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriteSupport.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriter.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestMapredParquetOutputFormat.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveFilterPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveLimitPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePartialAggregationPushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePlanOptimizerProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdown.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdownUtil.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/MockAmazonS3.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestHiveS3Config.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestPrestoS3FileSystem.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacyAccessControl.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacySecurityConfig.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestSqlStandardAccessControl.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/BenchmarkGetPartitionsSample.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/TestMetastoreHiveStatisticsProvider.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestAsyncQueue.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestCustomSplitConversionUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCache.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCacheLoader.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLazyMap.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLoggingInvocationHandler.java create mode 100644 
omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestMergingPageIterator.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestSerDeUtils.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestStatistics.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestThrottledAsyncQueue.java create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/resources/addressbook.parquet create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/resources/fullacid_delete_delta_test/delete_delta_0000004_0000004_0000/bucket_00000 create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/resources/fullacid_delete_delta_test/delete_delta_0000007_0000007_0000/bucket_00000 create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/resources/io/prestosql/plugin/hive/security.json create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nationFile25kRowsSortedOnNationKey/bucket_00000 create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nation_delete_deltas/delete_delta_0000003_0000003_0000/bucket_00000 create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nation_delete_deltas/delete_delta_0000004_0000004_0000/bucket_00000 create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/sql/create-test.sql create mode 100644 omnidata/omnidata-openlookeng-connector/connector/src/test/sql/drop-test.sql create mode 100644 omnidata/omnidata-openlookeng-connector/pom.xml create mode 100644 omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2010.txt create mode 100644 omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2012.txt create mode 100644 omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2020.txt create mode 100644 omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2021.txt create mode 100644 omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-third.txt create mode 100644 omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header.txt create mode 100644 omnidata/omnidata-openlookeng-connector/src/modernizer/violations.xml create mode 100644 omnidata/omnidata-openlookeng-connector/stub/client/pom.xml create mode 100644 omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/block/BlockDeserializer.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReader.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReaderFactory.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/pom.xml create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/OmniDataProperty.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java create mode 100644 
omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/expression/OmniExpressionChecker.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsRecordDataSource.java create mode 100644 omnidata/omnidata-openlookeng-connector/stub/pom.xml create mode 100644 omnidata/omnidata-spark-connector/.gitignore create mode 100644 omnidata/omnidata-spark-connector/LICENSE create mode 100644 omnidata/omnidata-spark-connector/connector/pom.xml create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/DataIoAdapter.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpFilterUtils.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfEnum.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfExpressions.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUtils.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/OmniDataProperties.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageCandidate.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageToColumnar.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PrestoExpressionInfo.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownData.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownManager.java create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala create mode 100644 
omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDDPushDown.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpPushDown.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpSupport.scala create mode 100644 omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala create mode 100644 omnidata/omnidata-spark-connector/pom.xml create mode 100644 omnidata/omnidata-spark-connector/stub/client/pom.xml create mode 100644 omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/impl/DataReaderImpl.java create mode 100644 omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/spark/SparkDeserializer.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/pom.xml create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ArrayDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/BooleanDecodeType.java create mode 100644 
omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ByteDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DateDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DoubleDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/FloatDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/IntDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToByteDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToFloatDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToIntDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToShortDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/MapDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/RowDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ShortDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/VarcharDecodeType.java create mode 100644 omnidata/omnidata-spark-connector/stub/pom.xml diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..9fe95928 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.iml +*.ipr +*.iws +.idea/ +target/ \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/.gitignore b/omnidata/omnidata-openlookeng-connector/.gitignore new file mode 100644 index 00000000..9fe95928 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/.gitignore @@ -0,0 +1,5 @@ +*.iml +*.ipr +*.iws +.idea/ +target/ \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/README.md b/omnidata/omnidata-openlookeng-connector/README.md index 193fc2b5..8b37416e 100644 --- a/omnidata/omnidata-openlookeng-connector/README.md +++ b/omnidata/omnidata-openlookeng-connector/README.md @@ -1 +1,31 @@ -# omnidata-openlookeng-connector \ No newline at end of file +# OmniData Connector + +## Overview + +OmniData Connector is a data source connector developed for openLooKeng. + +The OmniData connector allows querying data sources where the OmniData server is deployed. It pushes down operators such as filters to the OmniData service close to the storage, improving the performance of a storage-computing-separated system. + +## Building OmniData Connector + +1. OmniData Connector is developed under the openLooKeng architecture, so you need to build openLooKeng first as a non-root user. +2. Run the following command from the project root directory:
+`mvn clean install -Dos.detected.arch="aarch64"`
+Then you will find omnidata-openlookeng-connector-*.zip under the omnidata-openlookeng-connector/connector/target/ directory. +OmniData Connector has a comprehensive set of unit tests that can take several minutes to run. You can disable the tests when building:
+`mvn clean install -DskipTests -Dos.detected.arch="aarch64"`
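The deployment steps in the next section unzip the built plugin into openLooKeng and point a catalog at it via `connector.name=omnidata-openlookeng`. A minimal sketch of creating such a catalog is shown below; only the `connector.name` value comes from these instructions, while the file path and the Hive metastore property are illustrative assumptions (the connector is derived from the openLooKeng Hive connector), so check the properties documented for your OmniData release.

```bash
# Hypothetical sketch: create an openLooKeng catalog that loads the OmniData connector.
# The path and metastore URI below are illustrative assumptions, not taken from this patch.
mkdir -p etc/catalog
cat > etc/catalog/omnidata.properties <<'EOF'
connector.name=omnidata-openlookeng
# Assumed Hive-style metastore setting, since this plugin is built on the Hive connector.
hive.metastore.uri=thrift://your-metastore-host:9083
EOF
```

After restarting the openLooKeng server, queries against the `omnidata` catalog should go through this connector and push eligible operators down to the OmniData service.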
+ +## Deploying OmniData Connector + +1. Unzip omnidata-openlookeng-connector-*.zip to the plugin directory of openLooKeng. +2. Obtain the latest OmniData software package, replace the boostkit-omnidata-client-\*.jar and boostkit-omnidata-core-\*.jar in the omnidata-openlookeng-connector-\* directory. +3. Set "connector.name=omnidata-openlookeng" in the openLooKeng catalog properties file. + +## Contribution Guidelines + +Track the bugs and feature requests via GitHub issues. + +## More Information + +For further assistance, send an email to kunpengcompute@huawei.com. + diff --git a/omnidata/omnidata-openlookeng-connector/build.sh b/omnidata/omnidata-openlookeng-connector/build.sh new file mode 100644 index 00000000..25ef7e1e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/build.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +# Building omniData openLooKeng Connector packages +# Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + +set -e + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +mvn -T12 clean install -Dos.detected.arch="${cpu_name}" + diff --git a/omnidata/omnidata-openlookeng-connector/connector/pom.xml b/omnidata/omnidata-openlookeng-connector/connector/pom.xml new file mode 100644 index 00000000..2e9e819a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/pom.xml @@ -0,0 +1,791 @@ + + + 4.0.0 + + + io.hetu.core + presto-root + 1.4.0 + + + openlookeng-omnidata-connector + openLooKeng OmniData Connector + hetu-plugin + + + ${os.detected.arch} + 1.35.0 + 30.1.1-jre + 206 + 9.4.43.v20210629 + + + + + com.google.guava + guava + ${dep.guava.version} + + + com.google.errorprone + error_prone_annotations + + + + + + org.bouncycastle + bcprov-jdk15on + 1.68 + runtime + + + + + io.grpc + grpc-api + ${dep.grpc.version} + runtime + + + com.google.errorprone + error_prone_annotations + + + + + io.grpc + grpc-protobuf + ${dep.grpc.version} + runtime + + + com.google.errorprone + error_prone_annotations + + + + + io.grpc + grpc-stub + ${dep.grpc.version} + runtime + + + com.google.errorprone + error_prone_annotations + + + + + io.grpc + grpc-netty-shaded + ${dep.grpc.version} + runtime + + + com.google.errorprone + error_prone_annotations + + + com.google.code.gson + gson + + + + + + io.hetu.core + presto-plugin-toolkit + + + + org.mockito + mockito-core + test + + + + io.hetu.core + hetu-common + + + + io.hetu.core + presto-orc + + + + io.hetu.core + presto-parquet + + + org.apache.parquet + parquet-encoding + + + + + + org.apache.hudi + hudi-hadoop-mr + + + + io.hetu.core + presto-expressions + + + + io.hetu.core + presto-memory-context + + + + io.hetu.core + presto-rcfile + + + + io.prestosql.hadoop + hadoop-apache + + + + io.prestosql.hive + hive-apache + + + + org.apache.thrift + libthrift + + + + io.airlift + aircompressor + + + + io.airlift + stats + + + + io.airlift + bootstrap + + + + io.airlift + concurrent + + + + io.airlift + log + + + + io.airlift + event + + + + io.airlift + json + ${dep.arilift.version} + + + + io.airlift + configuration + + + + io.airlift + parameternames + 1.4 + + + + com.google.inject + guice + 5.0.1 + + + + com.google.code.findbugs + jsr305 + true + + + + it.unimi.dsi + fastutil + + + + javax.validation + validation-api + + + + org.weakref + jmxutils + + + + joda-time + joda-time + 2.10.9 + + + + io.airlift + joda-to-java-time-bridge + runtime + + + + com.amazonaws + aws-java-sdk-core + + + + com.amazonaws + aws-java-sdk-glue + + + + com.amazonaws + aws-java-sdk-s3 + + + + 
com.google.cloud.bigdataoss + util + + + + com.google.cloud.bigdataoss + gcsio + + + + com.google.cloud.bigdataoss + util-hadoop + + + + com.google.cloud.bigdataoss + gcs-connector + + + + com.amazonaws + aws-java-sdk-sts + + + + org.xerial.snappy + snappy-java + runtime + + + + javax.inject + javax.inject + + + + + io.airlift + log-manager + runtime + + + + + io.hetu.core + presto-spi + provided + + + com.google.code.gson + gson + + + + + + io.hetu.core + hetu-cube + provided + + + + io.airlift + slice + 0.39 + + + + io.airlift + units + provided + + + + com.fasterxml.jackson.core + jackson-annotations + provided + + + + org.openjdk.jol + jol-core + provided + + + + com.huawei.boostkit + boostkit-omnidata-client + 1.0.0 + ${dep.os.arch} + + + com.google.code.gson + gson + + + com.google.errorprone + error_prone_annotations + + + org.codehaus.mojo + animal-sniffer-annotations + + + io.airlift + configuration + + + log4j + log4j + + + guava + com.google.guava + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-annotations + + + snappy-java + org.xerial.snappy + + + slf4j-api + org.slf4j + + + zookeeper + org.apache.zookeeper + + + jcl-over-slf4j + org.slf4j + + + hive-storage-api + org.apache.hive + + + jersey-common + org.glassfish.jersey.core + + + jersey-server + org.glassfish.jersey.core + + + jersey-container-servlet-core + org.glassfish.jersey.containers + + + jul-to-slf4j + org.slf4j + + + javassist + org.javassist + + + arrow-vector + org.apache.arrow + + + arrow-memory + org.apache.arrow + + + hadoop-hdfs + org.apache.hadoop + + + hadoop-mapreduce-client-core + org.apache.hadoop + + + hadoop-common + org.apache.hadoop + + + slf4j-log4j12 + org.slf4j + + + leveldbjni-all + org.fusesource.leveldbjni + + + spark-catalyst_2.12 + org.apache.spark + + + spark-core_2.12 + org.apache.spark + + + spark-kvstore_2.12 + org.apache.spark + + + spark-launcher_2.12 + org.apache.spark + + + spark-sketch_2.12 + org.apache.spark + + + spark-sql_2.12 + org.apache.spark + + + spark-tags_2.12 + org.apache.spark + + + unused + org.spark-project.spark + + + com.fasterxml.jackson.datatype + jackson-datatype-jdk8 + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + + + com.fasterxml.jackson.datatype + jackson-datatype-guava + + + com.fasterxml.jackson.datatype + jackson-datatype-joda + + + com.fasterxml.jackson.module + jackson-module-parameter-names + + + + + + com.huawei.boostkit + boostkit-omnidata-core + 1.0.0 + ${dep.os.arch} + + + guava + com.google.guava + + + com.fasterxml.jackson.core + jackson-annotations + + + slf4j-log4j12 + org.slf4j + + + logback-classic + ch.qos.logback + + + io.airlift + configuration + + + com.fasterxml.jackson.core + jackson-databind + + + org.slf4j + slf4j-api + + + com.fasterxml.jackson.datatype + jackson-datatype-jdk8 + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + + + com.fasterxml.jackson.datatype + jackson-datatype-guava + + + com.fasterxml.jackson.datatype + jackson-datatype-joda + + + com.fasterxml.jackson.module + jackson-module-parameter-names + + + + + + + io.hetu.core + presto-spi + test-jar + test + + + com.google.code.gson + gson + + + + + io.airlift + testing-mysql-server + test + + + io.hetu.core + hetu-metastore + test + + + io.hetu.core + presto-main + test + + + org.slf4j + log4j-over-slf4j + + + com.sun + tools + + + org.bouncycastle + bcprov-jdk15on + + + + + + io.hetu.core + presto-main + test-jar + test + + + org.bouncycastle + 
bcprov-jdk15on + + + + + io.hetu.core + presto-client + test + + + + io.hetu.core + presto-parser + test + + + + io.hetu.core + presto-tests + test + + + + io.hetu.core + presto-tpch + test + + + + io.airlift.tpch + tpch + test + + + + org.jetbrains + annotations + provided + + + + org.testng + testng + test + + + + io.airlift + testing + test + + + + org.assertj + assertj-core + test + + + + org.anarres.lzo + lzo-hadoop + test + + + + + io.hetu.core + presto-benchmark + test + + + + org.openjdk.jmh + jmh-core + test + + + + org.openjdk.jmh + jmh-generator-annprocess + test + + + io.hetu.core + hetu-transport + + + + io.hetu.core + hetu-startree + test + + + org.objenesis + objenesis + + + org.checkerframework + checker-qual + + + com.google.errorprone + error_prone_annotations + + + + + org.eclipse.jetty + jetty-util + ${dep.jetty.version} + + + io.airlift + discovery + + + io.airlift + http-client + + + javax.annotation + javax.annotation-api + + + + + + default + + true + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + + **/TestHiveGlueMetastore.java + + + **/TestFullParquetReader.java + + + + + + + + + test-hive-glue + + + + org.apache.maven.plugins + maven-surefire-plugin + + + **/TestHiveGlueMetastore.java + + + + + + + + diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BackgroundHiveSplitLoader.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BackgroundHiveSplitLoader.java new file mode 100644 index 00000000..ee625032 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BackgroundHiveSplitLoader.java @@ -0,0 +1,909 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Suppliers; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterators; +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Streams; +import com.google.common.io.CharStreams; +import com.google.common.util.concurrent.ListenableFuture; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.HiveSplit.BucketConversion; +import io.prestosql.plugin.hive.HiveVacuumTableHandle.Range; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.util.ConfigurationUtils; +import io.prestosql.plugin.hive.util.HiveFileIterator; +import io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryNotAllowedException; +import io.prestosql.plugin.hive.util.InternalHiveSplitFactory; +import io.prestosql.plugin.hive.util.ResumableTask; +import io.prestosql.plugin.hive.util.ResumableTasks; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.resourcegroups.QueryType; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.hive.common.ValidCompactorWriteIdList; +import org.apache.hadoop.hive.common.ValidWriteIdList; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.Reader; +import org.apache.hadoop.hive.shims.HadoopShims; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hive.common.util.Ref; +import org.apache.hudi.hadoop.HoodieParquetInputFormat; +import org.apache.hudi.hadoop.HoodieROTablePathFilter; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.lang.annotation.Annotation; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Deque; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedDeque; +import java.util.concurrent.Executor; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.IntPredicate; +import java.util.function.Supplier; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import 
static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.util.concurrent.Futures.immediateFuture; +import static io.prestosql.plugin.hive.HiveSessionProperties.isDynamicFilteringSplitFilteringEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isForceLocalScheduling; +import static io.prestosql.plugin.hive.HiveUtil.checkCondition; +import static io.prestosql.plugin.hive.HiveUtil.getBucketNumber; +import static io.prestosql.plugin.hive.HiveUtil.getFooterCount; +import static io.prestosql.plugin.hive.HiveUtil.getHeaderCount; +import static io.prestosql.plugin.hive.HiveUtil.getInputFormat; +import static io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered; +import static io.prestosql.plugin.hive.S3SelectPushdown.shouldEnablePushdownForTable; +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.getHiveSchema; +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.getPartitionLocation; +import static io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.FAIL; +import static io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.IGNORED; +import static io.prestosql.plugin.hive.util.HiveFileIterator.NestedDirectoryPolicy.RECURSE; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static java.lang.Math.max; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.hive.common.FileUtils.HIDDEN_FILES_PATH_FILTER; + +public class BackgroundHiveSplitLoader + implements HiveSplitLoader +{ + private static final Logger LOG = Logger.get(BackgroundHiveSplitLoader.class); + + private static final Pattern DELETE_DELTA_PATTERN = Pattern.compile("delete_delta_(\\d+)_(\\d+)(_\\d+)?"); + + private static final ListenableFuture COMPLETED_FUTURE = immediateFuture(null); + + private final Table table; + private final TupleDomain compactEffectivePredicate; + private final Optional tableBucketInfo; + private final HdfsEnvironment hdfsEnvironment; + private final HdfsContext hdfsContext; + private final NamenodeStats namenodeStats; + private final DirectoryLister directoryLister; + private final int loaderConcurrency; + private final boolean recursiveDirWalkerEnabled; + private final Executor executor; + private final ConnectorSession session; + private final ConcurrentLazyQueue partitions; + private final Deque> fileIterators = new ConcurrentLinkedDeque<>(); + private final Optional validWriteIds; + private final Supplier>> dynamicFilterSupplier; + private final Configuration configuration; + private final Supplier hoodiePathFilterSupplier; + + // Purpose of this lock: + // * Write lock: when you need a consistent view across partitions, fileIterators, and hiveSplitSource. + // * Read lock: when you need to modify any of the above. + // Make sure the lock is held throughout the period during which they may not be consistent with each other. + // Details: + // * When write lock is acquired, except the holder, no one can do any of the following: + // ** poll from (or check empty) partitions + // ** poll from (or check empty) or push to fileIterators + // ** push to hiveSplitSource + // * When any of the above three operations is carried out, either a read lock or a write lock must be held. + // * When a series of operations involving two or more of the above three operations are carried out, the lock + // must be continuously held throughout the series of operations. 
+ // Implications: + // * if you hold a read lock but not a write lock, you can do any of the above three operations, but you may + // see a series of operations involving two or more of the operations carried out half way. + private final ReentrantReadWriteLock taskExecutionLock = new ReentrantReadWriteLock(); + + private HiveSplitSource hiveSplitSource; + private volatile boolean stopped; + private Optional queryType; + private Map queryInfo; + private TypeManager typeManager; + private JobConf jobConf; + + private final Map cachedDynamicFilters = new ConcurrentHashMap<>(); + + public BackgroundHiveSplitLoader( + Table table, + Iterable partitions, + TupleDomain compactEffectivePredicate, + Optional tableBucketInfo, + ConnectorSession session, + HdfsEnvironment hdfsEnvironment, + NamenodeStats namenodeStats, + DirectoryLister directoryLister, + Executor executor, + int loaderConcurrency, + boolean recursiveDirWalkerEnabled, + Optional validWriteIds, + Supplier>> dynamicFilterSupplier, + Optional queryType, + Map queryInfo, + TypeManager typeManager) + { + this.table = table; + this.compactEffectivePredicate = compactEffectivePredicate; + this.tableBucketInfo = tableBucketInfo; + this.loaderConcurrency = loaderConcurrency; + this.typeManager = typeManager; + this.session = session; + this.hdfsEnvironment = hdfsEnvironment; + this.namenodeStats = namenodeStats; + this.directoryLister = directoryLister; + this.recursiveDirWalkerEnabled = recursiveDirWalkerEnabled; + this.executor = executor; + this.hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName()); + this.validWriteIds = requireNonNull(validWriteIds, "validWriteIds is null"); + this.dynamicFilterSupplier = dynamicFilterSupplier; + this.queryType = requireNonNull(queryType, "queryType is null"); + this.queryInfo = requireNonNull(queryInfo, "queryproperties is null"); + this.partitions = new ConcurrentLazyQueue<>(getPrunedPartitions(partitions)); + Path path = new Path(getPartitionLocation(table, getPrunedPartitions(partitions).iterator().next().getPartition())); + configuration = hdfsEnvironment.getConfiguration(hdfsContext, path); + jobConf = ConfigurationUtils.toJobConf(configuration); + this.hoodiePathFilterSupplier = Suppliers.memoize(HoodieROTablePathFilter::new); + } + + /** + * Get pruned partitions, if applicable. 
+ */ + private Iterable getPrunedPartitions(Iterable partitions) + { + if (AcidUtils.isTransactionalTable(table.getParameters()) && + (queryType.map(t -> t == QueryType.VACUUM).orElse(false))) { + String vacuumPartition = (String) queryInfo.get("partition"); + if (vacuumPartition != null && !vacuumPartition.isEmpty()) { + List list = new ArrayList<>(); + for (Iterator it = partitions.iterator(); it.hasNext(); ) { + HivePartitionMetadata next = it.next(); + if (vacuumPartition.equals(next.getHivePartition().getPartitionId())) { + return ImmutableList.of(next); + } + } + } + } + return partitions; + } + + @Override + public void start(HiveSplitSource splitSource) + { + this.hiveSplitSource = splitSource; + for (int i = 0; i < loaderConcurrency; i++) { + ResumableTasks.submit(executor, new HiveSplitLoaderTask()); + } + } + + @Override + public void stop() + { + stopped = true; + } + + private class HiveSplitLoaderTask + implements ResumableTask + { + @Override + public TaskStatus process() + { + while (true) { + if (stopped) { + return TaskStatus.finished(); + } + ListenableFuture future; + taskExecutionLock.readLock().lock(); + try { + future = loadSplits(); + } + catch (Exception e) { + if (e instanceof IOException) { + e = new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, e); + } + else if (!(e instanceof PrestoException)) { + e = new PrestoException(HiveErrorCode.HIVE_UNKNOWN_ERROR, e); + } + // Fail the split source before releasing the execution lock + // Otherwise, a race could occur where the split source is completed before we fail it. + hiveSplitSource.fail(e); + checkState(stopped); + return TaskStatus.finished(); + } + // For TestBackgroundHiveSplitLoader.testPropagateException + catch (Error e) { + hiveSplitSource.fail(e); + return TaskStatus.finished(); + } + finally { + taskExecutionLock.readLock().unlock(); + } + invokeNoMoreSplitsIfNecessary(); + if (!future.isDone()) { + return TaskStatus.continueOn(future); + } + } + } + } + + private void invokeNoMoreSplitsIfNecessary() + { + taskExecutionLock.readLock().lock(); + try { + // This is an opportunistic check to avoid getting the write lock unnecessarily + if (!partitions.isEmpty() || !fileIterators.isEmpty()) { + return; + } + } + catch (Exception e) { + hiveSplitSource.fail(e); + checkState(stopped, "Task is not marked as stopped even though it failed"); + return; + } + finally { + taskExecutionLock.readLock().unlock(); + } + + taskExecutionLock.writeLock().lock(); + try { + // the write lock guarantees that no one is operating on the partitions, fileIterators, or hiveSplitSource, or half way through doing so. + if (partitions.isEmpty() && fileIterators.isEmpty()) { + // It is legal to call `noMoreSplits` multiple times or after `stop` was called. + // Nothing bad will happen if `noMoreSplits` implementation calls methods that will try to obtain a read lock because the lock is re-entrant. 
+ hiveSplitSource.noMoreSplits(); + } + } + catch (Exception e) { + hiveSplitSource.fail(e); + checkState(stopped, "Task is not marked as stopped even though it failed"); + } + finally { + taskExecutionLock.writeLock().unlock(); + } + } + + private ListenableFuture loadSplits() + throws IOException + { + Iterator splits = fileIterators.poll(); + if (splits == null) { + HivePartitionMetadata partition = partitions.poll(); + if (partition == null) { + return COMPLETED_FUTURE; + } + return loadPartition(partition); + } + + while (splits.hasNext() && !stopped) { + ListenableFuture future = hiveSplitSource.addToQueue(splits.next()); + if (!future.isDone()) { + fileIterators.addFirst(splits); + return future; + } + } + + // No need to put the iterator back, since it's either empty or we've stopped + return COMPLETED_FUTURE; + } + + private ListenableFuture loadPartition(HivePartitionMetadata partition) + throws IOException + { + HivePartition hivePartition = partition.getHivePartition(); + String partitionName = hivePartition.getPartitionId(); + Properties schema = getPartitionSchema(table, partition.getPartition()); + List partitionKeys = getPartitionKeys(table, partition.getPartition()); + TupleDomain effectivePredicate = (TupleDomain) compactEffectivePredicate; + + if (dynamicFilterSupplier != null && isDynamicFilteringSplitFilteringEnabled(session)) { + if (isPartitionFiltered(partitionKeys, dynamicFilterSupplier.get(), typeManager)) { + // Avoid listing files and creating splits from a partition if it has been pruned due to dynamic filters + return COMPLETED_FUTURE; + } + } + + Path path = new Path(getPartitionLocation(table, partition.getPartition())); + InputFormat inputFormat = getInputFormat(configuration, schema, false, jobConf); + FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path); + boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition()); + + if (inputFormat instanceof SymlinkTextInputFormat) { + if (tableBucketInfo.isPresent()) { + throw new PrestoException(NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported"); + } + + // TODO: This should use an iterator like the HiveFileIterator + ListenableFuture lastResult = COMPLETED_FUTURE; + for (Path targetPath : getTargetPathsFromSymlink(fs, path)) { + // The input should be in TextInputFormat. 
+ TextInputFormat targetInputFormat = new TextInputFormat(); + // the splits must be generated using the file system for the target path + // get the configuration for the target path -- it may be a different hdfs instance + FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath); + jobConf.setInputFormat(TextInputFormat.class); + targetInputFormat.configure(jobConf); + FileInputFormat.setInputPaths(jobConf, targetPath); + InputSplit[] targetSplits = targetInputFormat.getSplits(jobConf, 0); + + InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( + targetFilesystem, + partitionName, + inputFormat, + schema, + partitionKeys, + effectivePredicate, + partition.getColumnCoercions(), + Optional.empty(), + isForceLocalScheduling(session), + s3SelectPushdownEnabled); + lastResult = addSplitsToSource(targetSplits, splitFactory); + if (stopped) { + return COMPLETED_FUTURE; + } + } + return lastResult; + } + + Optional bucketConversion = Optional.empty(); + boolean bucketConversionRequiresWorkerParticipation = false; + if (partition.getPartition().isPresent()) { + Optional partitionBucketProperty = partition.getPartition().get().getStorage().getBucketProperty(); + if (tableBucketInfo.isPresent() && partitionBucketProperty.isPresent()) { + int readBucketCount = tableBucketInfo.get().getReadBucketCount(); + BucketingVersion bucketingVersion = partitionBucketProperty.get().getBucketingVersion(); // TODO can partition's bucketing_version be different from table's? + int partitionBucketCount = partitionBucketProperty.get().getBucketCount(); + // Validation was done in HiveSplitManager#getPartitionMetadata. + // Here, it's just trying to see if its needs the BucketConversion. + if (readBucketCount != partitionBucketCount) { + bucketConversion = Optional.of(new BucketConversion(bucketingVersion, readBucketCount, partitionBucketCount, tableBucketInfo.get().getBucketColumns())); + if (readBucketCount > partitionBucketCount) { + bucketConversionRequiresWorkerParticipation = true; + } + } + } + } + InternalHiveSplitFactory splitFactory = new InternalHiveSplitFactory( + fs, + partitionName, + inputFormat, + schema, + partitionKeys, + effectivePredicate, + partition.getColumnCoercions(), + bucketConversionRequiresWorkerParticipation ? bucketConversion : Optional.empty(), + isForceLocalScheduling(session), + s3SelectPushdownEnabled); + + // To support custom input formats, we want to call getSplits() + // on the input format to obtain file splits. + if (!isHudiParquetInputFormat(inputFormat) && shouldUseFileSplitsFromInputFormat(inputFormat)) { + if (tableBucketInfo.isPresent()) { + throw new PrestoException(NOT_SUPPORTED, "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName()); + } + + if (AcidUtils.isTransactionalTable(table.getParameters())) { + throw new PrestoException(NOT_SUPPORTED, "Hive transactional tables in an input format with UseFileSplitsFromInputFormat annotation are not supported: " + inputFormat.getClass().getSimpleName()); + } + + FileInputFormat.setInputPaths(jobConf, path); + InputSplit[] splits = inputFormat.getSplits(jobConf, 0); + + return addSplitsToSource(splits, splitFactory); + } + + PathFilter pathFilter = isHudiParquetInputFormat(inputFormat) ? hoodiePathFilterSupplier.get() : path1 -> true; + + // S3 Select pushdown works at the granularity of individual S3 objects, + // therefore we must not split files when it is enabled. 
+ boolean splittable = getHeaderCount(schema) == 0 && getFooterCount(schema) == 0 && !s3SelectPushdownEnabled; + + List readPaths; + Optional deleteDeltaLocations; + long min = Long.MAX_VALUE; + long max = Long.MIN_VALUE; + if (AcidUtils.isTransactionalTable(table.getParameters())) { + boolean isVacuum = queryType.map(type -> type == QueryType.VACUUM).orElse(false); + AcidUtils.Directory directory = hdfsEnvironment.doAs(hdfsContext.getIdentity().getUser(), () -> { + ValidWriteIdList writeIdList = validWriteIds.orElseThrow(() -> new IllegalStateException("No validWriteIds present")); + if (isVacuum) { + writeIdList = new ValidCompactorWriteIdList(writeIdList.writeToString()) { + @Override + public RangeResponse isWriteIdRangeValid(long minWriteId, long maxWriteId) + { + //For unknown reasons.. ValidCompactorWriteIdList#isWriteIdRangeValid() doesnot + // check for aborted transactions and AcidUtils.getAcidState() adds aborted transaction to both aborted and working lists. + //Avoid this by overriding. + RangeResponse writeIdRangeValid = super.isWriteIdRangeValid(minWriteId, maxWriteId); + if (writeIdRangeValid == RangeResponse.NONE) { + return RangeResponse.NONE; + } + else if (super.isWriteIdRangeAborted(minWriteId, maxWriteId) == RangeResponse.ALL) { + return RangeResponse.NONE; + } + return writeIdRangeValid; + } + }; + } + return AcidUtils.getAcidState( + path, + configuration, + writeIdList, + Ref.from(false), + true, + table.getParameters()); + }); + + if (AcidUtils.isFullAcidTable(table.getParameters())) { + // From Hive version >= 3.0, delta/base files will always have file '_orc_acid_version' with value >= '2'. + Path baseOrDeltaPath = directory.getBaseDirectory() != null + ? directory.getBaseDirectory() + : (directory.getCurrentDirectories().size() > 0 ? directory.getCurrentDirectories().get(0).getPath() : null); + + if (baseOrDeltaPath != null && AcidUtils.OrcAcidVersion.getAcidVersionFromMetaFile(baseOrDeltaPath, fs) < 2) { + throw new PrestoException(NOT_SUPPORTED, "Hive transactional tables are supported with Hive 3.0 and only after a major compaction has been run"); + } + } + + readPaths = new ArrayList<>(); + + boolean isFullVacuum = isVacuum ? Boolean.valueOf(queryInfo.get("FULL").toString()) : false; + + if (isFullVacuum) { + //Base will contain everything + min = 0; + } + // base + //In case of vacuum, include only in case of Full vacuum. + if (directory.getBaseDirectory() != null && (!isVacuum || isFullVacuum)) { + readPaths.add(directory.getBaseDirectory()); + if (isVacuum) { + min = 0; + max = AcidUtils.parseBase(directory.getBaseDirectory()); + } + } + + // delta directories + for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) { + if (!delta.isDeleteDelta()) { + readPaths.add(delta.getPath()); + } + //In case of Minor compaction, all delete_delta files should be compacted separately, + else if (isVacuum && !isFullVacuum) { + readPaths.add(delta.getPath()); + } + if (isVacuum) { + min = Math.min(delta.getMinWriteId(), min); + max = Math.max(delta.getMaxWriteId(), max); + } + } + + // Create a registry of delete_delta directories for the partition + DeleteDeltaLocations.Builder deleteDeltaLocationsBuilder = DeleteDeltaLocations.builder(path); + for (AcidUtils.ParsedDelta delta : directory.getCurrentDirectories()) { + //In case of minor compaction, delete_delta directories should not be used for masking. 
+ if (delta.isDeleteDelta() && (!isVacuum || isFullVacuum)) { + //For unknown reasons ParseDelta.getStatementId() returns 0, though parsed statement is -1; + //This creates issue while trying to locate the delete_delta directory. + //So parsing again. + OptionalInt statementId = getStatementId(delta.getPath().getName()); + int stmtId = statementId.orElse(0); + deleteDeltaLocationsBuilder.addDeleteDelta(delta.getPath(), delta.getMinWriteId(), delta.getMaxWriteId(), stmtId); + } + } + + deleteDeltaLocations = deleteDeltaLocationsBuilder.build(); + + if (!directory.getOriginalFiles().isEmpty()) { + LOG.info("Now supporting read from non-ACID files in ACID reader"); + // non-ACID file + int numberOfBuckets = Integer.parseInt(schema.getProperty("bucket_count")); + long[] bucketStartRowOffset = new long[Integer.max(numberOfBuckets, 1)]; + for (HadoopShims.HdfsFileStatusWithId f : directory.getOriginalFiles()) { + Path currFilePath = f.getFileStatus().getPath(); + int currBucketNumber = getBucketNumber(currFilePath.getName()).getAsInt(); + fileIterators.addLast(createInternalHiveSplitIterator(currFilePath, fs, splitFactory, splittable, deleteDeltaLocations, Optional.of(bucketStartRowOffset[currBucketNumber]), pathFilter)); + try { + Reader copyReader = OrcFile.createReader(f.getFileStatus().getPath(), + OrcFile.readerOptions(configuration)); + bucketStartRowOffset[currBucketNumber] += copyReader.getNumberOfRows(); + } + catch (Exception e) { + throw new PrestoException(NOT_SUPPORTED, e.getMessage()); + } + } + } + + if (isVacuum && !readPaths.isEmpty()) { + Object vacuumHandle = queryInfo.get("vacuumHandle"); + if (vacuumHandle != null && vacuumHandle instanceof HiveVacuumTableHandle) { + HiveVacuumTableHandle hiveVacuumTableHandle = (HiveVacuumTableHandle) vacuumHandle; + hiveVacuumTableHandle.addRange(partitionName, new Range(min, max)); + } + } + } + else { + readPaths = ImmutableList.of(path); + deleteDeltaLocations = Optional.empty(); + } + + // Bucketed partitions are fully loaded immediately since all files must be loaded to determine the file to bucket mapping + if (tableBucketInfo.isPresent()) { + ListenableFuture lastResult = immediateFuture(null); // TODO document in addToQueue() that it is sufficient to hold on to last returned future + for (Path readPath : readPaths) { + lastResult = hiveSplitSource.addToQueue(getBucketedSplits(readPath, fs, splitFactory, + tableBucketInfo.get(), bucketConversion, getDeleteDeltaLocationFor(readPath, deleteDeltaLocations), pathFilter)); + } + return lastResult; + } + + for (Path readPath : readPaths) { + fileIterators.addLast(createInternalHiveSplitIterator(readPath, fs, splitFactory, splittable, + getDeleteDeltaLocationFor(readPath, deleteDeltaLocations), Optional.empty(), pathFilter)); + } + + return COMPLETED_FUTURE; + } + + private Optional getDeleteDeltaLocationFor(Path readPath, Optional allDeleteDeltaLocations) + { + if (!allDeleteDeltaLocations.isPresent() || allDeleteDeltaLocations.get().getDeleteDeltas().isEmpty()) { + return allDeleteDeltaLocations; + } + /* + * Source delta/base files' record can be deleted in only delete_delta directories having greater writeId + * than source file's writeId. + * Therefore, skipping delta_directories which lesser/same writeId as source will avoid unnecessary + * reads and memory. + */ + Long sourceWriteId = AcidUtils.extractWriteId(readPath); + sourceWriteId = (sourceWriteId == null) ? 
0 : sourceWriteId; + if (sourceWriteId == 0) { + return allDeleteDeltaLocations; + } + long sId = sourceWriteId.longValue(); + DeleteDeltaLocations allLocations = allDeleteDeltaLocations.get(); + List filteredWriteIds = allLocations.getDeleteDeltas().stream() + .filter(writeIdInfo -> writeIdInfo.getMaxWriteId() > sId).collect(Collectors.toList()); + if (filteredWriteIds.isEmpty()) { + return Optional.empty(); + } + return Optional.of(new DeleteDeltaLocations(allLocations.getPartitionLocation(), filteredWriteIds)); + } + + private ListenableFuture addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory) + throws IOException + { + ListenableFuture lastResult = COMPLETED_FUTURE; + for (InputSplit inputSplit : targetSplits) { + Optional internalHiveSplit = splitFactory.createInternalHiveSplit((FileSplit) inputSplit); + if (internalHiveSplit.isPresent()) { + lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get()); + } + if (stopped) { + return COMPLETED_FUTURE; + } + } + return lastResult; + } + + private static boolean isHudiParquetInputFormat(InputFormat inputFormat) + { + if (inputFormat instanceof HoodieParquetRealtimeInputFormat) { + return false; + } + return inputFormat instanceof HoodieParquetInputFormat; + } + + private static boolean shouldUseFileSplitsFromInputFormat(InputFormat inputFormat) + { + return Arrays.stream(inputFormat.getClass().getAnnotations()) + .map(Annotation::annotationType) + .map(Class::getSimpleName) + .anyMatch(name -> name.equals("UseFileSplitsFromInputFormat")); + } + + private Iterator createInternalHiveSplitIterator(Path path, FileSystem fileSystem, InternalHiveSplitFactory splitFactory, boolean splittable, Optional deleteDeltaLocations, Optional startRowOffsetOfFile, PathFilter pathFilter) + { + return Streams.stream(new HiveFileIterator(table, path, fileSystem, directoryLister, namenodeStats, recursiveDirWalkerEnabled ? RECURSE : IGNORED, pathFilter)) + .map(status -> splitFactory.createInternalHiveSplit(status, splittable, deleteDeltaLocations, startRowOffsetOfFile)) + .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + } + + private List getBucketedSplits(Path path, FileSystem fileSystem, InternalHiveSplitFactory splitFactory, BucketSplitInfo bucketSplitInfo, Optional bucketConversion, Optional deleteDeltaLocations, PathFilter pathFilter) + { + int readBucketCount = bucketSplitInfo.getReadBucketCount(); + int tableBucketCount = bucketSplitInfo.getTableBucketCount(); + int partitionBucketCount = bucketConversion.map(BucketConversion::getPartitionBucketCount).orElse(tableBucketCount); + int bucketCount = max(readBucketCount, partitionBucketCount); + + // list all files in the partition + List files = new ArrayList<>(partitionBucketCount); + try { + Iterators.addAll(files, new HiveFileIterator(table, path, fileSystem, directoryLister, namenodeStats, FAIL, pathFilter)); + } + catch (NestedDirectoryNotAllowedException e) { + // Fail here to be on the safe side. This seems to be the same as what Hive does + throw new PrestoException( + HiveErrorCode.HIVE_INVALID_BUCKET_FILES, + format("Hive table '%s' is corrupt. 
Found sub-directory in bucket directory for partition: %s", + table.getSchemaTableName(), + splitFactory.getPartitionName())); + } + + // build mapping of file name to bucket + ListMultimap bucketFiles = ArrayListMultimap.create(); + for (LocatedFileStatus file : files) { + String fileName = file.getPath().getName(); + OptionalInt bucket = getBucketNumber(fileName); + if (bucket.isPresent()) { + bucketFiles.put(bucket.getAsInt(), file); + continue; + } + + // legacy mode requires exactly one file per bucket + if (files.size() != partitionBucketCount) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_BUCKET_FILES, format( + "Hive table '%s' is corrupt. File '%s' does not match the standard naming pattern, and the number " + + "of files in the directory (%s) does not match the declared bucket count (%s) for partition: %s", + table.getSchemaTableName(), + fileName, + files.size(), + partitionBucketCount, + splitFactory.getPartitionName())); + } + + // sort FileStatus objects per `org.apache.hadoop.hive.ql.metadata.Table#getSortedPaths()` + files.sort(null); + + // use position in sorted list as the bucket number + bucketFiles.clear(); + for (int i = 0; i < files.size(); i++) { + bucketFiles.put(i, files.get(i)); + } + break; + } + + // convert files internal splits + List splitList = new ArrayList<>(); + for (int bucketNumber = 0; bucketNumber < bucketCount; bucketNumber++) { + // Physical bucket #. This determine file name. It also determines the order of splits in the result. + int partitionBucketNumber = bucketNumber % partitionBucketCount; + // Logical bucket #. Each logical bucket corresponds to a "bucket" from engine's perspective. + int readBucketNumber = bucketNumber % readBucketCount; + + boolean containsEligibleTableBucket = false; + boolean containsIneligibleTableBucket = false; + for (int tableBucketNumber = bucketNumber % tableBucketCount; tableBucketNumber < tableBucketCount; tableBucketNumber += bucketCount) { + // table bucket number: this is used for evaluating "$bucket" filters. + if (bucketSplitInfo.isTableBucketEnabled(tableBucketNumber)) { + containsEligibleTableBucket = true; + } + else { + containsIneligibleTableBucket = true; + } + } + + if (containsEligibleTableBucket && containsIneligibleTableBucket) { + throw new PrestoException( + NOT_SUPPORTED, + "The bucket filter cannot be satisfied. There are restrictions on the bucket filter when all the following is true: " + + "1. a table has a different buckets count as at least one of its partitions that is read in this query; " + + "2. the table has a different but compatible bucket number with another table in the query; " + + "3. some buckets of the table is filtered out from the query, most likely using a filter on \"$bucket\". " + + "(table name: " + table.getTableName() + ", table bucket count: " + tableBucketCount + ", " + + "partition bucket count: " + partitionBucketCount + ", effective reading bucket count: " + readBucketCount + ")"); + } + if (containsEligibleTableBucket) { + for (LocatedFileStatus file : bucketFiles.get(partitionBucketNumber)) { + // OrcDeletedRows will load only delete delta files matching current bucket (same file name), + // so we can pass all delete delta locations here, without filtering. 
+ splitFactory.createInternalHiveSplit(file, readBucketNumber, deleteDeltaLocations) + .ifPresent(splitList::add); + } + } + } + return splitList; + } + + static OptionalInt getStatementId(String deleteDeltaFileName) + { + Matcher matcher = DELETE_DELTA_PATTERN.matcher(deleteDeltaFileName); + if (matcher.matches()) { + String statementId = matcher.group(3); + if (statementId == null) { + return OptionalInt.of(-1); + } + return OptionalInt.of(Integer.valueOf(statementId.substring(1))); + } + return OptionalInt.empty(); + } + + private static List getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir) + { + try { + FileStatus[] symlinks = fileSystem.listStatus(symlinkDir, HIDDEN_FILES_PATH_FILTER); + List targets = new ArrayList<>(); + + for (FileStatus symlink : symlinks) { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(fileSystem.open(symlink.getPath()), StandardCharsets.UTF_8))) { + CharStreams.readLines(reader).stream() + .map(Path::new) + .forEach(targets::add); + } + } + return targets; + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_BAD_DATA, "Error parsing symlinks from: " + symlinkDir, e); + } + } + + private static List getPartitionKeys(Table table, Optional partition) + { + if (!partition.isPresent()) { + return ImmutableList.of(); + } + ImmutableList.Builder partitionKeys = ImmutableList.builder(); + List keys = table.getPartitionColumns(); + List values = partition.get().getValues(); + checkCondition(keys.size() == values.size(), HiveErrorCode.HIVE_INVALID_METADATA, "Expected %s partition key values, but got %s", keys.size(), values.size()); + for (int i = 0; i < keys.size(); i++) { + String name = keys.get(i).getName(); + HiveType hiveType = keys.get(i).getType(); + if (!hiveType.isSupportedType()) { + throw new PrestoException(NOT_SUPPORTED, format("Unsupported Hive type %s found in partition keys of table %s.%s", hiveType, table.getDatabaseName(), table.getTableName())); + } + String value = values.get(i); + checkCondition(value != null, HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, "partition key value cannot be null for field: %s", name); + partitionKeys.add(new HivePartitionKey(name, value)); + } + return partitionKeys.build(); + } + + private static Properties getPartitionSchema(Table table, Optional partition) + { + if (!partition.isPresent()) { + return getHiveSchema(table); + } + return getHiveSchema(partition.get(), table); + } + + public Table getTable() + { + return table; + } + + public static class BucketSplitInfo + { + private final List bucketColumns; + private final int tableBucketCount; + private final int readBucketCount; + private final IntPredicate bucketFilter; + + public static Optional createBucketSplitInfo(Optional bucketHandle, Optional bucketFilter) + { + requireNonNull(bucketHandle, "bucketHandle is null"); + requireNonNull(bucketFilter, "buckets is null"); + + if (!bucketHandle.isPresent()) { + checkArgument(!bucketFilter.isPresent(), "bucketHandle must be present if bucketFilter is present"); + return Optional.empty(); + } + + int tableBucketCount = bucketHandle.get().getTableBucketCount(); + int readBucketCount = bucketHandle.get().getReadBucketCount(); + + if (tableBucketCount != readBucketCount && bucketFilter.isPresent()) { + // TODO: remove when supported + throw new PrestoException(NOT_SUPPORTED, "Filter on \"$bucket\" is not supported when the table has partitions with different bucket counts"); + } + + List bucketColumns = bucketHandle.get().getColumns(); + IntPredicate 
predicate = bucketFilter + .map(filter -> filter.getBucketsToKeep()::contains) + .orElse(bucket -> true); + return Optional.of(new BucketSplitInfo(bucketColumns, tableBucketCount, readBucketCount, predicate)); + } + + private BucketSplitInfo(List bucketColumns, int tableBucketCount, int readBucketCount, IntPredicate bucketFilter) + { + this.bucketColumns = ImmutableList.copyOf(requireNonNull(bucketColumns, "bucketColumns is null")); + this.tableBucketCount = tableBucketCount; + this.readBucketCount = readBucketCount; + this.bucketFilter = requireNonNull(bucketFilter, "bucketFilter is null"); + } + + public List getBucketColumns() + { + return bucketColumns; + } + + public int getTableBucketCount() + { + return tableBucketCount; + } + + public int getReadBucketCount() + { + return readBucketCount; + } + + /** + * Evaluates whether the provided table bucket number passes the bucket predicate. + * A bucket predicate can be present in two cases: + *
+     * <ul>
+     * <li>Filter on "$bucket" column. e.g. {@code "$bucket" between 0 and 100}</li>
+     * <li>Single-value equality filter on all bucket columns. e.g. for a table with two bucketing columns,
+     *     {@code bucketCol1 = 'a' AND bucketCol2 = 123}</li>
+     * </ul>
+ */ + public boolean isTableBucketEnabled(int tableBucketNumber) + { + return bucketFilter.test(tableBucketNumber); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BaseStorageFormat.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BaseStorageFormat.java new file mode 100644 index 00000000..a45778c0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/BaseStorageFormat.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive; + +public interface BaseStorageFormat +{ + String getSerDe(); + + String getInputFormat(); + + String getOutputFormat(); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CachingDirectoryLister.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CachingDirectoryLister.java new file mode 100644 index 00000000..3bad0400 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CachingDirectoryLister.java @@ -0,0 +1,172 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.Weigher; +import com.google.common.collect.ImmutableList; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.SchemaTableName; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.weakref.jmx.Managed; + +import javax.inject.Inject; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; + +public class CachingDirectoryLister + implements DirectoryLister +{ + private final Cache> cache; + private final Set tableNames; + + @Inject + public CachingDirectoryLister(HiveConfig hiveClientConfig) + { + this(hiveClientConfig.getFileStatusCacheExpireAfterWrite(), hiveClientConfig.getFileStatusCacheMaxSize(), hiveClientConfig.getFileStatusCacheTables()); + } + + public CachingDirectoryLister(Duration expireAfterWrite, long maxSize, List tables) + { + this.cache = CacheBuilder.newBuilder() + .maximumWeight(maxSize) + .weigher((Weigher>) (key, value) -> value.size()) + .expireAfterWrite(expireAfterWrite.toMillis(), TimeUnit.MILLISECONDS) + .recordStats() + .build(); + this.tableNames = tables.stream() + .map(CachingDirectoryLister::parseTableName) + .collect(Collectors.toSet()); + } + + private static SchemaTableName parseTableName(String tableName) + { + String[] parts = tableName.split("\\."); + checkArgument(parts.length == 2, "Invalid schemaTableName: %s", tableName); + return new SchemaTableName(parts[0], parts[1]); + } + + @Override + public RemoteIterator list(FileSystem fs, Table table, Path path) + throws IOException + { + List files = cache.getIfPresent(path); + if (files != null) { + return simpleRemoteIterator(files); + } + RemoteIterator iterator = fs.listLocatedStatus(path); + + if (!tableNames.contains(table.getSchemaTableName())) { + return iterator; + } + return cachingRemoteIterator(iterator, path); + } + + private RemoteIterator cachingRemoteIterator(RemoteIterator iterator, Path path) + { + return new RemoteIterator() + { + private final List files = new ArrayList<>(); + + @Override + public boolean hasNext() + throws IOException + { + boolean hasNext = iterator.hasNext(); + if (!hasNext) { + cache.put(path, ImmutableList.copyOf(files)); + } + return hasNext; + } + + @Override + public LocatedFileStatus next() + throws IOException + { + LocatedFileStatus next = iterator.next(); + files.add(next); + return next; + } + }; + } + + private static RemoteIterator simpleRemoteIterator(List files) + { + return new RemoteIterator() + { + private final Iterator iterator = ImmutableList.copyOf(files).iterator(); + + @Override + public boolean hasNext() + { + return iterator.hasNext(); + } + + @Override + public LocatedFileStatus next() + { + return iterator.next(); + } + }; + } + + @Managed + public void flushCache() + { + cache.invalidateAll(); + } + + @Managed + public Double getHitRate() + { + return cache.stats().hitRate(); + } + + @Managed + public Double getMissRate() + { + return cache.stats().missRate(); + } + + @Managed + public long getHitCount() + { + return cache.stats().hitCount(); + } + + @Managed + public long 
getMissCount() + { + return cache.stats().missCount(); + } + + @Managed + public long getRequestCount() + { + return cache.stats().requestCount(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CoercionPolicy.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CoercionPolicy.java new file mode 100644 index 00000000..1222a26c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CoercionPolicy.java @@ -0,0 +1,19 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +public interface CoercionPolicy +{ + boolean canCoerce(HiveType fromType, HiveType toType); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConcurrentLazyQueue.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConcurrentLazyQueue.java new file mode 100644 index 00000000..a6d2443e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConcurrentLazyQueue.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import javax.annotation.concurrent.GuardedBy; + +import java.util.Iterator; + +public class ConcurrentLazyQueue +{ + @GuardedBy("this") + private final Iterator iterator; + + public ConcurrentLazyQueue(Iterable iterable) + { + this.iterator = iterable.iterator(); + } + + public synchronized boolean isEmpty() + { + return !iterator.hasNext(); + } + + public synchronized E poll() + { + if (!iterator.hasNext()) { + return null; + } + return iterator.next(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConnectorObjectNameGeneratorModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConnectorObjectNameGeneratorModule.java new file mode 100644 index 00000000..f4701c45 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ConnectorObjectNameGeneratorModule.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Provides; +import io.airlift.configuration.Config; +import org.weakref.jmx.ObjectNameBuilder; +import org.weakref.jmx.ObjectNameGenerator; + +import java.util.Map; + +import static com.google.common.base.MoreObjects.firstNonNull; +import static io.airlift.configuration.ConfigBinder.configBinder; +import static java.util.Objects.requireNonNull; + +// Note: There are multiple copies of this class in the codebase. If you change one you, should change them all. +public class ConnectorObjectNameGeneratorModule + implements Module +{ + private static final String CONNECTOR_PACKAGE_NAME = "io.prestosql.plugin.hive"; + private static final String DEFAULT_DOMAIN_BASE = "presto.plugin.hive"; + + private final String catalogName; + + public ConnectorObjectNameGeneratorModule(String catalogName) + { + this.catalogName = requireNonNull(catalogName, "catalogName is null"); + } + + @Override + public void configure(Binder binder) + { + configBinder(binder).bindConfig(ConnectorObjectNameGeneratorConfig.class); + } + + @Provides + ObjectNameGenerator createPrefixObjectNameGenerator(ConnectorObjectNameGeneratorConfig config) + { + String domainBase = firstNonNull(config.getDomainBase(), DEFAULT_DOMAIN_BASE); + return new ConnectorObjectNameGenerator(domainBase, catalogName); + } + + public static class ConnectorObjectNameGeneratorConfig + { + private String domainBase; + + public String getDomainBase() + { + return domainBase; + } + + @Config("jmx.base-name") + public ConnectorObjectNameGeneratorConfig setDomainBase(String domainBase) + { + this.domainBase = domainBase; + return this; + } + } + + public static final class ConnectorObjectNameGenerator + implements ObjectNameGenerator + { + private final String domainBase; + private final String catalogName; + + public ConnectorObjectNameGenerator(String domainBase, String catalogName) + { + this.domainBase = domainBase; + this.catalogName = catalogName; + } + + @Override + public String generatedNameOf(Class type) + { + return new ObjectNameBuilder(toDomain(type)) + .withProperties(ImmutableMap.builder() + .put("type", type.getSimpleName()) + .put("name", catalogName) + .build()) + .build(); + } + + @Override + public String generatedNameOf(Class type, Map properties) + { + return new ObjectNameBuilder(toDomain(type)) + .withProperties(ImmutableMap.builder() + .putAll(properties) + .put("catalog", catalogName) + .build()) + .build(); + } + + private String toDomain(Class type) + { + String domain = type.getPackage().getName(); + if (domain.startsWith(CONNECTOR_PACKAGE_NAME)) { + domain = domainBase + domain.substring(CONNECTOR_PACKAGE_NAME.length()); + } + return domain; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CreateEmptyPartitionProcedure.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CreateEmptyPartitionProcedure.java new file mode 100644 
index 00000000..a308e255 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/CreateEmptyPartitionProcedure.java @@ -0,0 +1,142 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.airlift.json.JsonCodec; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.prestosql.plugin.hive.LocationService.WriteInfo; +import io.prestosql.plugin.hive.PartitionUpdate.UpdateMode; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.classloader.ThreadContextClassLoader; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.procedure.Procedure; +import io.prestosql.spi.procedure.Procedure.Argument; +import org.apache.hadoop.hive.common.FileUtils; + +import javax.inject.Inject; +import javax.inject.Provider; + +import java.lang.invoke.MethodHandle; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Supplier; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static io.prestosql.spi.StandardErrorCode.INVALID_PROCEDURE_ARGUMENT; +import static io.prestosql.spi.block.MethodHandleUtil.methodHandle; +import static io.prestosql.spi.type.StandardTypes.VARCHAR; +import static java.util.Objects.requireNonNull; + +public class CreateEmptyPartitionProcedure + implements Provider +{ + private static final MethodHandle CREATE_EMPTY_PARTITION = methodHandle( + CreateEmptyPartitionProcedure.class, + "createEmptyPartition", + ConnectorSession.class, + String.class, + String.class, + List.class, + List.class); + + private final Supplier hiveMetadataFactory; + private final HiveMetastore metastore; + private final LocationService locationService; + private final JsonCodec partitionUpdateJsonCodec; + + @Inject + public CreateEmptyPartitionProcedure(Supplier hiveMetadataFactory, HiveMetastore metastore, LocationService locationService, JsonCodec partitionUpdateCodec) + { + this.hiveMetadataFactory = requireNonNull(hiveMetadataFactory, "hiveMetadataFactory is null"); + this.metastore = requireNonNull(metastore, "metastore is null"); + this.locationService = requireNonNull(locationService, "locationService is null"); + this.partitionUpdateJsonCodec = requireNonNull(partitionUpdateCodec, "partitionUpdateCodec is null"); + } + + @Override + public Procedure get() + { + return new Procedure( + "system", + "create_empty_partition", + ImmutableList.of( + new Argument("schema_name", VARCHAR), + new Argument("table_name", VARCHAR), + new Argument("partition_columns", "array(varchar)"), + new Argument("partition_values", "array(varchar)")), + 
CREATE_EMPTY_PARTITION.bindTo(this)); + } + + public void createEmptyPartition(ConnectorSession session, String schema, String table, List partitionColumnNames, List partitionValues) + { + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) { + doCreateEmptyPartition(session, schema, table, partitionColumnNames, partitionValues); + } + } + + private void doCreateEmptyPartition(ConnectorSession session, String schema, String table, List partitionColumnNames, List partitionValues) + { + TransactionalMetadata hiveMetadata = hiveMetadataFactory.get(); + + ConnectorTableHandle tableHandle = hiveMetadata.getTableHandle(session, new SchemaTableName(schema, table)); + hiveMetadata.beginQuery(session); + HiveInsertTableHandle hiveInsertTableHandle = (HiveInsertTableHandle) hiveMetadata.beginInsert(session, tableHandle); + + List actualPartitionColumnNames = hiveInsertTableHandle.getInputColumns().stream() + .filter(HiveColumnHandle::isPartitionKey) + .map(HiveColumnHandle::getName) + .collect(toImmutableList()); + if (!Objects.equals(partitionColumnNames, actualPartitionColumnNames)) { + throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "input partition column names doesn't match actual partition column names"); + } + + List partitionStringValues = partitionValues.stream() + .map(String.class::cast) + .collect(toImmutableList()); + + if (metastore.getPartition(new HiveIdentity(session), schema, table, partitionStringValues).isPresent()) { + throw new PrestoException(ALREADY_EXISTS, "Partition already exists"); + } + String partitionName = FileUtils.makePartName(actualPartitionColumnNames, partitionStringValues); + + WriteInfo writeInfo = locationService.getPartitionWriteInfo(hiveInsertTableHandle.getLocationHandle(), Optional.empty(), partitionName); + Slice serializedPartitionUpdate = Slices.wrappedBuffer( + partitionUpdateJsonCodec.toJsonBytes( + new PartitionUpdate( + partitionName, + UpdateMode.NEW, + writeInfo.getWritePath(), + writeInfo.getTargetPath(), + ImmutableList.of(), + 0, + 0, + 0, + ImmutableList.of()))); + + hiveMetadata.finishInsert( + session, + hiveInsertTableHandle, + ImmutableList.of(serializedPartitionUpdate), + ImmutableList.of()); + hiveMetadata.commit(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DeleteDeltaLocations.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DeleteDeltaLocations.java new file mode 100644 index 00000000..46eccad5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DeleteDeltaLocations.java @@ -0,0 +1,128 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import org.apache.hadoop.fs.Path; + +import java.util.List; +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +/** + * Stores information about ACID DELETE_DELTA for a Partition + */ +public class DeleteDeltaLocations +{ + private final String partitionLocation; + private final List deleteDeltas; + + @JsonCreator + public DeleteDeltaLocations( + @JsonProperty("partitionLocation") String partitionLocation, + @JsonProperty("deleteDeltas") List deleteDeltas) + { + this.partitionLocation = requireNonNull(partitionLocation, "partitionLocation is null"); + this.deleteDeltas = ImmutableList.copyOf(requireNonNull(deleteDeltas, "deleteDeltas is null")); + checkArgument(!deleteDeltas.isEmpty(), "deleteDeltas is empty"); + } + + @JsonProperty + public String getPartitionLocation() + { + return partitionLocation; + } + + @JsonProperty + public List getDeleteDeltas() + { + return deleteDeltas; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + DeleteDeltaLocations that = (DeleteDeltaLocations) o; + return partitionLocation.equals(that.partitionLocation) && + deleteDeltas.equals(that.deleteDeltas); + } + + @Override + public int hashCode() + { + return Objects.hash(partitionLocation, deleteDeltas); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("partitionLocation", partitionLocation) + .add("deleteDeltas", deleteDeltas) + .toString(); + } + + public static Builder builder(Path partitionPath) + { + return new Builder(partitionPath); + } + + public static class Builder + { + private final Path partitionLocation; + private final ImmutableList.Builder deleteDeltaInfoBuilder = ImmutableList.builder(); + + private Builder(Path partitionPath) + { + partitionLocation = requireNonNull(partitionPath, "partitionPath is null"); + } + + public Builder addDeleteDelta(Path deleteDeltaPath, long minWriteId, long maxWriteId, int statementId) + { + requireNonNull(deleteDeltaPath, "deleteDeltaPath is null"); + Path partitionPathFromDeleteDelta = deleteDeltaPath.getParent(); + checkArgument( + partitionLocation.equals(partitionPathFromDeleteDelta), + "Partition location in DeleteDelta '%s' does not match stored location '%s'", + deleteDeltaPath.getParent().toString(), + partitionLocation); + + deleteDeltaInfoBuilder.add(new WriteIdInfo(minWriteId, maxWriteId, statementId)); + return this; + } + + public Optional build() + { + List deleteDeltas = deleteDeltaInfoBuilder.build(); + if (deleteDeltas.isEmpty()) { + return Optional.empty(); + } + return Optional.of(new DeleteDeltaLocations(partitionLocation.toString(), deleteDeltas)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DirectoryLister.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DirectoryLister.java new file mode 100644 index 00000000..0003fced --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DirectoryLister.java @@ -0,0 +1,28 @@ +/* + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.metastore.Table; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; + +import java.io.IOException; + +public interface DirectoryLister +{ + RemoteIterator list(FileSystem fs, Table table, Path path) + throws IOException; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DynamicConfigurationProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DynamicConfigurationProvider.java new file mode 100644 index 00000000..fdf788b7 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/DynamicConfigurationProvider.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import org.apache.hadoop.conf.Configuration; + +import java.net.URI; + +import static io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import static org.apache.hadoop.fs.PrestoFileSystemCache.CACHE_KEY; + +public interface DynamicConfigurationProvider +{ + void updateConfiguration(Configuration configuration, HdfsContext context, URI uri); + + /** + * Set a cache key to invalidate the file system on credential (or other configuration) change. + */ + static void setCacheKey(Configuration configuration, String value) + { + configuration.set(CACHE_KEY, configuration.get(CACHE_KEY, "") + "|" + value); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/FileFormatDataSourceStats.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/FileFormatDataSourceStats.java new file mode 100644 index 00000000..6871759e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/FileFormatDataSourceStats.java @@ -0,0 +1,96 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.airlift.stats.DistributionStat; +import io.airlift.stats.TimeStat; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.NANOSECONDS; + +public class FileFormatDataSourceStats +{ + private final DistributionStat readBytes = new DistributionStat(); + private final DistributionStat maxCombinedBytesPerRow = new DistributionStat(); + private final TimeStat time0Bto100KB = new TimeStat(MILLISECONDS); + private final TimeStat time100KBto1MB = new TimeStat(MILLISECONDS); + private final TimeStat time1MBto10MB = new TimeStat(MILLISECONDS); + private final TimeStat time10MBPlus = new TimeStat(MILLISECONDS); + + @Managed + @Nested + public DistributionStat getReadBytes() + { + return readBytes; + } + + @Managed + @Nested + public DistributionStat getMaxCombinedBytesPerRow() + { + return maxCombinedBytesPerRow; + } + + @Managed + @Nested + public TimeStat get0Bto100KB() + { + return time0Bto100KB; + } + + @Managed + @Nested + public TimeStat get100KBto1MB() + { + return time100KBto1MB; + } + + @Managed + @Nested + public TimeStat get1MBto10MB() + { + return time1MBto10MB; + } + + @Managed + @Nested + public TimeStat get10MBPlus() + { + return time10MBPlus; + } + + public void readDataBytesPerSecond(long bytes, long nanos) + { + readBytes.add(bytes); + if (bytes < 100 * 1024) { + time0Bto100KB.add(nanos, NANOSECONDS); + } + else if (bytes < 1024 * 1024) { + time100KBto1MB.add(nanos, NANOSECONDS); + } + else if (bytes < 10 * 1024 * 1024) { + time1MBto10MB.add(nanos, NANOSECONDS); + } + else { + time10MBPlus.add(nanos, NANOSECONDS); + } + } + + public void addMaxCombinedBytesPerRow(long bytes) + { + maxCombinedBytesPerRow.add(bytes); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastore.java new file mode 100644 index 00000000..5b4285cc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastore.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForCachingHiveMetastore +{ +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastoreTableRefresh.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastoreTableRefresh.java new file mode 100644 index 00000000..b43bd0f1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForCachingHiveMetastoreTableRefresh.java @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForCachingHiveMetastoreTableRefresh +{ +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHdfs.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHdfs.java new file mode 100644 index 00000000..c22da3bd --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHdfs.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForHdfs +{ +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHive.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHive.java new file mode 100644 index 00000000..1b0dc45d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHive.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForHive {} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveMetastore.java new file mode 100644 index 00000000..d8c918dd --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveMetastore.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForHiveMetastore +{ +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveTransactionHeartbeats.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveTransactionHeartbeats.java new file mode 100644 index 00000000..ed4bd856 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveTransactionHeartbeats.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForHiveTransactionHeartbeats {} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveVacuum.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveVacuum.java new file mode 100644 index 00000000..f23c187a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForHiveVacuum.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForHiveVacuum {} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForRecordingHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForRecordingHiveMetastore.java new file mode 100644 index 00000000..3971ab5a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ForRecordingHiveMetastore.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import javax.inject.Qualifier; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@Qualifier +public @interface ForRecordingHiveMetastore +{ +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursor.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursor.java new file mode 100644 index 00000000..c78c5d3b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursor.java @@ -0,0 +1,518 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.prestosql.hadoop.TextLineLengthLimitExceededException; +import io.prestosql.plugin.hive.util.SerDeUtils; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Decimals; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.BinaryComparable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.RecordReader; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.math.BigInteger; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static io.prestosql.plugin.hive.HiveUtil.closeWithSuppression; +import static io.prestosql.plugin.hive.HiveUtil.getDeserializer; +import static io.prestosql.plugin.hive.HiveUtil.getTableObjectInspector; +import static io.prestosql.plugin.hive.HiveUtil.isStructuralType; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.Chars.isCharType; +import static io.prestosql.spi.type.Chars.truncateToLengthAndTrimSpaces; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.Decimals.rescale; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static io.prestosql.spi.type.Varchars.truncateToLength; +import static java.lang.Float.floatToRawIntBits; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +class GenericHiveRecordCursor + implements RecordCursor +{ + private final Path path; + private final RecordReader recordReader; + private final K key; + private final V value; + + private final Deserializer deserializer; + + private final Type[] types; + private final HiveType[] hiveTypes; + + private final StructObjectInspector rowInspector; + private final ObjectInspector[] 
fieldInspectors; + private final StructField[] structFields; + + private final boolean[] loaded; + private final boolean[] booleans; + private final long[] longs; + private final double[] doubles; + private final Slice[] slices; + private final Object[] objects; + private final boolean[] nulls; + + private final long totalBytes; + + private long completedBytes; + private Object rowData; + private boolean closed; + + public GenericHiveRecordCursor( + Configuration configuration, + Path path, + RecordReader recordReader, + long totalBytes, + Properties splitSchema, + List columns, + TypeManager typeManager) + { + requireNonNull(path, "path is null"); + requireNonNull(recordReader, "recordReader is null"); + checkArgument(totalBytes >= 0, "totalBytes is negative"); + requireNonNull(splitSchema, "splitSchema is null"); + requireNonNull(columns, "columns is null"); + + this.path = path; + this.recordReader = recordReader; + this.totalBytes = totalBytes; + this.key = recordReader.createKey(); + this.value = recordReader.createValue(); + + this.deserializer = getDeserializer(configuration, splitSchema); + this.rowInspector = getTableObjectInspector(deserializer); + + int size = columns.size(); + + this.types = new Type[size]; + this.hiveTypes = new HiveType[size]; + + this.structFields = new StructField[size]; + this.fieldInspectors = new ObjectInspector[size]; + + this.loaded = new boolean[size]; + this.booleans = new boolean[size]; + this.longs = new long[size]; + this.doubles = new double[size]; + this.slices = new Slice[size]; + this.objects = new Object[size]; + this.nulls = new boolean[size]; + + // initialize data columns + for (int i = 0; i < columns.size(); i++) { + HiveColumnHandle column = columns.get(i); + checkState(column.getColumnType() == HiveColumnHandle.ColumnType.REGULAR, "column type must be regular"); + + types[i] = typeManager.getType(column.getTypeSignature()); + hiveTypes[i] = column.getHiveType(); + + StructField field = rowInspector.getStructFieldRef(column.getName()); + structFields[i] = field; + fieldInspectors[i] = field.getFieldObjectInspector(); + } + } + + @Override + public long getCompletedBytes() + { + if (!closed) { + updateCompletedBytes(); + } + return completedBytes; + } + + @Override + public long getReadTimeNanos() + { + return 0; + } + + private void updateCompletedBytes() + { + try { + long newCompletedBytes = (long) (totalBytes * recordReader.getProgress()); + completedBytes = min(totalBytes, max(completedBytes, newCompletedBytes)); + } + catch (IOException ignored) { + } + } + + @Override + public Type getType(int field) + { + return types[field]; + } + + @Override + public boolean advanceNextPosition() + { + try { + if (closed || !recordReader.next(key, value)) { + close(); + return false; + } + + // reset loaded flags + Arrays.fill(loaded, false); + + // decode value + rowData = deserializer.deserialize(value); + + return true; + } + catch (IOException | SerDeException | RuntimeException e) { + closeWithSuppression(this, e); + if (e instanceof TextLineLengthLimitExceededException) { + throw new PrestoException(HiveErrorCode.HIVE_BAD_DATA, "Line too long in text file: " + path, e); + } + throw new PrestoException(HiveErrorCode.HIVE_CURSOR_ERROR, e); + } + } + + @Override + public boolean getBoolean(int fieldId) + { + checkState(!closed, "Cursor is closed"); + + validateType(fieldId, boolean.class); + if (!loaded[fieldId]) { + parseBooleanColumn(fieldId); + } + return booleans[fieldId]; + } + + private void parseBooleanColumn(int column) + { + 
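+ // Each parseXxxColumn method below follows the same lazy-decoding shape: mark the
+ // column as loaded first, then read the struct field from the current row and either
+ // record a null or copy the decoded value into the primitive array for that column index.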
loaded[column] = true; + + Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); + + if (fieldData == null) { + nulls[column] = true; + } + else { + Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); + checkState(fieldValue != null, "fieldValue should not be null"); + booleans[column] = (Boolean) fieldValue; + nulls[column] = false; + } + } + + @Override + public long getLong(int fieldId) + { + checkState(!closed, "Cursor is closed"); + + validateType(fieldId, long.class); + if (!loaded[fieldId]) { + parseLongColumn(fieldId); + } + return longs[fieldId]; + } + + private void parseLongColumn(int column) + { + loaded[column] = true; + + Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); + + if (fieldData == null) { + nulls[column] = true; + } + else { + Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); + checkState(fieldValue != null, "fieldValue should not be null"); + longs[column] = getLongExpressedValue(fieldValue); + nulls[column] = false; + } + } + + private long getLongExpressedValue(Object value) + { + if (value instanceof Date) { + return ((Date) value).toEpochDay(); + } + if (value instanceof Timestamp) { + return ((Timestamp) value).toEpochMilli(); + } + if (value instanceof Float) { + return floatToRawIntBits(((Float) value)); + } + return ((Number) value).longValue(); + } + + @Override + public double getDouble(int fieldId) + { + checkState(!closed, "Cursor is closed"); + + validateType(fieldId, double.class); + if (!loaded[fieldId]) { + parseDoubleColumn(fieldId); + } + return doubles[fieldId]; + } + + private void parseDoubleColumn(int column) + { + loaded[column] = true; + + Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); + + if (fieldData == null) { + nulls[column] = true; + } + else { + Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); + checkState(fieldValue != null, "fieldValue should not be null"); + doubles[column] = ((Number) fieldValue).doubleValue(); + nulls[column] = false; + } + } + + @Override + public Slice getSlice(int fieldId) + { + checkState(!closed, "Cursor is closed"); + + validateType(fieldId, Slice.class); + if (!loaded[fieldId]) { + parseStringColumn(fieldId); + } + return slices[fieldId]; + } + + private void parseStringColumn(int column) + { + loaded[column] = true; + + Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); + + if (fieldData == null) { + nulls[column] = true; + } + else { + Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveWritableObject(fieldData); + checkState(fieldValue != null, "fieldValue should not be null"); + BinaryComparable hiveValue; + if (fieldValue instanceof Text) { + hiveValue = (Text) fieldValue; + } + else if (fieldValue instanceof BytesWritable) { + hiveValue = (BytesWritable) fieldValue; + } + else if (fieldValue instanceof HiveVarcharWritable) { + hiveValue = ((HiveVarcharWritable) fieldValue).getTextValue(); + } + else if (fieldValue instanceof HiveCharWritable) { + hiveValue = ((HiveCharWritable) fieldValue).getTextValue(); + } + else { + throw new IllegalStateException("unsupported string field type: " + fieldValue.getClass().getName()); + } + + // create a slice view over the hive value and trim to character limits + Slice value = Slices.wrappedBuffer(hiveValue.getBytes(), 0, 
hiveValue.getLength()); + Type type = types[column]; + if (isVarcharType(type)) { + value = truncateToLength(value, type); + } + if (isCharType(type)) { + value = truncateToLengthAndTrimSpaces(value, type); + } + + // store a copy of the bytes, since the hive reader can reuse the underlying buffer + slices[column] = Slices.copyOf(value); + nulls[column] = false; + } + } + + private void parseDecimalColumn(int column) + { + loaded[column] = true; + + Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); + + if (fieldData == null) { + nulls[column] = true; + } + else { + Object fieldValue = ((PrimitiveObjectInspector) fieldInspectors[column]).getPrimitiveJavaObject(fieldData); + checkState(fieldValue != null, "fieldValue should not be null"); + + HiveDecimal decimal = (HiveDecimal) fieldValue; + DecimalType columnType = (DecimalType) types[column]; + BigInteger unscaledDecimal = rescale(decimal.unscaledValue(), decimal.scale(), columnType.getScale()); + + if (columnType.isShort()) { + longs[column] = unscaledDecimal.longValue(); + } + else { + slices[column] = Decimals.encodeUnscaledValue(unscaledDecimal); + } + nulls[column] = false; + } + } + + @Override + public Object getObject(int fieldId) + { + checkState(!closed, "Cursor is closed"); + + validateType(fieldId, Block.class); + if (!loaded[fieldId]) { + parseObjectColumn(fieldId); + } + return objects[fieldId]; + } + + private void parseObjectColumn(int column) + { + loaded[column] = true; + + Object fieldData = rowInspector.getStructFieldData(rowData, structFields[column]); + + if (fieldData == null) { + nulls[column] = true; + } + else { + objects[column] = SerDeUtils.getBlockObject(types[column], fieldData, fieldInspectors[column]); + nulls[column] = false; + } + } + + @Override + public boolean isNull(int fieldId) + { + checkState(!closed, "Cursor is closed"); + + if (!loaded[fieldId]) { + parseColumn(fieldId); + } + return nulls[fieldId]; + } + + private void parseColumn(int column) + { + Type type = types[column]; + if (BOOLEAN.equals(type)) { + parseBooleanColumn(column); + } + else if (BIGINT.equals(type)) { + parseLongColumn(column); + } + else if (INTEGER.equals(type)) { + parseLongColumn(column); + } + else if (SMALLINT.equals(type)) { + parseLongColumn(column); + } + else if (TINYINT.equals(type)) { + parseLongColumn(column); + } + else if (REAL.equals(type)) { + parseLongColumn(column); + } + else if (DOUBLE.equals(type)) { + parseDoubleColumn(column); + } + else if (isVarcharType(type) || VARBINARY.equals(type)) { + parseStringColumn(column); + } + else if (isCharType(type)) { + parseStringColumn(column); + } + else if (isStructuralType(hiveTypes[column])) { + parseObjectColumn(column); + } + else if (DATE.equals(type)) { + parseLongColumn(column); + } + else if (TIMESTAMP.equals(type)) { + parseLongColumn(column); + } + else if (type instanceof DecimalType) { + parseDecimalColumn(column); + } + else { + throw new UnsupportedOperationException("Unsupported column type: " + type); + } + } + + private void validateType(int fieldId, Class type) + { + if (!types[fieldId].getJavaType().equals(type)) { + // we don't use Preconditions.checkArgument because it requires boxing fieldId, which affects inner loop performance + throw new IllegalArgumentException(format("Expected field to be %s, actual %s (field %s)", type, types[fieldId], fieldId)); + } + } + + @Override + public void close() + { + // some hive input formats are broken and bad things can happen if you close them multiple times + if (closed) 
{ + return; + } + closed = true; + + updateCompletedBytes(); + + try { + recordReader.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursorProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursorProvider.java new file mode 100644 index 00000000..0b9dcc31 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/GenericHiveRecordCursorProvider.java @@ -0,0 +1,90 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.RecordReader; + +import javax.inject.Inject; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; + +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; +import static java.util.Objects.requireNonNull; + +public class GenericHiveRecordCursorProvider + implements HiveRecordCursorProvider +{ + private final HdfsEnvironment hdfsEnvironment; + + @Inject + public GenericHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + } + + @Override + public Optional createRecordCursor( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + TypeManager typeManager, + boolean s3SelectPushdownEnabled, + Map customSplitInfo) + { + // make sure the FileSystem is created with the proper Configuration object + try { + this.hdfsEnvironment.getFileSystem(session.getUser(), path, configuration); + } + catch (IOException e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, "Failed getting FileSystem: " + path, e); + } + + return hdfsEnvironment.doAs(session.getUser(), () -> { + RecordReader recordReader = HiveUtil.createRecordReader(configuration, path, start, length, schema, columns, customSplitInfo); + + return Optional.of(new GenericHiveRecordCursor<>( + configuration, + path, + genericRecordReader(recordReader), + length, + schema, + columns, + typeManager)); + }); + } + + @SuppressWarnings("unchecked") + private static RecordReader genericRecordReader(RecordReader recordReader) + { + return (RecordReader) recordReader; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfiguration.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfiguration.java new file mode 100644 index 00000000..f02adc20 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfiguration.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import org.apache.hadoop.conf.Configuration; + +import java.net.URI; + +public interface HdfsConfiguration +{ + Configuration getConfiguration(HdfsContext context, URI uri); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfigurationInitializer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfigurationInitializer.java new file mode 100644 index 00000000..01fb3e9f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsConfigurationInitializer.java @@ -0,0 +1,199 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.net.HostAndPort; +import io.airlift.units.Duration; +import io.prestosql.hadoop.SocksSocketFactory; +import io.prestosql.plugin.hive.s3.ConfigurationInitializer; +import io.prestosql.plugin.hive.util.ConfigurationUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys; +import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.net.DNSToSwitchMapping; +import org.apache.orc.OrcConf; +import org.apache.parquet.hadoop.ParquetOutputFormat; + +import javax.inject.Inject; +import javax.net.SocketFactory; + +import java.util.List; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_PING_INTERVAL_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_RPC_PROTECTION; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_RPC_SOCKET_FACTORY_CLASS_DEFAULT_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SOCKS_SERVER_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_TIMEOUT_KEY; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_KEY_PROVIDER_CACHE_EXPIRY_MS; +import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY; +import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT; +import static org.apache.hadoop.io.SequenceFile.CompressionType.BLOCK; + +public class HdfsConfigurationInitializer +{ + private final HostAndPort socksProxy; + private final Duration ipcPingInterval; + private final Duration dfsTimeout; + private final Duration dfsConnectTimeout; + private final int dfsConnectMaxRetries; + private final int dfsKeyProviderCacheTtlMillis; + private final String domainSocketPath; + private final Configuration resourcesConfiguration; + private final HiveCompressionCodec compressionCodec; + private final int fileSystemMaxCacheSize; + private final Set configurationInitializers; + private final boolean isHdfsWireEncryptionEnabled; + private int textMaxLineLength; + + @VisibleForTesting + public HdfsConfigurationInitializer(HiveConfig config) + { + this(config, ImmutableSet.of()); + } + + @Inject + public HdfsConfigurationInitializer(HiveConfig config, Set configurationInitializers) + { + requireNonNull(config, "config is null"); + checkArgument(config.getDfsTimeout().toMillis() >= 1, "dfsTimeout must be at least 1 ms"); + checkArgument(toIntExact(config.getTextMaxLineLength().toBytes()) >= 1, "textMaxLineLength must be at least 1 byte"); + + this.socksProxy = config.getMetastoreSocksProxy(); + this.ipcPingInterval = config.getIpcPingInterval(); + this.dfsTimeout = config.getDfsTimeout(); + this.dfsConnectTimeout = 
config.getDfsConnectTimeout(); + this.dfsConnectMaxRetries = config.getDfsConnectMaxRetries(); + this.dfsKeyProviderCacheTtlMillis = toIntExact(config.getDfsKeyProviderCacheTtl().toMillis()); + this.domainSocketPath = config.getDomainSocketPath(); + this.resourcesConfiguration = readConfiguration(config.getResourceConfigFiles()); + this.compressionCodec = config.getHiveCompressionCodec(); + this.fileSystemMaxCacheSize = config.getFileSystemMaxCacheSize(); + this.isHdfsWireEncryptionEnabled = config.isHdfsWireEncryptionEnabled(); + this.textMaxLineLength = toIntExact(config.getTextMaxLineLength().toBytes()); + + this.configurationInitializers = ImmutableSet.copyOf(requireNonNull(configurationInitializers, "configurationInitializers is null")); + } + + private static Configuration readConfiguration(List resourcePaths) + { + Configuration result = new Configuration(false); + + for (String resourcePath : resourcePaths) { + Configuration resourceProperties = new Configuration(false); + resourceProperties.addResource(new Path(resourcePath)); + ConfigurationUtils.copy(resourceProperties, result); + } + + return result; + } + + public void initializeConfiguration(Configuration config) + { + ConfigurationUtils.copy(resourcesConfiguration, config); + + // this is to prevent dfs client from doing reverse DNS lookups to determine whether nodes are rack local + config.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, NoOpDNSToSwitchMapping.class, DNSToSwitchMapping.class); + + if (socksProxy != null) { + config.setClass(HADOOP_RPC_SOCKET_FACTORY_CLASS_DEFAULT_KEY, SocksSocketFactory.class, SocketFactory.class); + config.set(HADOOP_SOCKS_SERVER_KEY, socksProxy.toString()); + } + + if (domainSocketPath != null) { + config.setStrings(DFS_DOMAIN_SOCKET_PATH_KEY, domainSocketPath); + } + + // only enable short circuit reads if domain socket path is properly configured + if (!config.get(DFS_DOMAIN_SOCKET_PATH_KEY, "").trim().isEmpty()) { + config.setBooleanIfUnset(HdfsClientConfigKeys.Read.ShortCircuit.KEY, true); + } + + config.setInt(DFS_CLIENT_SOCKET_TIMEOUT_KEY, toIntExact(dfsTimeout.toMillis())); + config.setInt(IPC_PING_INTERVAL_KEY, toIntExact(ipcPingInterval.toMillis())); + config.setInt(IPC_CLIENT_CONNECT_TIMEOUT_KEY, toIntExact(dfsConnectTimeout.toMillis())); + config.setInt(IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, dfsConnectMaxRetries); + + if (isHdfsWireEncryptionEnabled) { + config.set(HADOOP_RPC_PROTECTION, "privacy"); + config.setBoolean("dfs.encrypt.data.transfer", true); + } + + config.setInt("fs.cache.max-size", fileSystemMaxCacheSize); + + config.setInt(DFS_CLIENT_KEY_PROVIDER_CACHE_EXPIRY_MS, dfsKeyProviderCacheTtlMillis); + config.setInt(LineRecordReader.MAX_LINE_LENGTH, textMaxLineLength); + + configureCompression(config, compressionCodec); + + configurationInitializers.forEach(configurationInitializer -> configurationInitializer.initializeConfiguration(config)); + } + + public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec) + { + boolean compression = compressionCodec != HiveCompressionCodec.NONE; + config.setBoolean(COMPRESSRESULT.varname, compression); + config.setBoolean("mapred.output.compress", compression); + config.setBoolean(FileOutputFormat.COMPRESS, compression); + // For ORC + OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name()); + // For RCFile and Text + if (compressionCodec.getCodec().isPresent()) { + config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName()); + 
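+ // the same codec class name is written under both the legacy "mapred.output.compression.codec"
+ // key and the mapreduce FileOutputFormat key so old and new output formats agree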
config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName()); + } + else { + config.unset("mapred.output.compression.codec"); + config.unset(FileOutputFormat.COMPRESS_CODEC); + } + // For Parquet + config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name()); + // For SequenceFile + config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString()); + } + + public static class NoOpDNSToSwitchMapping + implements DNSToSwitchMapping + { + @Override + public List resolve(List names) + { + // dfs client expects an empty list as an indication that the host->switch mapping for the given names are not known + return ImmutableList.of(); + } + + @Override + public void reloadCachedMappings() + { + // no-op + } + + @Override + public void reloadCachedMappings(List names) + { + // no-op + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsEnvironment.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsEnvironment.java new file mode 100644 index 00000000..2d373139 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HdfsEnvironment.java @@ -0,0 +1,164 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.hadoop.HadoopNative; +import io.prestosql.plugin.hive.authentication.GenericExceptionAction; +import io.prestosql.plugin.hive.authentication.HdfsAuthentication; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.security.ConnectorIdentity; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import javax.inject.Inject; + +import java.io.IOException; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public class HdfsEnvironment +{ + static { + HadoopNative.requireHadoopNative(); + } + + private final HdfsConfiguration hdfsConfiguration; + private final HdfsAuthentication hdfsAuthentication; + private final boolean verifyChecksum; + + @Inject + public HdfsEnvironment( + HdfsConfiguration hdfsConfiguration, + HiveConfig config, + HdfsAuthentication hdfsAuthentication) + { + this.hdfsConfiguration = requireNonNull(hdfsConfiguration, "hdfsConfiguration is null"); + this.verifyChecksum = requireNonNull(config, "config is null").isVerifyChecksum(); + this.hdfsAuthentication = requireNonNull(hdfsAuthentication, "hdfsAuthentication is null"); + } + + public Configuration getConfiguration(HdfsContext context, Path path) + { + return hdfsConfiguration.getConfiguration(context, path.toUri()); + } + + public FileSystem getFileSystem(HdfsContext context, Path path) + throws IOException + { + return getFileSystem(context.getIdentity().getUser(), path, getConfiguration(context, path)); + } + + public FileSystem getFileSystem(String user, Path path, Configuration configuration) + throws IOException + { + return hdfsAuthentication.doAs(user, () -> { + FileSystem fileSystem = path.getFileSystem(configuration); + fileSystem.setVerifyChecksum(verifyChecksum); + return fileSystem; + }); + } + + public R doAs(String user, GenericExceptionAction action) + throws E + { + return hdfsAuthentication.doAs(user, action); + } + + public void doAs(String user, Runnable action) + { + hdfsAuthentication.doAs(user, action); + } + + public static class HdfsContext + { + private final ConnectorIdentity identity; + private final Optional source; + private final Optional queryId; + private final Optional schemaName; + private final Optional tableName; + + public HdfsContext(ConnectorIdentity identity) + { + this.identity = requireNonNull(identity, "identity is null"); + this.source = Optional.empty(); + this.queryId = Optional.empty(); + this.schemaName = Optional.empty(); + this.tableName = Optional.empty(); + } + + public HdfsContext(ConnectorSession session, String schemaName) + { + requireNonNull(session, "session is null"); + requireNonNull(schemaName, "schemaName is null"); + this.identity = requireNonNull(session.getIdentity(), "session.getIdentity() is null"); + this.source = requireNonNull(session.getSource(), "session.getSource()"); + this.queryId = Optional.of(session.getQueryId()); + this.schemaName = Optional.of(schemaName); + this.tableName = Optional.empty(); + } + + public HdfsContext(ConnectorSession session, String schemaName, String tableName) + { + requireNonNull(session, "session is null"); + requireNonNull(schemaName, "schemaName is null"); + requireNonNull(tableName, "tableName is null"); + this.identity = requireNonNull(session.getIdentity(), "session.getIdentity() is null"); + this.source = requireNonNull(session.getSource(), 
"session.getSource()"); + this.queryId = Optional.of(session.getQueryId()); + this.schemaName = Optional.of(schemaName); + this.tableName = Optional.of(tableName); + } + + public ConnectorIdentity getIdentity() + { + return identity; + } + + public Optional getSource() + { + return source; + } + + public Optional getQueryId() + { + return queryId; + } + + public Optional getSchemaName() + { + return schemaName; + } + + public Optional getTableName() + { + return tableName; + } + + @Override + public String toString() + { + return toStringHelper(this) + .omitNullValues() + .add("user", identity) + .add("source", source.orElse(null)) + .add("queryId", queryId.orElse(null)) + .add("schemaName", schemaName.orElse(null)) + .add("tableName", tableName.orElse(null)) + .toString(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveACIDWriteType.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveACIDWriteType.java new file mode 100644 index 00000000..72dc8463 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveACIDWriteType.java @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +public enum HiveACIDWriteType +{ + VACUUM(-2), + VACUUM_UNIFY(-2), + NONE(-1), + INSERT(0), + INSERT_OVERWRITE(0), + UPDATE(0), //Hive ACID semantics post hive-3.x expects operation to be 0. + DELETE(2); + + private int operationId; + + HiveACIDWriteType(int operationId) + { + this.operationId = operationId; + } + + public int getOperationId() + { + return operationId; + } + + public static boolean isUpdateOrDelete(HiveACIDWriteType writeType) + { + return writeType == UPDATE || writeType == DELETE; + } + + public static boolean isRowIdNeeded(HiveACIDWriteType writeType) + { + return isUpdateOrDelete(writeType) || isVacuum(writeType); + } + + public static boolean isVacuum(HiveACIDWriteType writeType) + { + return writeType == VACUUM || writeType == VACUUM_UNIFY; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveAnalyzeProperties.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveAnalyzeProperties.java new file mode 100644 index 00000000..52d8f211 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveAnalyzeProperties.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.session.PropertyMetadata; +import io.prestosql.spi.type.TypeManager; + +import javax.inject.Inject; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.firstNonNull; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.prestosql.spi.StandardErrorCode.INVALID_ANALYZE_PROPERTY; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; + +public class HiveAnalyzeProperties +{ + public static final String PARTITIONS_PROPERTY = "partitions"; + + private final List> analyzeProperties; + + @Inject + public HiveAnalyzeProperties(TypeManager typeManager) + { + analyzeProperties = ImmutableList.of( + new PropertyMetadata<>( + PARTITIONS_PROPERTY, + "Partitions to be analyzed", + typeManager.getType(parseTypeSignature("array(array(varchar))")), + List.class, + null, + false, + HiveAnalyzeProperties::decodePartitionLists, + value -> value)); + } + + public List> getAnalyzeProperties() + { + return analyzeProperties; + } + + @SuppressWarnings("unchecked") + public static Optional>> getPartitionList(Map properties) + { + List> partitions = (List>) properties.get(PARTITIONS_PROPERTY); + return partitions == null ? Optional.empty() : Optional.of(partitions); + } + + private static List> decodePartitionLists(Object object) + { + if (object == null) { + return null; + } + + // replace null partition value with hive default partition + return ImmutableList.copyOf(((Collection) object).stream() + .peek(HiveAnalyzeProperties::throwIfNull) + .map(partition -> ((Collection) partition).stream() + .map(name -> firstNonNull((String) name, HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION)) + .collect(toImmutableList())) + .collect(toImmutableSet())); + } + + private static void throwIfNull(Object object) + { + if (object == null) { + throw new PrestoException(INVALID_ANALYZE_PROPERTY, "Invalid null value in analyze partitions property"); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBasicStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBasicStatistics.java new file mode 100644 index 00000000..d7f36b5a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBasicStatistics.java @@ -0,0 +1,119 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; +import java.util.OptionalLong; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class HiveBasicStatistics +{ + private final OptionalLong fileCount; + private final OptionalLong rowCount; + private final OptionalLong inMemoryDataSizeInBytes; + private final OptionalLong onDiskDataSizeInBytes; + + public static HiveBasicStatistics createEmptyStatistics() + { + return new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.empty(), OptionalLong.empty()); + } + + public static HiveBasicStatistics createZeroStatistics() + { + return new HiveBasicStatistics(0, 0, 0, 0); + } + + public HiveBasicStatistics(long fileCount, long rowCount, long inMemoryDataSizeInBytes, long onDiskDataSizeInBytes) + { + this(OptionalLong.of(fileCount), OptionalLong.of(rowCount), OptionalLong.of(inMemoryDataSizeInBytes), OptionalLong.of(onDiskDataSizeInBytes)); + } + + @JsonCreator + public HiveBasicStatistics( + @JsonProperty("fileCount") OptionalLong fileCount, + @JsonProperty("rowCount") OptionalLong rowCount, + @JsonProperty("inMemoryDataSizeInBytes") OptionalLong inMemoryDataSizeInBytes, + @JsonProperty("onDiskDataSizeInBytes") OptionalLong onDiskDataSizeInBytes) + { + this.fileCount = requireNonNull(fileCount, "fileCount is null"); + this.rowCount = requireNonNull(rowCount, "rowCount is null"); + this.inMemoryDataSizeInBytes = requireNonNull(inMemoryDataSizeInBytes, "inMemoryDataSizeInBytes is null"); + this.onDiskDataSizeInBytes = requireNonNull(onDiskDataSizeInBytes, "onDiskDataSizeInBytes is null"); + } + + @JsonProperty + public OptionalLong getFileCount() + { + return fileCount; + } + + @JsonProperty + public OptionalLong getRowCount() + { + return rowCount; + } + + @JsonProperty + public OptionalLong getInMemoryDataSizeInBytes() + { + return inMemoryDataSizeInBytes; + } + + @JsonProperty + public OptionalLong getOnDiskDataSizeInBytes() + { + return onDiskDataSizeInBytes; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HiveBasicStatistics that = (HiveBasicStatistics) o; + return Objects.equals(fileCount, that.fileCount) && + Objects.equals(rowCount, that.rowCount) && + Objects.equals(inMemoryDataSizeInBytes, that.inMemoryDataSizeInBytes) && + Objects.equals(onDiskDataSizeInBytes, that.onDiskDataSizeInBytes); + } + + @Override + public int hashCode() + { + return Objects.hash(fileCount, rowCount, inMemoryDataSizeInBytes, onDiskDataSizeInBytes); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("fileCount", fileCount) + .add("rowCount", rowCount) + .add("inMemoryDataSizeInBytes", inMemoryDataSizeInBytes) + .add("onDiskDataSizeInBytes", onDiskDataSizeInBytes) + .toString(); + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBooleanParser.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBooleanParser.java new file mode 100644 index 00000000..cb0bad73 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBooleanParser.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +public final class HiveBooleanParser +{ + private HiveBooleanParser() {} + + public static Boolean parseHiveBoolean(byte[] bytes, int start, int length) + { + if (isTrue(bytes, start, length)) { + return true; + } + if (isFalse(bytes, start, length)) { + return false; + } + return null; + } + + @SuppressWarnings("PointlessArithmeticExpression") + public static boolean isFalse(byte[] bytes, int start, int length) + { + return (length == 5) && + (toUpperCase(bytes[start + 0]) == 'F') && + (toUpperCase(bytes[start + 1]) == 'A') && + (toUpperCase(bytes[start + 2]) == 'L') && + (toUpperCase(bytes[start + 3]) == 'S') && + (toUpperCase(bytes[start + 4]) == 'E'); + } + + @SuppressWarnings("PointlessArithmeticExpression") + public static boolean isTrue(byte[] bytes, int start, int length) + { + return (length == 4) && + (toUpperCase(bytes[start + 0]) == 'T') && + (toUpperCase(bytes[start + 1]) == 'R') && + (toUpperCase(bytes[start + 2]) == 'U') && + (toUpperCase(bytes[start + 3]) == 'E'); + } + + private static byte toUpperCase(byte b) + { + return isLowerCase(b) ? ((byte) (b - 32)) : b; + } + + private static boolean isLowerCase(byte b) + { + return (b >= 'a') && (b <= 'z'); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketAdapterRecordCursor.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketAdapterRecordCursor.java new file mode 100644 index 00000000..7e758632 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketAdapterRecordCursor.java @@ -0,0 +1,193 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.airlift.slice.Slice; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.util.List; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES; +import static io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HiveBucketAdapterRecordCursor + implements RecordCursor +{ + private final RecordCursor delegate; + private final int[] bucketColumnIndices; + private final List> javaTypeList; + private final List typeInfoList; + private final BucketingVersion bucketingVersion; + private final int tableBucketCount; + private final int partitionBucketCount; + private final int bucketToKeep; + + private final Object[] scratch; + + public HiveBucketAdapterRecordCursor( + int[] bucketColumnIndices, + List bucketColumnHiveTypes, + BucketingVersion bucketingVersion, + int tableBucketCount, + int partitionBucketCount, + int bucketToKeep, + TypeManager typeManager, + RecordCursor delegate) + { + this.bucketColumnIndices = requireNonNull(bucketColumnIndices, "bucketColumnIndices is null"); + this.delegate = requireNonNull(delegate, "delegate is null"); + requireNonNull(bucketColumnHiveTypes, "bucketColumnHiveTypes is null"); + this.javaTypeList = bucketColumnHiveTypes.stream() + .map(HiveType::getTypeSignature) + .map(typeManager::getType) + .map(Type::getJavaType) + .collect(toImmutableList()); + this.typeInfoList = bucketColumnHiveTypes.stream() + .map(HiveType::getTypeInfo) + .collect(toImmutableList()); + this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); + this.tableBucketCount = tableBucketCount; + this.partitionBucketCount = partitionBucketCount; + this.bucketToKeep = bucketToKeep; + + this.scratch = new Object[bucketColumnHiveTypes.size()]; + } + + @Override + public long getCompletedBytes() + { + return delegate.getCompletedBytes(); + } + + @Override + public Type getType(int field) + { + return delegate.getType(field); + } + + @Override + public boolean advanceNextPosition() + { + while (true) { + if (Thread.interrupted()) { + // Stop processing if the query has been destroyed. 
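+ // Thread.interrupted() clears the interrupt flag, so restore it before throwing
+ // to keep the interruption visible to callers.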
+ Thread.currentThread().interrupt(); + throw new PrestoException(GENERIC_INTERNAL_ERROR, "RecordCursor was interrupted"); + } + + boolean hasNextPosition = delegate.advanceNextPosition(); + if (!hasNextPosition) { + return false; + } + for (int i = 0; i < scratch.length; i++) { + int index = bucketColumnIndices[i]; + if (delegate.isNull(index)) { + scratch[i] = null; + continue; + } + Class javaType = javaTypeList.get(i); + if (javaType == boolean.class) { + scratch[i] = delegate.getBoolean(index); + } + else if (javaType == long.class) { + scratch[i] = delegate.getLong(index); + } + else if (javaType == double.class) { + scratch[i] = delegate.getDouble(index); + } + else if (javaType == Slice.class) { + scratch[i] = delegate.getSlice(index); + } + else if (javaType == Block.class) { + scratch[i] = delegate.getObject(index); + } + else { + throw new UnsupportedOperationException("Unknown java type: " + javaType); + } + } + int bucket = HiveBucketing.getHiveBucket(bucketingVersion, tableBucketCount, typeInfoList, scratch); + if ((bucket - bucketToKeep) % partitionBucketCount != 0) { + throw new PrestoException(HIVE_INVALID_BUCKET_FILES, format( + "A row that is supposed to be in bucket %s is encountered. Only rows in bucket %s (modulo %s) are expected", + bucket, bucketToKeep % partitionBucketCount, partitionBucketCount)); + } + if (bucket == bucketToKeep) { + return true; + } + } + } + + @Override + public boolean getBoolean(int field) + { + return delegate.getBoolean(field); + } + + @Override + public long getLong(int field) + { + return delegate.getLong(field); + } + + @Override + public double getDouble(int field) + { + return delegate.getDouble(field); + } + + @Override + public Slice getSlice(int field) + { + return delegate.getSlice(field); + } + + @Override + public Object getObject(int field) + { + return delegate.getObject(field); + } + + @Override + public boolean isNull(int field) + { + return delegate.isNull(field); + } + + @Override + public void close() + { + delegate.close(); + } + + @Override + public long getReadTimeNanos() + { + return delegate.getReadTimeNanos(); + } + + @Override + public long getSystemMemoryUsage() + { + return delegate.getSystemMemoryUsage(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketFunction.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketFunction.java new file mode 100644 index 00000000..64554c3d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketFunction.java @@ -0,0 +1,84 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.spi.Page; +import io.prestosql.spi.connector.BucketFunction; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.util.List; +import java.util.stream.Collectors; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public class HiveBucketFunction + implements BucketFunction +{ + private final BucketingVersion bucketingVersion; + private final int bucketCount; + private final List typeInfos; + private final List typeInfosForUpdate; + private final boolean isRowIdPartitioner; + + public HiveBucketFunction(BucketingVersion bucketingVersion, int bucketCount, List hiveTypes) + { + this(bucketingVersion, bucketCount, hiveTypes, false); + } + + public HiveBucketFunction(BucketingVersion bucketingVersion, int bucketCount, List hiveTypes, boolean forUpdate) + { + this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); + this.bucketCount = bucketCount; + this.typeInfos = requireNonNull(hiveTypes, "hiveTypes is null").stream() + .map(HiveType::getTypeInfo) + .collect(Collectors.toList()); + this.isRowIdPartitioner = forUpdate && + typeInfos.get(typeInfos.size() - 1).getCategory() == Category.STRUCT; + if (forUpdate && typeInfos.size() > 1) { + typeInfosForUpdate = typeInfos.subList(0, typeInfos.size() - 1); + } + else { + typeInfosForUpdate = ImmutableList.of(); + } + } + + @Override + public int getBucket(Page page, int position) + { + if (isRowIdPartitioner) { + int bucketHashCode = 0; + if (page.getChannelCount() > 1) { + //Consider the partitioning columns also for partitioning during update to parallelize the updates of partitioned tables. + bucketHashCode = HiveBucketing.getBucketHashCode(bucketingVersion, typeInfosForUpdate, page, position, typeInfosForUpdate.size()); + } + bucketHashCode = bucketHashCode * 31 + HiveBucketing.extractBucketNumber(page, position); + return HiveBucketing.getBucketNumber(bucketHashCode, bucketCount); + } + return HiveBucketing.getHiveBucket(bucketingVersion, bucketCount, typeInfos, page, position); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("version", bucketingVersion) + .add("bucketCount", bucketCount) + .add("typeInfos", typeInfos) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketHandle.java new file mode 100644 index 00000000..2040aed4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketHandle.java @@ -0,0 +1,106 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; + +import java.util.List; +import java.util.Objects; + +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class HiveBucketHandle +{ + private final List columns; + private final BucketingVersion bucketingVersion; + // Number of buckets in the table, as specified in table metadata + private final int tableBucketCount; + // Number of buckets the table will appear to have when the Hive connector + // presents the table to the engine for read. + private final int readBucketCount; + + @JsonCreator + public HiveBucketHandle( + @JsonProperty("columns") List columns, + @JsonProperty("bucketingVersion") BucketingVersion bucketingVersion, + @JsonProperty("tableBucketCount") int tableBucketCount, + @JsonProperty("readBucketCount") int readBucketCount) + { + this.columns = requireNonNull(columns, "columns is null"); + this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); + this.tableBucketCount = tableBucketCount; + this.readBucketCount = readBucketCount; + } + + @JsonProperty + public List getColumns() + { + return columns; + } + + @JsonProperty + public BucketingVersion getBucketingVersion() + { + return bucketingVersion; + } + + @JsonProperty + public int getTableBucketCount() + { + return tableBucketCount; + } + + @JsonProperty + public int getReadBucketCount() + { + return readBucketCount; + } + + public HiveBucketProperty toTableBucketProperty() + { + return new HiveBucketProperty( + columns.stream() + .map(HiveColumnHandle::getName) + .collect(toList()), + bucketingVersion, + tableBucketCount, + ImmutableList.of()); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HiveBucketHandle that = (HiveBucketHandle) o; + return Objects.equals(this.columns, that.columns) && + Objects.equals(this.tableBucketCount, that.tableBucketCount) && + Objects.equals(this.readBucketCount, that.readBucketCount) && + Objects.equals(this.bucketingVersion, that.bucketingVersion); + } + + @Override + public int hashCode() + { + return Objects.hash(columns, bucketingVersion, tableBucketCount, readBucketCount); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketProperty.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketProperty.java new file mode 100644 index 00000000..a8aced51 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketProperty.java @@ -0,0 +1,131 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA; +import static java.util.Objects.requireNonNull; + +public class HiveBucketProperty +{ + private final List bucketedBy; + private final BucketingVersion bucketingVersion; + private final int bucketCount; + private final List sortedBy; + + @JsonCreator + public HiveBucketProperty( + @JsonProperty("bucketedBy") List bucketedBy, + @JsonProperty("bucketingVersion") BucketingVersion bucketingVersion, + @JsonProperty("bucketCount") int bucketCount, + @JsonProperty("sortedBy") List sortedBy) + { + this.bucketedBy = ImmutableList.copyOf(requireNonNull(bucketedBy, "bucketedBy is null")); + this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); + this.bucketCount = bucketCount; + this.sortedBy = ImmutableList.copyOf(requireNonNull(sortedBy, "sortedBy is null")); + } + + public static Optional fromStorageDescriptor(Map tableParameters, StorageDescriptor storageDescriptor, String tablePartitionName) + { + boolean bucketColsSet = storageDescriptor.isSetBucketCols() && !storageDescriptor.getBucketCols().isEmpty(); + boolean numBucketsSet = storageDescriptor.isSetNumBuckets() && storageDescriptor.getNumBuckets() > 0; + if (!numBucketsSet) { + // In Hive, a table is considered as not bucketed when its bucketCols is set but its numBucket is not set. 
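+            // For example, a storage descriptor whose bucketCols is set (e.g. ["id"]) but whose
+            // numBuckets is unset or <= 0 falls into this branch, so the table is treated as unbucketed.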
+ return Optional.empty(); + } + if (!bucketColsSet) { + throw new PrestoException(HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set: " + tablePartitionName); + } + List sortedBy = ImmutableList.of(); + if (storageDescriptor.isSetSortCols()) { + sortedBy = storageDescriptor.getSortCols().stream() + .map(order -> SortingColumn.fromMetastoreApiOrder(order, tablePartitionName)) + .collect(toImmutableList()); + } + BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(tableParameters); + return Optional.of(new HiveBucketProperty(storageDescriptor.getBucketCols(), bucketingVersion, storageDescriptor.getNumBuckets(), sortedBy)); + } + + @JsonProperty + public List getBucketedBy() + { + return bucketedBy; + } + + @JsonProperty + public BucketingVersion getBucketingVersion() + { + return bucketingVersion; + } + + @JsonProperty + public int getBucketCount() + { + return bucketCount; + } + + @JsonProperty + public List getSortedBy() + { + return sortedBy; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HiveBucketProperty that = (HiveBucketProperty) o; + return bucketingVersion == that.bucketingVersion && + bucketCount == that.bucketCount && + Objects.equals(bucketedBy, that.bucketedBy) && + Objects.equals(sortedBy, that.sortedBy); + } + + @Override + public int hashCode() + { + return Objects.hash(bucketedBy, bucketingVersion, bucketCount, sortedBy); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("bucketedBy", bucketedBy) + .add("bucketingVersion", bucketingVersion) + .add("bucketCount", bucketCount) + .add("sortedBy", sortedBy) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketing.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketing.java new file mode 100644 index 00000000..6325591a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveBucketing.java @@ -0,0 +1,325 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.util.HiveBucketingV1; +import io.prestosql.plugin.hive.util.HiveBucketingV2; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.StandardErrorCode; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.RowBlock; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.predicate.ValueSet; +import org.apache.hadoop.hive.ql.io.BucketCodec; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Set; +import java.util.stream.Collectors; + +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V1; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V2; +import static io.prestosql.plugin.hive.HiveColumnHandle.BUCKET_COLUMN_NAME; +import static io.prestosql.plugin.hive.HiveUtil.getRegularColumnHandles; +import static java.lang.String.format; +import static java.util.Map.Entry; +import static java.util.function.Function.identity; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_BUCKETING_VERSION; +import static org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory.TIMESTAMP; + +public final class HiveBucketing +{ + public enum BucketingVersion + { + BUCKETING_V1(1), + BUCKETING_V2(2), + /**/; + + private final int version; + + BucketingVersion(int version) + { + this.version = version; + } + + public int getVersion() + { + return version; + } + } + + private static final Set SUPPORTED_TYPES_FOR_BUCKET_FILTER = ImmutableSet.of( + HiveType.HIVE_BYTE, + HiveType.HIVE_SHORT, + HiveType.HIVE_INT, + HiveType.HIVE_LONG, + HiveType.HIVE_BOOLEAN, + HiveType.HIVE_STRING); + + static final int MAX_BUCKET_NUMBER = 1_000_000 - 1; + + private HiveBucketing() {} + + public static int getHiveBucket(BucketingVersion bucketingVersion, int bucketCount, List types, Page page, int position) + { + return getBucketNumber(getBucketHashCode(bucketingVersion, types, page, position), bucketCount); + } + + public static int getHiveBucket(BucketingVersion bucketingVersion, int bucketCount, List types, Object[] values) + { + return getBucketNumber(getBucketHashCode(bucketingVersion, types, values), bucketCount); + } + + @VisibleForTesting + static int getBucketNumber(int hashCode, int bucketCount) + { + return (hashCode & Integer.MAX_VALUE) % bucketCount; + } + + @VisibleForTesting + static int getBucketHashCode(BucketingVersion bucketingVersion, List types, Page page, int position) + { + int channelCount = page.getChannelCount(); + return getBucketHashCode(bucketingVersion, types, page, position, channelCount); + } + + static 
int getBucketHashCode(BucketingVersion bucketingVersion, List types, Page page, int position, int channelCount) + { + switch (bucketingVersion) { + case BUCKETING_V1: + return HiveBucketingV1.getBucketHashCode(types, page, position, channelCount); + case BUCKETING_V2: + return HiveBucketingV2.getBucketHashCode(types, page, position, channelCount); + default: + throw new IllegalArgumentException("Unsupported bucketing version: " + bucketingVersion); + } + } + + @VisibleForTesting + static int getBucketHashCode(BucketingVersion bucketingVersion, List types, Object[] values) + { + switch (bucketingVersion) { + case BUCKETING_V1: + return HiveBucketingV1.getBucketHashCode(types, values); + case BUCKETING_V2: + return HiveBucketingV2.getBucketHashCode(types, values); + default: + throw new IllegalArgumentException("Unsupported bucketing version: " + bucketingVersion); + } + } + + public static Optional getHiveBucketHandle(Table table) + { + Optional hiveBucketProperty = table.getStorage().getBucketProperty(); + if (!hiveBucketProperty.isPresent()) { + return Optional.empty(); + } + + Map map = getRegularColumnHandles(table).stream() + .collect(Collectors.toMap(HiveColumnHandle::getName, identity())); + + ImmutableList.Builder bucketColumns = ImmutableList.builder(); + for (String bucketColumnName : hiveBucketProperty.get().getBucketedBy()) { + HiveColumnHandle bucketColumnHandle = map.get(bucketColumnName); + if (bucketColumnHandle == null) { + return Optional.empty(); + } + bucketColumns.add(bucketColumnHandle); + } + + BucketingVersion bucketingVersion = hiveBucketProperty.get().getBucketingVersion(); + int bucketCount = hiveBucketProperty.get().getBucketCount(); + return Optional.of(new HiveBucketHandle(bucketColumns.build(), bucketingVersion, bucketCount, bucketCount)); + } + + public static Optional getHiveBucketFilter(Table table, TupleDomain effectivePredicate) + { + if (!getHiveBucketHandle(table).isPresent()) { + return Optional.empty(); + } + + if (bucketedOnTimestamp(table.getStorage().getBucketProperty().get(), table)) { + return Optional.empty(); + } + + Optional> bindings = TupleDomain.extractFixedValues(effectivePredicate); + if (!bindings.isPresent()) { + return Optional.empty(); + } + OptionalInt singleBucket = getHiveBucket(table, bindings.get()); + if (singleBucket.isPresent()) { + return Optional.of(new HiveBucketFilter(ImmutableSet.of(singleBucket.getAsInt()))); + } + + if (!effectivePredicate.getDomains().isPresent()) { + return Optional.empty(); + } + Optional domain = effectivePredicate.getDomains().get().entrySet().stream() + .filter(entry -> ((HiveColumnHandle) entry.getKey()).getName().equals(BUCKET_COLUMN_NAME)) + .findFirst() + .map(Entry::getValue); + if (!domain.isPresent()) { + return Optional.empty(); + } + ValueSet values = domain.get().getValues(); + ImmutableSet.Builder builder = ImmutableSet.builder(); + int bucketCount = table.getStorage().getBucketProperty().get().getBucketCount(); + for (int i = 0; i < bucketCount; i++) { + if (values.containsValue((long) i)) { + builder.add(i); + } + } + return Optional.of(new HiveBucketFilter(builder.build())); + } + + private static OptionalInt getHiveBucket(Table table, Map bindings) + { + if (bindings.isEmpty()) { + return OptionalInt.empty(); + } + + List bucketColumns = table.getStorage().getBucketProperty().get().getBucketedBy(); + Map hiveTypes = new HashMap<>(); + for (Column column : table.getDataColumns()) { + hiveTypes.put(column.getName(), column.getType()); + } + + // Verify the bucket column types are 
supported + for (String column : bucketColumns) { + if (!SUPPORTED_TYPES_FOR_BUCKET_FILTER.contains(hiveTypes.get(column))) { + return OptionalInt.empty(); + } + } + + // Get bindings for bucket columns + Map bucketBindings = new HashMap<>(); + for (Entry entry : bindings.entrySet()) { + HiveColumnHandle colHandle = (HiveColumnHandle) entry.getKey(); + if (!entry.getValue().isNull() && bucketColumns.contains(colHandle.getName())) { + bucketBindings.put(colHandle.getName(), entry.getValue().getValue()); + } + } + + // Check that we have bindings for all bucket columns + if (bucketBindings.size() != bucketColumns.size()) { + return OptionalInt.empty(); + } + + // Get bindings of bucket columns + ImmutableList.Builder typeInfos = ImmutableList.builder(); + Object[] values = new Object[bucketColumns.size()]; + for (int i = 0; i < bucketColumns.size(); i++) { + String column = bucketColumns.get(i); + typeInfos.add(hiveTypes.get(column).getTypeInfo()); + values[i] = bucketBindings.get(column); + } + + BucketingVersion bucketingVersion = getBucketingVersion(table); + return OptionalInt.of(getHiveBucket(bucketingVersion, table.getStorage().getBucketProperty().get().getBucketCount(), typeInfos.build(), values)); + } + + public static BucketingVersion getBucketingVersion(Table table) + { + return getBucketingVersion(table.getParameters()); + } + + public static BucketingVersion getBucketingVersion(Map tableProperties) + { + String bucketingVersion = tableProperties.getOrDefault(TABLE_BUCKETING_VERSION, "1"); + switch (bucketingVersion) { + case "1": + return BUCKETING_V1; + case "2": + return BUCKETING_V2; + default: + // org.apache.hadoop.hive.ql.exec.Utilities.getBucketingVersion is more permissive and treats any non-number as "1" + throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, format("Unsupported bucketing version: '%s'", bucketingVersion)); + } + } + + public static boolean bucketedOnTimestamp(HiveBucketProperty bucketProperty, Table table) + { + return bucketProperty.getBucketedBy().stream() + .map(columnName -> table.getColumn(columnName) + .orElseThrow(() -> new IllegalArgumentException(format("Cannot find column '%s' in %s", columnName, table)))) + .map(Column::getType) + .map(HiveType::getTypeInfo) + .anyMatch(HiveBucketing::bucketedOnTimestamp); + } + + private static boolean bucketedOnTimestamp(TypeInfo type) + { + switch (type.getCategory()) { + case PRIMITIVE: + return ((PrimitiveTypeInfo) type).getPrimitiveCategory() == TIMESTAMP; + case LIST: + return bucketedOnTimestamp(((ListTypeInfo) type).getListElementTypeInfo()); + case MAP: + MapTypeInfo mapTypeInfo = (MapTypeInfo) type; + return bucketedOnTimestamp(mapTypeInfo.getMapKeyTypeInfo()) || + bucketedOnTimestamp(mapTypeInfo.getMapValueTypeInfo()); + default: + // TODO: support more types, e.g. ROW + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory()); + } + } + + public static class HiveBucketFilter + { + private final Set bucketsToKeep; + + @JsonCreator + public HiveBucketFilter(@JsonProperty("bucketsToKeep") Set bucketsToKeep) + { + this.bucketsToKeep = bucketsToKeep; + } + + @JsonProperty + public Set getBucketsToKeep() + { + return bucketsToKeep; + } + } + + /** + * Extracts the bucketNumber from page. 
Its expected that page contains $rowId as last column + * + * @return BucketNumber + */ + static int extractBucketNumber(Page page, int position) + { + Block block = page.getBlock(page.getChannelCount() - 1); + RowBlock rowBlock = (RowBlock) block.getSingleValueBlock(position); + int encodedBucketNumber = rowBlock.getRawFieldBlocks()[1].getInt(0, 0); + return BucketCodec.determineVersion(encodedBucketNumber).decodeWriterId(encodedBucketNumber); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCatalogName.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCatalogName.java new file mode 100644 index 00000000..cb2fb016 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCatalogName.java @@ -0,0 +1,32 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import static java.util.Objects.requireNonNull; + +public class HiveCatalogName +{ + private final String catalogName; + + public HiveCatalogName(String catalogName) + { + this.catalogName = requireNonNull(catalogName, "catalogName is null"); + } + + @Override + public String toString() + { + return catalogName; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionPolicy.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionPolicy.java new file mode 100644 index 00000000..738cd6a0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionPolicy.java @@ -0,0 +1,126 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; + +import javax.inject.Inject; + +import java.util.List; + +import static io.prestosql.plugin.hive.HiveUtil.extractStructFieldTypes; +import static java.lang.Math.min; +import static java.util.Objects.requireNonNull; + +public class HiveCoercionPolicy + implements CoercionPolicy +{ + private final TypeManager typeManager; + + @Inject + public HiveCoercionPolicy(TypeManager typeManager) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + @Override + public boolean canCoerce(HiveType fromHiveType, HiveType toHiveType) + { + Type fromType = typeManager.getType(fromHiveType.getTypeSignature()); + Type toType = typeManager.getType(toHiveType.getTypeSignature()); + if (fromType instanceof VarcharType && toType instanceof VarcharType) { + return true; + } + if (fromType instanceof VarcharType) { + return toHiveType.equals(HiveType.HIVE_BYTE) || toHiveType.equals(HiveType.HIVE_SHORT) || toHiveType.equals(HiveType.HIVE_INT) || toHiveType.equals(HiveType.HIVE_LONG); + } + if (toType instanceof VarcharType) { + return fromHiveType.equals(HiveType.HIVE_BYTE) || fromHiveType.equals(HiveType.HIVE_SHORT) || fromHiveType.equals(HiveType.HIVE_INT) || fromHiveType.equals(HiveType.HIVE_LONG); + } + if (fromHiveType.equals(HiveType.HIVE_BYTE)) { + return toHiveType.equals(HiveType.HIVE_SHORT) || toHiveType.equals(HiveType.HIVE_INT) || toHiveType.equals(HiveType.HIVE_LONG); + } + if (fromHiveType.equals(HiveType.HIVE_SHORT)) { + return toHiveType.equals(HiveType.HIVE_INT) || toHiveType.equals(HiveType.HIVE_LONG); + } + if (fromHiveType.equals(HiveType.HIVE_INT)) { + return toHiveType.equals(HiveType.HIVE_LONG); + } + if (fromHiveType.equals(HiveType.HIVE_FLOAT)) { + return toHiveType.equals(HiveType.HIVE_DOUBLE) || toType instanceof DecimalType; + } + if (fromHiveType.equals(HiveType.HIVE_DOUBLE)) { + return toHiveType.equals(HiveType.HIVE_FLOAT) || toType instanceof DecimalType; + } + if (fromType instanceof DecimalType) { + return toType instanceof DecimalType || toHiveType.equals(HiveType.HIVE_FLOAT) || toHiveType.equals(HiveType.HIVE_DOUBLE); + } + + return canCoerceForList(fromHiveType, toHiveType) || canCoerceForMap(fromHiveType, toHiveType) || canCoerceForStruct(fromHiveType, toHiveType); + } + + private boolean canCoerceForMap(HiveType fromHiveType, HiveType toHiveType) + { + if (!fromHiveType.getCategory().equals(Category.MAP) || !toHiveType.getCategory().equals(Category.MAP)) { + return false; + } + HiveType fromKeyType = HiveType.valueOf(((MapTypeInfo) fromHiveType.getTypeInfo()).getMapKeyTypeInfo().getTypeName()); + HiveType fromValueType = HiveType.valueOf(((MapTypeInfo) fromHiveType.getTypeInfo()).getMapValueTypeInfo().getTypeName()); + HiveType toKeyType = HiveType.valueOf(((MapTypeInfo) toHiveType.getTypeInfo()).getMapKeyTypeInfo().getTypeName()); + HiveType toValueType = HiveType.valueOf(((MapTypeInfo) toHiveType.getTypeInfo()).getMapValueTypeInfo().getTypeName()); + return (fromKeyType.equals(toKeyType) || canCoerce(fromKeyType, toKeyType)) && + (fromValueType.equals(toValueType) || 
canCoerce(fromValueType, toValueType)); + } + + private boolean canCoerceForList(HiveType fromHiveType, HiveType toHiveType) + { + if (!fromHiveType.getCategory().equals(Category.LIST) || !toHiveType.getCategory().equals(Category.LIST)) { + return false; + } + HiveType fromElementType = HiveType.valueOf(((ListTypeInfo) fromHiveType.getTypeInfo()).getListElementTypeInfo().getTypeName()); + HiveType toElementType = HiveType.valueOf(((ListTypeInfo) toHiveType.getTypeInfo()).getListElementTypeInfo().getTypeName()); + return fromElementType.equals(toElementType) || canCoerce(fromElementType, toElementType); + } + + private boolean canCoerceForStruct(HiveType fromHiveType, HiveType toHiveType) + { + if (!fromHiveType.getCategory().equals(Category.STRUCT) || !toHiveType.getCategory().equals(Category.STRUCT)) { + return false; + } + List fromFieldNames = ((StructTypeInfo) fromHiveType.getTypeInfo()).getAllStructFieldNames(); + List toFieldNames = ((StructTypeInfo) toHiveType.getTypeInfo()).getAllStructFieldNames(); + List fromFieldTypes = extractStructFieldTypes(fromHiveType); + List toFieldTypes = extractStructFieldTypes(toHiveType); + // Rule: + // * Fields may be added or dropped from the end. + // * For all other field indices, the corresponding fields must have + // the same name, and the type must be coercible. + for (int i = 0; i < min(fromFieldTypes.size(), toFieldTypes.size()); i++) { + if (!fromFieldNames.get(i).equals(toFieldNames.get(i))) { + return false; + } + if (!fromFieldTypes.get(i).equals(toFieldTypes.get(i)) && !canCoerce(fromFieldTypes.get(i), toFieldTypes.get(i))) { + return false; + } + } + return true; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionRecordCursor.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionRecordCursor.java new file mode 100644 index 00000000..8fa356ef --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCoercionRecordCursor.java @@ -0,0 +1,696 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import io.airlift.slice.Slice; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; + +import java.util.List; + +import static io.airlift.slice.Slices.utf8Slice; +import static io.prestosql.plugin.hive.HiveUtil.extractStructFieldTypes; +import static io.prestosql.plugin.hive.HiveUtil.isArrayType; +import static io.prestosql.plugin.hive.HiveUtil.isMapType; +import static io.prestosql.plugin.hive.HiveUtil.isRowType; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static java.lang.Float.intBitsToFloat; +import static java.lang.Math.min; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HiveCoercionRecordCursor + implements RecordCursor +{ + private final RecordCursor delegate; + private final List columnMappings; + private final Coercer[] coercers; + private final BridgingRecordCursor bridgingRecordCursor; + + public HiveCoercionRecordCursor( + List columnMappings, + TypeManager typeManager, + RecordCursor delegate) + { + requireNonNull(columnMappings, "columns is null"); + requireNonNull(typeManager, "typeManager is null"); + this.bridgingRecordCursor = new BridgingRecordCursor(); + + this.delegate = requireNonNull(delegate, "delegate is null"); + this.columnMappings = ImmutableList.copyOf(columnMappings); + + int size = columnMappings.size(); + + this.coercers = new Coercer[size]; + + for (int columnIndex = 0; columnIndex < size; columnIndex++) { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(columnIndex); + + if (columnMapping.getCoercionFrom().isPresent()) { + coercers[columnIndex] = createCoercer(typeManager, columnMapping.getCoercionFrom().get(), columnMapping.getHiveColumnHandle().getHiveType(), bridgingRecordCursor); + } + } + } + + @Override + public long getCompletedBytes() + { + return delegate.getCompletedBytes(); + } + + @Override + public Type getType(int field) + { + return delegate.getType(field); + } + + @Override + public boolean advanceNextPosition() + { + for (int i = 0; i < columnMappings.size(); i++) { + if (coercers[i] != null) { + coercers[i].reset(); + } + } + return delegate.advanceNextPosition(); + } + + @Override + public boolean getBoolean(int field) + { + if (coercers[field] == null) { + return delegate.getBoolean(field); + } + return coercers[field].getBoolean(delegate, field); + } + + @Override + public long getLong(int field) + { + if (coercers[field] == null) { + return delegate.getLong(field); + } + return coercers[field].getLong(delegate, field); + } + + @Override + public double getDouble(int field) + { + if (coercers[field] == null) { + return delegate.getDouble(field); + } + return coercers[field].getDouble(delegate, field); + } + + @Override + public Slice getSlice(int field) + { + if (coercers[field] == null) { + return delegate.getSlice(field); + } + return coercers[field].getSlice(delegate, field); + } + + @Override + public Object getObject(int field) + { + if (coercers[field] == null) { + return 
delegate.getObject(field); + } + return coercers[field].getObject(delegate, field); + } + + @Override + public boolean isNull(int field) + { + if (coercers[field] == null) { + return delegate.isNull(field); + } + return coercers[field].isNull(delegate, field); + } + + @Override + public void close() + { + delegate.close(); + } + + @Override + public long getReadTimeNanos() + { + return delegate.getReadTimeNanos(); + } + + @Override + public long getSystemMemoryUsage() + { + return delegate.getSystemMemoryUsage(); + } + + @VisibleForTesting + RecordCursor getRegularColumnRecordCursor() + { + return delegate; + } + + private abstract static class Coercer + { + private boolean isNull; + private boolean loaded; + + private boolean booleanValue; + private long longValue; + private double doubleValue; + private Slice sliceValue; + private Object objectValue; + + public void reset() + { + isNull = false; + loaded = false; + } + + public boolean isNull(RecordCursor delegate, int field) + { + assureLoaded(delegate, field); + return isNull; + } + + public boolean getBoolean(RecordCursor delegate, int field) + { + assureLoaded(delegate, field); + return booleanValue; + } + + public long getLong(RecordCursor delegate, int field) + { + assureLoaded(delegate, field); + return longValue; + } + + public double getDouble(RecordCursor delegate, int field) + { + assureLoaded(delegate, field); + return doubleValue; + } + + public Slice getSlice(RecordCursor delegate, int field) + { + assureLoaded(delegate, field); + return sliceValue; + } + + public Object getObject(RecordCursor delegate, int field) + { + assureLoaded(delegate, field); + return objectValue; + } + + private void assureLoaded(RecordCursor delegate, int field) + { + if (!loaded) { + isNull = delegate.isNull(field); + if (!isNull) { + coerce(delegate, field); + } + loaded = true; + } + } + + protected abstract void coerce(RecordCursor delegate, int field); + + protected void setBoolean(boolean value) + { + booleanValue = value; + } + + protected void setLong(long value) + { + longValue = value; + } + + protected void setDouble(double value) + { + doubleValue = value; + } + + protected void setSlice(Slice value) + { + sliceValue = value; + } + + protected void setObject(Object value) + { + objectValue = value; + } + + protected void setIsNull(boolean isNull) + { + this.isNull = isNull; + } + } + + private static Coercer createCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, BridgingRecordCursor bridgingRecordCursor) + { + Type fromType = typeManager.getType(fromHiveType.getTypeSignature()); + Type toType = typeManager.getType(toHiveType.getTypeSignature()); + if (toType instanceof VarcharType && (fromHiveType.equals(HiveType.HIVE_BYTE) || fromHiveType.equals(HiveType.HIVE_SHORT) || fromHiveType.equals(HiveType.HIVE_INT) || fromHiveType.equals(HiveType.HIVE_LONG))) { + return new IntegerNumberToVarcharCoercer(); + } + if (fromType instanceof VarcharType && (toHiveType.equals(HiveType.HIVE_BYTE) || toHiveType.equals(HiveType.HIVE_SHORT) || toHiveType.equals(HiveType.HIVE_INT) || toHiveType.equals(HiveType.HIVE_LONG))) { + return new VarcharToIntegerNumberCoercer(toHiveType); + } + if (fromHiveType.equals(HiveType.HIVE_BYTE) && toHiveType.equals(HiveType.HIVE_SHORT) || toHiveType.equals(HiveType.HIVE_INT) || toHiveType.equals(HiveType.HIVE_LONG)) { + return new IntegerNumberUpscaleCoercer(); + } + if (fromHiveType.equals(HiveType.HIVE_SHORT) && toHiveType.equals(HiveType.HIVE_INT) || toHiveType.equals(HiveType.HIVE_LONG)) 
{ + return new IntegerNumberUpscaleCoercer(); + } + if (fromHiveType.equals(HiveType.HIVE_INT) && toHiveType.equals(HiveType.HIVE_LONG)) { + return new IntegerNumberUpscaleCoercer(); + } + if (fromHiveType.equals(HiveType.HIVE_FLOAT) && toHiveType.equals(HiveType.HIVE_DOUBLE)) { + return new FloatToDoubleCoercer(); + } + if (isArrayType(fromType) && isArrayType(toType)) { + return new ListCoercer(typeManager, fromHiveType, toHiveType, bridgingRecordCursor); + } + if (isMapType(fromType) && isMapType(toType)) { + return new MapCoercer(typeManager, fromHiveType, toHiveType, bridgingRecordCursor); + } + if (isRowType(fromType) && isRowType(toType)) { + return new StructCoercer(typeManager, fromHiveType, toHiveType, bridgingRecordCursor); + } + + throw new PrestoException(NOT_SUPPORTED, format("Unsupported coercion from %s to %s", fromHiveType, toHiveType)); + } + + private static class IntegerNumberUpscaleCoercer + extends Coercer + { + @Override + public void coerce(RecordCursor delegate, int field) + { + setLong(delegate.getLong(field)); + } + } + + private static class IntegerNumberToVarcharCoercer + extends Coercer + { + @Override + public void coerce(RecordCursor delegate, int field) + { + setSlice(utf8Slice(String.valueOf(delegate.getLong(field)))); + } + } + + private static class FloatToDoubleCoercer + extends Coercer + { + @Override + protected void coerce(RecordCursor delegate, int field) + { + setDouble(intBitsToFloat((int) delegate.getLong(field))); + } + } + + private static class VarcharToIntegerNumberCoercer + extends Coercer + { + private final long maxValue; + private final long minValue; + + public VarcharToIntegerNumberCoercer(HiveType type) + { + if (type.equals(HiveType.HIVE_BYTE)) { + minValue = Byte.MIN_VALUE; + maxValue = Byte.MAX_VALUE; + } + else if (type.equals(HiveType.HIVE_SHORT)) { + minValue = Short.MIN_VALUE; + maxValue = Short.MAX_VALUE; + } + else if (type.equals(HiveType.HIVE_INT)) { + minValue = Integer.MIN_VALUE; + maxValue = Integer.MAX_VALUE; + } + else if (type.equals(HiveType.HIVE_LONG)) { + minValue = Long.MIN_VALUE; + maxValue = Long.MAX_VALUE; + } + else { + throw new PrestoException(NOT_SUPPORTED, format("Could not create Coercer from varchar to %s", type)); + } + } + + @Override + public void coerce(RecordCursor delegate, int field) + { + try { + long value = Long.parseLong(delegate.getSlice(field).toStringUtf8()); + if (minValue <= value && value <= maxValue) { + setLong(value); + } + else { + setIsNull(true); + } + } + catch (NumberFormatException e) { + setIsNull(true); + } + } + } + + private static class ListCoercer + extends Coercer + { + private final Type fromElementType; + private final Type toType; + private final Type toElementType; + private final Coercer elementCoercer; + private final BridgingRecordCursor bridgingRecordCursor; + private final PageBuilder pageBuilder; + + public ListCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, BridgingRecordCursor bridgingRecordCursor) + { + requireNonNull(typeManager, "typeManage is null"); + requireNonNull(fromHiveType, "fromHiveType is null"); + requireNonNull(toHiveType, "toHiveType is null"); + this.bridgingRecordCursor = requireNonNull(bridgingRecordCursor, "bridgingRecordCursor is null"); + HiveType fromElementHiveType = HiveType.valueOf(((ListTypeInfo) fromHiveType.getTypeInfo()).getListElementTypeInfo().getTypeName()); + HiveType toElementHiveType = HiveType.valueOf(((ListTypeInfo) toHiveType.getTypeInfo()).getListElementTypeInfo().getTypeName()); + 
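+            // The engine types for both element types are resolved below; a nested element coercer
+            // (and its backing PageBuilder) is created only when the element Hive types actually differ,
+            // otherwise list elements are appended to the output block unchanged.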
this.fromElementType = fromElementHiveType.getType(typeManager); + this.toType = toHiveType.getType(typeManager); + this.toElementType = toElementHiveType.getType(typeManager); + this.elementCoercer = fromElementHiveType.equals(toElementHiveType) ? null : createCoercer(typeManager, fromElementHiveType, toElementHiveType, bridgingRecordCursor); + this.pageBuilder = elementCoercer == null ? null : new PageBuilder(ImmutableList.of(toType)); + } + + @Override + public void coerce(RecordCursor delegate, int field) + { + if (delegate.isNull(field)) { + setIsNull(true); + return; + } + Block block = (Block) delegate.getObject(field); + if (pageBuilder.isFull()) { + pageBuilder.reset(); + } + BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(0); + BlockBuilder listBuilder = blockBuilder.beginBlockEntry(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (elementCoercer == null) { + toElementType.appendTo(block, i, listBuilder); + } + else { + if (block.isNull(i)) { + listBuilder.appendNull(); + } + else { + rewriteBlock(fromElementType, toElementType, block, i, listBuilder, elementCoercer, bridgingRecordCursor); + } + } + } + blockBuilder.closeEntry(); + pageBuilder.declarePosition(); + setObject(toType.getObject(blockBuilder, blockBuilder.getPositionCount() - 1)); + } + } + + private static class MapCoercer + extends Coercer + { + private final List fromKeyValueTypes; + private final Type toType; + private final List toKeyValueTypes; + private final Coercer[] coercers; + private final BridgingRecordCursor bridgingRecordCursor; + private final PageBuilder pageBuilder; + + public MapCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, BridgingRecordCursor bridgingRecordCursor) + { + requireNonNull(typeManager, "typeManage is null"); + requireNonNull(fromHiveType, "fromHiveType is null"); + requireNonNull(toHiveType, "toHiveType is null"); + this.bridgingRecordCursor = requireNonNull(bridgingRecordCursor, "bridgingRecordCursor is null"); + HiveType fromKeyHiveType = HiveType.valueOf(((MapTypeInfo) fromHiveType.getTypeInfo()).getMapKeyTypeInfo().getTypeName()); + HiveType fromValueHiveType = HiveType.valueOf(((MapTypeInfo) fromHiveType.getTypeInfo()).getMapValueTypeInfo().getTypeName()); + HiveType toKeyHiveType = HiveType.valueOf(((MapTypeInfo) toHiveType.getTypeInfo()).getMapKeyTypeInfo().getTypeName()); + HiveType toValueHiveType = HiveType.valueOf(((MapTypeInfo) toHiveType.getTypeInfo()).getMapValueTypeInfo().getTypeName()); + this.fromKeyValueTypes = fromHiveType.getType(typeManager).getTypeParameters(); + this.toType = toHiveType.getType(typeManager); + this.toKeyValueTypes = toType.getTypeParameters(); + this.coercers = new Coercer[2]; + coercers[0] = fromKeyHiveType.equals(toKeyHiveType) ? null : createCoercer(typeManager, fromKeyHiveType, toKeyHiveType, bridgingRecordCursor); + coercers[1] = fromValueHiveType.equals(toValueHiveType) ? null : createCoercer(typeManager, fromValueHiveType, toValueHiveType, bridgingRecordCursor); + this.pageBuilder = coercers[0] == null && coercers[1] == null ? 
null : new PageBuilder(ImmutableList.of(toType)); + } + + @Override + public void coerce(RecordCursor delegate, int field) + { + if (delegate.isNull(field)) { + setIsNull(true); + return; + } + Block block = (Block) delegate.getObject(field); + if (pageBuilder.isFull()) { + pageBuilder.reset(); + } + BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(0); + BlockBuilder mapBuilder = blockBuilder.beginBlockEntry(); + for (int i = 0; i < block.getPositionCount(); i++) { + int k = i % 2; + if (coercers[k] == null) { + toKeyValueTypes.get(k).appendTo(block, i, mapBuilder); + } + else { + if (block.isNull(i)) { + mapBuilder.appendNull(); + } + else { + rewriteBlock(fromKeyValueTypes.get(k), toKeyValueTypes.get(k), block, i, mapBuilder, coercers[k], bridgingRecordCursor); + } + } + } + blockBuilder.closeEntry(); + pageBuilder.declarePosition(); + setObject(toType.getObject(blockBuilder, blockBuilder.getPositionCount() - 1)); + } + } + + private static class StructCoercer + extends Coercer + { + private final Type toType; + private final List fromFieldTypes; + private final List toFieldTypes; + private final Coercer[] coercers; + private final BridgingRecordCursor bridgingRecordCursor; + private final PageBuilder pageBuilder; + + public StructCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, BridgingRecordCursor bridgingRecordCursor) + { + requireNonNull(typeManager, "typeManage is null"); + requireNonNull(fromHiveType, "fromHiveType is null"); + requireNonNull(toHiveType, "toHiveType is null"); + this.bridgingRecordCursor = requireNonNull(bridgingRecordCursor, "bridgingRecordCursor is null"); + List fromFieldHiveTypes = extractStructFieldTypes(fromHiveType); + List toFieldHiveTypes = extractStructFieldTypes(toHiveType); + this.fromFieldTypes = fromHiveType.getType(typeManager).getTypeParameters(); + this.toType = toHiveType.getType(typeManager); + this.toFieldTypes = toType.getTypeParameters(); + this.coercers = new Coercer[toFieldHiveTypes.size()]; + for (int i = 0; i < min(fromFieldHiveTypes.size(), toFieldHiveTypes.size()); i++) { + if (!fromFieldTypes.get(i).equals(toFieldTypes.get(i))) { + coercers[i] = createCoercer(typeManager, fromFieldHiveTypes.get(i), toFieldHiveTypes.get(i), bridgingRecordCursor); + } + } + this.pageBuilder = new PageBuilder(ImmutableList.of(toType)); + } + + @Override + public void coerce(RecordCursor delegate, int field) + { + if (delegate.isNull(field)) { + setIsNull(true); + return; + } + Block block = (Block) delegate.getObject(field); + if (pageBuilder.isFull()) { + pageBuilder.reset(); + } + BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(0); + BlockBuilder rowBuilder = blockBuilder.beginBlockEntry(); + for (int i = 0; i < toFieldTypes.size(); i++) { + if (i >= fromFieldTypes.size() || block.isNull(i)) { + rowBuilder.appendNull(); + } + else if (coercers[i] == null) { + toFieldTypes.get(i).appendTo(block, i, rowBuilder); + } + else { + rewriteBlock(fromFieldTypes.get(i), toFieldTypes.get(i), block, i, rowBuilder, coercers[i], bridgingRecordCursor); + } + } + blockBuilder.closeEntry(); + pageBuilder.declarePosition(); + setObject(toType.getObject(blockBuilder, blockBuilder.getPositionCount() - 1)); + } + } + + private static void rewriteBlock( + Type fromType, + Type toType, + Block block, + int position, + BlockBuilder blockBuilder, + Coercer coercer, + BridgingRecordCursor bridgingRecordCursor) + { + Class fromJavaType = fromType.getJavaType(); + if (fromJavaType == long.class) { + 
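+            // Stage the source value on the single-value bridging cursor; the coercer then reads it
+            // back from field 0 and the converted result is written into blockBuilder further below.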
bridgingRecordCursor.setValue(fromType.getLong(block, position)); + } + else if (fromJavaType == double.class) { + bridgingRecordCursor.setValue(fromType.getDouble(block, position)); + } + else if (fromJavaType == boolean.class) { + bridgingRecordCursor.setValue(fromType.getBoolean(block, position)); + } + else if (fromJavaType == Slice.class) { + bridgingRecordCursor.setValue(fromType.getSlice(block, position)); + } + else if (fromJavaType == Block.class) { + bridgingRecordCursor.setValue(fromType.getObject(block, position)); + } + else { + bridgingRecordCursor.setValue(null); + } + coercer.reset(); + Class toJaveType = toType.getJavaType(); + if (coercer.isNull(bridgingRecordCursor, 0)) { + blockBuilder.appendNull(); + } + else if (toJaveType == long.class) { + toType.writeLong(blockBuilder, coercer.getLong(bridgingRecordCursor, 0)); + } + else if (toJaveType == double.class) { + toType.writeDouble(blockBuilder, coercer.getDouble(bridgingRecordCursor, 0)); + } + else if (toJaveType == boolean.class) { + toType.writeBoolean(blockBuilder, coercer.getBoolean(bridgingRecordCursor, 0)); + } + else if (toJaveType == Slice.class) { + toType.writeSlice(blockBuilder, coercer.getSlice(bridgingRecordCursor, 0)); + } + else if (toJaveType == Block.class) { + toType.writeObject(blockBuilder, coercer.getObject(bridgingRecordCursor, 0)); + } + else { + throw new PrestoException(NOT_SUPPORTED, format("Unsupported coercion from %s to %s", fromType.getDisplayName(), toType.getDisplayName())); + } + coercer.reset(); + bridgingRecordCursor.close(); + } + + private static class BridgingRecordCursor + implements RecordCursor + { + private Object value; + + public void setValue(Object value) + { + this.value = value; + } + + @Override + public long getCompletedBytes() + { + return 0; + } + + @Override + public long getReadTimeNanos() + { + return 0; + } + + @Override + public Type getType(int field) + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean advanceNextPosition() + { + return true; + } + + @Override + public boolean getBoolean(int field) + { + return (Boolean) value; + } + + @Override + public long getLong(int field) + { + return (Long) value; + } + + @Override + public double getDouble(int field) + { + return (Double) value; + } + + @Override + public Slice getSlice(int field) + { + return (Slice) value; + } + + @Override + public Object getObject(int field) + { + return value; + } + + @Override + public boolean isNull(int field) + { + return value == null; + } + + @Override + public void close() + { + this.value = null; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveColumnHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveColumnHandle.java new file mode 100644 index 00000000..5a66146e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveColumnHandle.java @@ -0,0 +1,267 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.Lists; +import io.prestosql.plugin.hive.orc.OrcPageSourceFactory; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.TypeSignature; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.DUMMY_OFFLOADED; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.SYNTHESIZED; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static java.util.Objects.requireNonNull; + +public class HiveColumnHandle + implements ColumnHandle +{ + public static final int PATH_COLUMN_INDEX = -11; + public static final String PATH_COLUMN_NAME = "$path"; + public static final HiveType PATH_HIVE_TYPE = HIVE_STRING; + public static final TypeSignature PATH_TYPE_SIGNATURE = PATH_HIVE_TYPE.getTypeSignature(); + + public static final int BUCKET_COLUMN_INDEX = -12; + public static final String BUCKET_COLUMN_NAME = "$bucket"; + public static final HiveType BUCKET_HIVE_TYPE = HIVE_INT; + public static final TypeSignature BUCKET_TYPE_SIGNATURE = BUCKET_HIVE_TYPE.getTypeSignature(); + + public static final int ROW_ID__COLUMN_INDEX = -13; + public static final String UPDATE_ROW_ID_COLUMN_NAME = "$rowId"; + + public static final int DUMMY_OFFLOADED_COLUMN_INDEX = -20; + public static final String DUMMY_OFFLOADED_COLUMN_NAME = "count_star"; + + // Ids <= MAX_PARTITION_KEY_COLUMN_INDEX, can be used for distinguishing between different partition prefilled columns. + // NOTE: Incase any new hidden columns added, their index should be more than below value or below value should be adjusted. 
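+    // (For reference, the synthesized columns declared above use $path = -11, $bucket = -12 and $rowId = -13.)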
+ public static final int MAX_PARTITION_KEY_COLUMN_INDEX = -14; + + public enum ColumnType + { + PARTITION_KEY, + REGULAR, + SYNTHESIZED, + DUMMY_OFFLOADED, + } + + private final String name; + private final HiveType hiveType; + private final TypeSignature typeName; + private final int hiveColumnIndex; + private final ColumnType columnType; + private final Optional comment; + //If the column is a partitionColumn or bucketing column, then this is required + private final boolean required; + + public HiveColumnHandle( + String name, + HiveType hiveType, + TypeSignature typeSignature, + int hiveColumnIndex, + ColumnType columnType, + Optional comment) + { + this(name, hiveType, typeSignature, hiveColumnIndex, columnType, comment, false); + } + + @JsonCreator + public HiveColumnHandle( + @JsonProperty("name") String name, + @JsonProperty("hiveType") HiveType hiveType, + @JsonProperty("typeSignature") TypeSignature typeSignature, + @JsonProperty("hiveColumnIndex") int hiveColumnIndex, + @JsonProperty("columnType") ColumnType columnType, + @JsonProperty("comment") Optional comment, + @JsonProperty("required") boolean required) + { + this.name = requireNonNull(name, "name is null"); + checkArgument(hiveColumnIndex >= 0 || columnType == PARTITION_KEY || columnType == SYNTHESIZED || columnType == DUMMY_OFFLOADED, "hiveColumnIndex is negative"); + this.hiveColumnIndex = hiveColumnIndex; + this.hiveType = requireNonNull(hiveType, "hiveType is null"); + this.typeName = requireNonNull(typeSignature, "type is null"); + this.columnType = requireNonNull(columnType, "columnType is null"); + this.comment = requireNonNull(comment, "comment is null"); + this.required = required; + } + + @JsonProperty + public String getName() + { + return name; + } + + @Override + public String getColumnName() + { + return name; + } + + @JsonProperty + public HiveType getHiveType() + { + return hiveType; + } + + @JsonProperty + public int getHiveColumnIndex() + { + return hiveColumnIndex; + } + + public boolean isPartitionKey() + { + return columnType == PARTITION_KEY; + } + + public boolean isRegular() + { + return columnType == REGULAR; + } + + public boolean isHidden() + { + return columnType == SYNTHESIZED; + } + + public ColumnMetadata getColumnMetadata(TypeManager typeManager) + { + return new ColumnMetadata(name, typeManager.getType(typeName), true, null, null, isHidden(), Collections.emptyMap(), required); + } + + @JsonProperty + public Optional getComment() + { + return comment; + } + + @JsonProperty + public TypeSignature getTypeSignature() + { + return typeName; + } + + @JsonProperty + public ColumnType getColumnType() + { + return columnType; + } + + @JsonProperty + public boolean isRequired() + { + return required; + } + + @Override + public int hashCode() + { + return Objects.hash(name, hiveColumnIndex, hiveType, columnType, comment, required); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + HiveColumnHandle other = (HiveColumnHandle) obj; + return Objects.equals(this.name, other.name) && + Objects.equals(this.hiveColumnIndex, other.hiveColumnIndex) && + Objects.equals(this.hiveType, other.hiveType) && + Objects.equals(this.columnType, other.columnType) && + Objects.equals(this.comment, other.comment) && + Objects.equals(this.required, other.required); + } + + @Override + public String toString() + { + return name + ":" + hiveType + ":" + hiveColumnIndex + ":" + columnType; + } + + public 
static HiveColumnHandle updateRowIdHandle() + { + // Hive connector only supports metadata delete. It does not support generic row-by-row deletion. + // Metadata delete is implemented in Hetu by generating a plan for row-by-row delete first, + // and then optimize it into metadata delete. As a result, Hive connector must provide partial + // plan-time support for row-by-row delete so that planning doesn't fail. This is why we need + // rowid handle. Note that in Hive connector, rowid handle is not implemented beyond plan-time. + ArrayList acidColumnNames = Lists.newArrayList( + OrcPageSourceFactory.ACID_COLUMN_ORIGINAL_TRANSACTION, + OrcPageSourceFactory.ACID_COLUMN_BUCKET, + OrcPageSourceFactory.ACID_COLUMN_ROW_ID, + OrcPageSourceFactory.ACID_COLUMN_CURRENT_TRANSACTION, + OrcPageSourceFactory.ACID_COLUMN_OPERATION); + + ArrayList acidColumnTypes = Lists.newArrayList( + HIVE_LONG.getTypeInfo(), + HIVE_INT.getTypeInfo(), + HIVE_LONG.getTypeInfo(), + HIVE_LONG.getTypeInfo(), + HIVE_INT.getTypeInfo()); + StructTypeInfo structTypeInfo = new StructTypeInfo(); + structTypeInfo.setAllStructFieldNames(acidColumnNames); + structTypeInfo.setAllStructFieldTypeInfos(acidColumnTypes); + HiveType rowIdType = HiveType.createHiveType(structTypeInfo); + return new HiveColumnHandle(UPDATE_ROW_ID_COLUMN_NAME, rowIdType, rowIdType.getTypeSignature(), ROW_ID__COLUMN_INDEX, SYNTHESIZED, Optional.empty()); + } + + public static HiveColumnHandle pathColumnHandle() + { + return new HiveColumnHandle(PATH_COLUMN_NAME, PATH_HIVE_TYPE, PATH_TYPE_SIGNATURE, PATH_COLUMN_INDEX, SYNTHESIZED, Optional.empty()); + } + + /** + * The column indicating the bucket id. + * When table bucketing differs from partition bucketing, this column indicates + * what bucket the row will fall in under the table bucketing scheme. + */ + public static HiveColumnHandle bucketColumnHandle() + { + return new HiveColumnHandle(BUCKET_COLUMN_NAME, BUCKET_HIVE_TYPE, BUCKET_TYPE_SIGNATURE, BUCKET_COLUMN_INDEX, SYNTHESIZED, Optional.empty()); + } + + public static boolean isPathColumnHandle(HiveColumnHandle column) + { + return column.getHiveColumnIndex() == PATH_COLUMN_INDEX; + } + + public static boolean isBucketColumnHandle(HiveColumnHandle column) + { + return column.getHiveColumnIndex() == BUCKET_COLUMN_INDEX; + } + + public static boolean isUpdateColumnHandle(HiveColumnHandle column) + { + return column.getHiveColumnIndex() == ROW_ID__COLUMN_INDEX; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCompressionCodec.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCompressionCodec.java new file mode 100644 index 00000000..2f72f64d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveCompressionCodec.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.orc.metadata.CompressionKind; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.io.compress.Lz4Codec; +import org.apache.hadoop.io.compress.SnappyCodec; +import org.apache.hadoop.io.compress.ZStandardCodec; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public enum HiveCompressionCodec +{ + NONE(null, CompressionKind.NONE, CompressionCodecName.UNCOMPRESSED), + SNAPPY(SnappyCodec.class, CompressionKind.SNAPPY, CompressionCodecName.SNAPPY), + LZ4(Lz4Codec.class, CompressionKind.LZ4, CompressionCodecName.LZ4), + ZSTD(ZStandardCodec.class, CompressionKind.ZSTD, CompressionCodecName.ZSTD), + GZIP(GzipCodec.class, CompressionKind.ZLIB, CompressionCodecName.GZIP); + + private final Optional> codec; + private final CompressionKind orcCompressionKind; + private final CompressionCodecName parquetCompressionCodec; + + HiveCompressionCodec(Class codec, CompressionKind orcCompressionKind, CompressionCodecName parquetCompressionCodec) + { + this.codec = Optional.ofNullable(codec); + this.orcCompressionKind = requireNonNull(orcCompressionKind, "orcCompressionKind is null"); + this.parquetCompressionCodec = requireNonNull(parquetCompressionCodec, "parquetCompressionCodec is null"); + } + + public Optional> getCodec() + { + return codec; + } + + public CompressionKind getOrcCompressionKind() + { + return orcCompressionKind; + } + + public CompressionCodecName getParquetCompressionCodec() + { + return parquetCompressionCodec; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConfig.java new file mode 100644 index 00000000..bb9056df --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConfig.java @@ -0,0 +1,2099 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import com.google.common.net.HostAndPort; +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import io.airlift.configuration.DefunctConfig; +import io.airlift.log.Logger; +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import io.airlift.units.MaxDataSize; +import io.airlift.units.MinDataSize; +import io.airlift.units.MinDuration; +import io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode; +import io.prestosql.plugin.hive.s3.S3FileSystemType; +import io.prestosql.spi.function.Mandatory; +import io.prestosql.spi.queryeditorui.PropertyType; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import javax.validation.constraints.DecimalMax; +import javax.validation.constraints.DecimalMin; +import javax.validation.constraints.Max; +import javax.validation.constraints.Min; +import javax.validation.constraints.NotNull; + +import java.io.File; +import java.io.IOException; +import java.text.Normalizer; +import java.util.List; +import java.util.Optional; +import java.util.TimeZone; +import java.util.concurrent.TimeUnit; + +import static io.airlift.units.DataSize.Unit.BYTE; +import static io.airlift.units.DataSize.Unit.GIGABYTE; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static java.util.concurrent.TimeUnit.HOURS; +import static java.util.concurrent.TimeUnit.MINUTES; + +@DefunctConfig({ + "dfs.domain-socket-path", + "hive.file-system-cache-ttl", + "hive.max-global-split-iterator-threads", + "hive.max-sort-files-per-bucket", + "hive.bucket-writing", + "hive.optimized-reader.enabled", + "hive.orc.optimized-writer.enabled", + "hive.rcfile-optimized-writer.enabled", + "hive.time-zone", +}) +public class HiveConfig +{ + private static final Logger log = Logger.get(HiveConfig.class); + private static final Splitter SPLITTER = Splitter.on(',').trimResults().omitEmptyStrings(); + public static final double MIN_OFFLOAD_FACTOR = 0.5; + public static final long MIN_OFFLOAD_ROW_NUM = 500; + + private DataSize maxSplitSize = new DataSize(64, MEGABYTE); + private int maxPartitionsPerScan = 100_000; + private int maxOutstandingSplits = 1_000; + private DataSize maxOutstandingSplitsSize = new DataSize(256, MEGABYTE); + private int maxSplitIteratorThreads = 1_000; + private int minPartitionBatchSize = 10; + private int maxPartitionBatchSize = 100; + private int maxInitialSplits = 200; + private int splitLoaderConcurrency = 4; + private Integer maxSplitsPerSecond; + private DataSize maxInitialSplitSize; + private int domainCompactionThreshold = 100; + private DataSize writerSortBufferSize = new DataSize(64, MEGABYTE); + private boolean forceLocalScheduling; + private boolean recursiveDirWalkerEnabled; + + private int maxConcurrentFileRenames = 20; + + private boolean allowCorruptWritesForTesting; + + private Duration metastoreCacheTtl = new Duration(0, TimeUnit.SECONDS); + private Duration metastoreRefreshInterval = new Duration(1, TimeUnit.SECONDS); + + private Duration metastoreDBCacheTtl = new Duration(0, TimeUnit.SECONDS); + private Duration metastoreDBRefreshInterval = new Duration(1, TimeUnit.SECONDS); + + private long metastoreCacheMaximumSize = 10000; + private long perTransactionMetastoreCacheMaximumSize = 1000; + private int maxMetastoreRefreshThreads = 100; + private HostAndPort metastoreSocksProxy; + private Duration metastoreTimeout = new Duration(10, 
TimeUnit.SECONDS); + + private Duration ipcPingInterval = new Duration(10, TimeUnit.SECONDS); + private Duration dfsTimeout = new Duration(60, TimeUnit.SECONDS); + private Duration dfsConnectTimeout = new Duration(500, TimeUnit.MILLISECONDS); + private Duration dfsKeyProviderCacheTtl = new Duration(30, TimeUnit.MINUTES); + private int dfsConnectMaxRetries = 5; + private boolean verifyChecksum = true; + private String domainSocketPath; + + private S3FileSystemType s3FileSystemType = S3FileSystemType.PRESTO; + + private HiveStorageFormat hiveStorageFormat = HiveStorageFormat.ORC; + private HiveCompressionCodec hiveCompressionCodec = HiveCompressionCodec.GZIP; + private boolean respectTableFormat = true; + private boolean immutablePartitions; + private boolean createEmptyBucketFiles; + private int maxPartitionsPerWriter = 100; + private int maxOpenSortFiles = 50; + private int writeValidationThreads = 16; + + private List resourceConfigFiles = ImmutableList.of(); + + private DataSize textMaxLineLength = new DataSize(100, MEGABYTE); + + private String orcLegacyTimeZone = TimeZone.getDefault().getID(); + + private String parquetTimeZone = TimeZone.getDefault().getID(); + private boolean useParquetColumnNames; + private boolean failOnCorruptedParquetStatistics = true; + private DataSize parquetMaxReadBlockSize = new DataSize(16, MEGABYTE); + + private boolean assumeCanonicalPartitionKeys; + + private boolean useOrcColumnNames; + private boolean orcBloomFiltersEnabled; + private double orcDefaultBloomFilterFpp = 0.05; + private DataSize orcMaxMergeDistance = new DataSize(1, MEGABYTE); + private DataSize orcMaxBufferSize = new DataSize(8, MEGABYTE); + private DataSize orcTinyStripeThreshold = new DataSize(1, BYTE); + private DataSize orcStreamBufferSize = new DataSize(8, MEGABYTE); + private DataSize orcMaxReadBlockSize = new DataSize(16, MEGABYTE); + private boolean orcLazyReadSmallRanges = true; + private boolean orcWriteLegacyVersion; + private double orcWriterValidationPercentage; + private OrcWriteValidationMode orcWriterValidationMode = OrcWriteValidationMode.BOTH; + + private boolean orcFileTailCacheEnabled; + private Duration orcFileTailCacheTtl = new Duration(4, HOURS); + private long orcFileTailCacheLimit = 50_000; + private boolean orcStripeFooterCacheEnabled; + private Duration orcStripeFooterCacheTtl = new Duration(4, HOURS); + private long orcStripeFooterCacheLimit = 250_000; + private boolean orcRowIndexCacheEnabled; + private Duration orcRowIndexCacheTtl = new Duration(4, HOURS); + private long orcRowIndexCacheLimit = 250_000; + private boolean orcBloomFiltersCacheEnabled; + private Duration orcBloomFiltersCacheTtl = new Duration(4, HOURS); + private long orcBloomFiltersCacheLimit = 250_000; + private boolean orcRowDataCacheEnabled; + private Duration orcRowDataCacheTtl = new Duration(4, HOURS); + private DataSize orcRowDataCacheMaximumWeight = new DataSize(20, GIGABYTE); + + private String rcfileTimeZone = TimeZone.getDefault().getID(); + private boolean rcfileWriterValidate; + + private HiveMetastoreAuthenticationType hiveMetastoreAuthenticationType = HiveMetastoreAuthenticationType.NONE; + private HdfsAuthenticationType hdfsAuthenticationType = HdfsAuthenticationType.NONE; + private boolean hdfsImpersonationEnabled; + private boolean hdfsWireEncryptionEnabled; + + private boolean skipDeletionForAlter; + private boolean skipTargetCleanupOnRollback; + + private boolean bucketExecutionEnabled = true; + private boolean sortedWritingEnabled = true; + + private int 
fileSystemMaxCacheSize = 1000; + + private boolean optimizeMismatchedBucketCount; + private boolean writesToNonManagedTablesEnabled; + private boolean createsOfNonManagedTablesEnabled = true; + + private boolean tableStatisticsEnabled = true; + private int partitionStatisticsSampleSize = 100; + private boolean ignoreCorruptedStatistics; + private boolean collectColumnStatisticsOnWrite = true; + + private String recordingPath; + private boolean replay; + private Duration recordingDuration = new Duration(10, MINUTES); + private boolean s3SelectPushdownEnabled; + private int s3SelectPushdownMaxConnections = 500; + + private boolean temporaryStagingDirectoryEnabled = true; + private String temporaryStagingDirectoryPath = "/tmp/presto-${USER}"; + + private Duration fileStatusCacheExpireAfterWrite = new Duration(24, TimeUnit.HOURS); + private long fileStatusCacheMaxSize = 1000 * 1000; + private List fileStatusCacheTables = ImmutableList.of(); + + private Optional hiveTransactionHeartbeatInterval = Optional.empty(); + private int hiveTransactionHeartbeatThreads = 5; + + private boolean tableCreatesWithLocationAllowed = true; + + private boolean dynamicFilterPartitionFilteringEnabled = true; + private int dynamicFilteringRowFilteringThreshold = 2000; + + private boolean orcCacheStatsMetricCollectionEnabled; + + private int vacuumDeltaNumThreshold = 10; + private double vacuumDeltaPercentThreshold = 0.1; + private boolean autoVacuumEnabled; + private boolean orcPredicatePushdownEnabled; + + private boolean omniDataSslEnabled; + private Optional omniDataSslPkiDir = Optional.empty(); + private Optional omniDataSslClientCertFilePath = Optional.empty(); + private Optional omniDataSslPrivateKeyFilePath = Optional.empty(); + private Optional omniDataSslTrustCertFilePath = Optional.empty(); + private Optional omniDataSslCrlFilePath = Optional.empty(); + + private boolean omniDataEnabled; + private boolean filterOffloadEnabled = true; + private double minFilterOffloadFactor = MIN_OFFLOAD_FACTOR; + private boolean aggregatorOffloadEnabled = true; + private double minAggregatorOffloadFactor = MIN_OFFLOAD_FACTOR; + private long minOffloadRowNumber = MIN_OFFLOAD_ROW_NUM; + + private int hmsWriteBatchSize = 8; + + public int getMaxInitialSplits() + { + return maxInitialSplits; + } + + private boolean tlsEnabled; + + private Duration vacuumCleanupRecheckInterval = new Duration(5, MINUTES); + private int vacuumServiceThreads = 2; + private int metastoreClientServiceThreads = 4; + private Optional vacuumCollectorInterval = Optional.of(new Duration(5, MINUTES)); + + private int maxNumbSplitsToGroup = 1; + + private boolean workerMetaStoreCacheEnabled; + + @Config("hive.max-initial-splits") + public HiveConfig setMaxInitialSplits(int maxInitialSplits) + { + this.maxInitialSplits = maxInitialSplits; + return this; + } + + public DataSize getMaxInitialSplitSize() + { + if (maxInitialSplitSize == null) { + return new DataSize(maxSplitSize.getValue() / 2, maxSplitSize.getUnit()); + } + return maxInitialSplitSize; + } + + @Config("hive.max-initial-split-size") + public HiveConfig setMaxInitialSplitSize(DataSize maxInitialSplitSize) + { + this.maxInitialSplitSize = maxInitialSplitSize; + return this; + } + + @Min(1) + public int getSplitLoaderConcurrency() + { + return splitLoaderConcurrency; + } + + @Config("hive.split-loader-concurrency") + public HiveConfig setSplitLoaderConcurrency(int splitLoaderConcurrency) + { + this.splitLoaderConcurrency = splitLoaderConcurrency; + return this; + } + + @Min(1) + @Nullable + 
public Integer getMaxSplitsPerSecond() + { + return maxSplitsPerSecond; + } + + @Config("hive.max-splits-per-second") + @ConfigDescription("Throttles the maximum number of splits that can be assigned to tasks per second") + public HiveConfig setMaxSplitsPerSecond(Integer maxSplitsPerSecond) + { + this.maxSplitsPerSecond = maxSplitsPerSecond; + return this; + } + + @Min(1) + public int getDomainCompactionThreshold() + { + return domainCompactionThreshold; + } + + @Config("hive.domain-compaction-threshold") + @ConfigDescription("Maximum ranges to allow in a tuple domain without compacting it") + public HiveConfig setDomainCompactionThreshold(int domainCompactionThreshold) + { + this.domainCompactionThreshold = domainCompactionThreshold; + return this; + } + + @MinDataSize("1MB") + @MaxDataSize("1GB") + public DataSize getWriterSortBufferSize() + { + return writerSortBufferSize; + } + + @Config("hive.writer-sort-buffer-size") + public HiveConfig setWriterSortBufferSize(DataSize writerSortBufferSize) + { + this.writerSortBufferSize = writerSortBufferSize; + return this; + } + + public boolean isForceLocalScheduling() + { + return forceLocalScheduling; + } + + @Config("hive.force-local-scheduling") + public HiveConfig setForceLocalScheduling(boolean forceLocalScheduling) + { + this.forceLocalScheduling = forceLocalScheduling; + return this; + } + + @Min(1) + public int getMaxConcurrentFileRenames() + { + return maxConcurrentFileRenames; + } + + @Config("hive.max-concurrent-file-renames") + public HiveConfig setMaxConcurrentFileRenames(int maxConcurrentFileRenames) + { + this.maxConcurrentFileRenames = maxConcurrentFileRenames; + return this; + } + + @Config("hive.recursive-directories") + public HiveConfig setRecursiveDirWalkerEnabled(boolean recursiveDirWalkerEnabled) + { + this.recursiveDirWalkerEnabled = recursiveDirWalkerEnabled; + return this; + } + + public boolean getRecursiveDirWalkerEnabled() + { + return recursiveDirWalkerEnabled; + } + + @NotNull + public DataSize getMaxSplitSize() + { + return maxSplitSize; + } + + @Config("hive.max-split-size") + public HiveConfig setMaxSplitSize(DataSize maxSplitSize) + { + this.maxSplitSize = maxSplitSize; + return this; + } + + @Min(1) + public int getMaxPartitionsPerScan() + { + return maxPartitionsPerScan; + } + + @Config("hive.max-partitions-per-scan") + @ConfigDescription("Maximum allowed partitions for a single table scan") + public HiveConfig setMaxPartitionsPerScan(int maxPartitionsPerScan) + { + this.maxPartitionsPerScan = maxPartitionsPerScan; + return this; + } + + @Min(1) + public int getMaxOutstandingSplits() + { + return maxOutstandingSplits; + } + + @Config("hive.max-outstanding-splits") + @ConfigDescription("Target number of buffered splits for each table scan in a query, before the scheduler tries to pause itself") + public HiveConfig setMaxOutstandingSplits(int maxOutstandingSplits) + { + this.maxOutstandingSplits = maxOutstandingSplits; + return this; + } + + @MinDataSize("1MB") + public DataSize getMaxOutstandingSplitsSize() + { + return maxOutstandingSplitsSize; + } + + @Config("hive.max-outstanding-splits-size") + @ConfigDescription("Maximum amount of memory allowed for split buffering for each table scan in a query, before the query is failed") + public HiveConfig setMaxOutstandingSplitsSize(DataSize maxOutstandingSplits) + { + this.maxOutstandingSplitsSize = maxOutstandingSplits; + return this; + } + + @Min(1) + public int getMaxSplitIteratorThreads() + { + return maxSplitIteratorThreads; + } + + 
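+    // Illustrative note: the split-tuning settings defined in this class are exposed as catalog
+    // properties. A catalog properties file that simply restates the defaults declared above might
+    // contain, for example:
+    //   hive.max-split-size=64MB
+    //   hive.max-initial-splits=200
+    //   hive.max-outstanding-splits=1000
+    //   hive.split-loader-concurrency=4
+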
@Config("hive.max-split-iterator-threads") + public HiveConfig setMaxSplitIteratorThreads(int maxSplitIteratorThreads) + { + this.maxSplitIteratorThreads = maxSplitIteratorThreads; + return this; + } + + @Deprecated + public boolean getAllowCorruptWritesForTesting() + { + return allowCorruptWritesForTesting; + } + + @Deprecated + @Config("hive.allow-corrupt-writes-for-testing") + @ConfigDescription("Allow Hive connector to write data even when data will likely be corrupt") + public HiveConfig setAllowCorruptWritesForTesting(boolean allowCorruptWritesForTesting) + { + this.allowCorruptWritesForTesting = allowCorruptWritesForTesting; + return this; + } + + @NotNull + public @MinDuration("0ms") Duration getMetastoreCacheTtl() + { + return metastoreCacheTtl; + } + + @Config("hive.metastore-cache-ttl") + public HiveConfig setMetastoreCacheTtl(Duration metastoreCacheTtl) + { + this.metastoreCacheTtl = metastoreCacheTtl; + return this; + } + + @NotNull + public @MinDuration("1ms") Duration getMetastoreRefreshInterval() + { + return metastoreRefreshInterval; + } + + @Config("hive.metastore-refresh-interval") + public HiveConfig setMetastoreRefreshInterval(Duration metastoreRefreshInterval) + { + this.metastoreRefreshInterval = metastoreRefreshInterval; + return this; + } + + @NotNull + public @MinDuration("0ms") Duration getMetastoreDBCacheTtl() + { + return metastoreDBCacheTtl; + } + + @Config("hive.metastore-db-cache-ttl") + public HiveConfig setMetastoreDBCacheTtl(Duration metastoreCacheTtl) + { + this.metastoreDBCacheTtl = metastoreCacheTtl; + return this; + } + + @NotNull + public @MinDuration("1ms") Duration getMetastoreDBRefreshInterval() + { + return metastoreDBRefreshInterval; + } + + @Config("hive.metastore-db-refresh-interval") + public HiveConfig setMetastoreDBRefreshInterval(Duration metastoreDBRefreshInterval) + { + this.metastoreDBRefreshInterval = metastoreDBRefreshInterval; + return this; + } + + @Min(1) + public long getMetastoreCacheMaximumSize() + { + return metastoreCacheMaximumSize; + } + + @Config("hive.metastore-cache-maximum-size") + public HiveConfig setMetastoreCacheMaximumSize(long metastoreCacheMaximumSize) + { + this.metastoreCacheMaximumSize = metastoreCacheMaximumSize; + return this; + } + + @Min(1) + public long getPerTransactionMetastoreCacheMaximumSize() + { + return perTransactionMetastoreCacheMaximumSize; + } + + @Config("hive.per-transaction-metastore-cache-maximum-size") + public HiveConfig setPerTransactionMetastoreCacheMaximumSize(long perTransactionMetastoreCacheMaximumSize) + { + this.perTransactionMetastoreCacheMaximumSize = perTransactionMetastoreCacheMaximumSize; + return this; + } + + @Min(10) + public int getMaxMetastoreRefreshThreads() + { + return maxMetastoreRefreshThreads; + } + + @Config("hive.metastore-refresh-max-threads") + public HiveConfig setMaxMetastoreRefreshThreads(int maxMetastoreRefreshThreads) + { + this.maxMetastoreRefreshThreads = maxMetastoreRefreshThreads; + return this; + } + + public HostAndPort getMetastoreSocksProxy() + { + return metastoreSocksProxy; + } + + @Config("hive.metastore.thrift.client.socks-proxy") + public HiveConfig setMetastoreSocksProxy(HostAndPort metastoreSocksProxy) + { + this.metastoreSocksProxy = metastoreSocksProxy; + return this; + } + + @NotNull + public Duration getMetastoreTimeout() + { + return metastoreTimeout; + } + + @Config("hive.metastore-timeout") + public HiveConfig setMetastoreTimeout(Duration metastoreTimeout) + { + this.metastoreTimeout = metastoreTimeout; + return this; + } + + @Min(1) + 
public int getMinPartitionBatchSize() + { + return minPartitionBatchSize; + } + + @Config("hive.metastore.partition-batch-size.min") + public HiveConfig setMinPartitionBatchSize(int minPartitionBatchSize) + { + this.minPartitionBatchSize = minPartitionBatchSize; + return this; + } + + @Min(1) + public int getMaxPartitionBatchSize() + { + return maxPartitionBatchSize; + } + + @Config("hive.metastore.partition-batch-size.max") + public HiveConfig setMaxPartitionBatchSize(int maxPartitionBatchSize) + { + this.maxPartitionBatchSize = maxPartitionBatchSize; + return this; + } + + @NotNull + public List getResourceConfigFiles() + { + return resourceConfigFiles; + } + + @Mandatory(name = "hive.config.resources", + description = "An optional comma-separated list of HDFS configuration files. These files must exist on the machines running openLooKeng. Only specify this if absolutely necessary to access HDFS. Ensure to upload these files.", + defaultValue = "core-site.xml,hdfs-site.xml", + readOnly = true, + type = PropertyType.FILES) + @Config("hive.config.resources") + public HiveConfig setResourceConfigFiles(String files) + { + this.resourceConfigFiles = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(files); + return this; + } + + public HiveConfig setResourceConfigFiles(List files) + { + this.resourceConfigFiles = ImmutableList.copyOf(files); + return this; + } + + @NotNull + @MinDuration("1ms") + public Duration getIpcPingInterval() + { + return ipcPingInterval; + } + + @Config("hive.dfs.ipc-ping-interval") + public HiveConfig setIpcPingInterval(Duration pingInterval) + { + this.ipcPingInterval = pingInterval; + return this; + } + + @NotNull + @MinDuration("1ms") + public Duration getDfsTimeout() + { + return dfsTimeout; + } + + @Config("hive.dfs-timeout") + public HiveConfig setDfsTimeout(Duration dfsTimeout) + { + this.dfsTimeout = dfsTimeout; + return this; + } + + @NotNull + @MinDuration("0ms") + public Duration getDfsKeyProviderCacheTtl() + { + return dfsKeyProviderCacheTtl; + } + + @Config("hive.dfs.key-provider.cache-ttl") + public HiveConfig setDfsKeyProviderCacheTtl(Duration dfsClientKeyProviderCacheTtl) + { + this.dfsKeyProviderCacheTtl = dfsClientKeyProviderCacheTtl; + return this; + } + + @MinDuration("1ms") + @NotNull + public Duration getDfsConnectTimeout() + { + return dfsConnectTimeout; + } + + @Config("hive.dfs.connect.timeout") + public HiveConfig setDfsConnectTimeout(Duration dfsConnectTimeout) + { + this.dfsConnectTimeout = dfsConnectTimeout; + return this; + } + + @Min(0) + public int getDfsConnectMaxRetries() + { + return dfsConnectMaxRetries; + } + + @Config("hive.dfs.connect.max-retries") + public HiveConfig setDfsConnectMaxRetries(int dfsConnectMaxRetries) + { + this.dfsConnectMaxRetries = dfsConnectMaxRetries; + return this; + } + + public HiveStorageFormat getHiveStorageFormat() + { + return hiveStorageFormat; + } + + @Config("hive.storage-format") + public HiveConfig setHiveStorageFormat(HiveStorageFormat hiveStorageFormat) + { + this.hiveStorageFormat = hiveStorageFormat; + return this; + } + + public HiveCompressionCodec getHiveCompressionCodec() + { + return hiveCompressionCodec; + } + + @Config("hive.compression-codec") + public HiveConfig setHiveCompressionCodec(HiveCompressionCodec hiveCompressionCodec) + { + this.hiveCompressionCodec = hiveCompressionCodec; + return this; + } + + public boolean isRespectTableFormat() + { + return respectTableFormat; + } + + @Config("hive.respect-table-format") + @ConfigDescription("Should new partitions be 
written using the existing table format or the default Presto format") + public HiveConfig setRespectTableFormat(boolean respectTableFormat) + { + this.respectTableFormat = respectTableFormat; + return this; + } + + public boolean isImmutablePartitions() + { + return immutablePartitions; + } + + @Config("hive.immutable-partitions") + @ConfigDescription("Can new data be inserted into existing partitions or existing unpartitioned tables") + public HiveConfig setImmutablePartitions(boolean immutablePartitions) + { + this.immutablePartitions = immutablePartitions; + return this; + } + + public boolean isCreateEmptyBucketFiles() + { + return createEmptyBucketFiles; + } + + @Config("hive.create-empty-bucket-files") + @ConfigDescription("Create empty files for buckets that have no data") + public HiveConfig setCreateEmptyBucketFiles(boolean createEmptyBucketFiles) + { + this.createEmptyBucketFiles = createEmptyBucketFiles; + return this; + } + + @Min(1) + public int getMaxPartitionsPerWriter() + { + return maxPartitionsPerWriter; + } + + @Config("hive.max-partitions-per-writers") + @ConfigDescription("Maximum number of partitions per writer") + public HiveConfig setMaxPartitionsPerWriter(int maxPartitionsPerWriter) + { + this.maxPartitionsPerWriter = maxPartitionsPerWriter; + return this; + } + + @Min(2) + @Max(1000) + public int getMaxOpenSortFiles() + { + return maxOpenSortFiles; + } + + @Config("hive.max-open-sort-files") + @ConfigDescription("Maximum number of writer temporary files to read in one pass") + public HiveConfig setMaxOpenSortFiles(int maxOpenSortFiles) + { + this.maxOpenSortFiles = maxOpenSortFiles; + return this; + } + + public int getWriteValidationThreads() + { + return writeValidationThreads; + } + + @Config("hive.write-validation-threads") + @ConfigDescription("Number of threads used for verifying data after a write") + public HiveConfig setWriteValidationThreads(int writeValidationThreads) + { + this.writeValidationThreads = writeValidationThreads; + return this; + } + + public String getDomainSocketPath() + { + return domainSocketPath; + } + + @Config("hive.dfs.domain-socket-path") + public HiveConfig setDomainSocketPath(String domainSocketPath) + { + this.domainSocketPath = domainSocketPath; + return this; + } + + @NotNull + public S3FileSystemType getS3FileSystemType() + { + return s3FileSystemType; + } + + @Config("hive.s3-file-system-type") + public HiveConfig setS3FileSystemType(S3FileSystemType s3FileSystemType) + { + this.s3FileSystemType = s3FileSystemType; + return this; + } + + public boolean isVerifyChecksum() + { + return verifyChecksum; + } + + @Config("hive.dfs.verify-checksum") + public HiveConfig setVerifyChecksum(boolean verifyChecksum) + { + this.verifyChecksum = verifyChecksum; + return this; + } + + public boolean isUseOrcColumnNames() + { + return useOrcColumnNames; + } + + @Config("hive.orc.use-column-names") + @ConfigDescription("Access ORC columns using names from the file") + public HiveConfig setUseOrcColumnNames(boolean useOrcColumnNames) + { + this.useOrcColumnNames = useOrcColumnNames; + return this; + } + + @NotNull + public DataSize getOrcMaxMergeDistance() + { + return orcMaxMergeDistance; + } + + @Config("hive.orc.max-merge-distance") + public HiveConfig setOrcMaxMergeDistance(DataSize orcMaxMergeDistance) + { + this.orcMaxMergeDistance = orcMaxMergeDistance; + return this; + } + + @NotNull + public DataSize getOrcMaxBufferSize() + { + return orcMaxBufferSize; + } + + @Config("hive.orc.max-buffer-size") + public HiveConfig 
setOrcMaxBufferSize(DataSize orcMaxBufferSize) + { + this.orcMaxBufferSize = orcMaxBufferSize; + return this; + } + + @NotNull + public DataSize getOrcStreamBufferSize() + { + return orcStreamBufferSize; + } + + @Config("hive.orc.stream-buffer-size") + public HiveConfig setOrcStreamBufferSize(DataSize orcStreamBufferSize) + { + this.orcStreamBufferSize = orcStreamBufferSize; + return this; + } + + @NotNull + public DataSize getOrcTinyStripeThreshold() + { + return orcTinyStripeThreshold; + } + + @Config("hive.orc.tiny-stripe-threshold") + public HiveConfig setOrcTinyStripeThreshold(DataSize orcTinyStripeThreshold) + { + this.orcTinyStripeThreshold = orcTinyStripeThreshold; + return this; + } + + @NotNull + public DataSize getOrcMaxReadBlockSize() + { + return orcMaxReadBlockSize; + } + + @Config("hive.orc.max-read-block-size") + public HiveConfig setOrcMaxReadBlockSize(DataSize orcMaxReadBlockSize) + { + this.orcMaxReadBlockSize = orcMaxReadBlockSize; + return this; + } + + @Deprecated + public boolean isOrcLazyReadSmallRanges() + { + return orcLazyReadSmallRanges; + } + + // TODO remove config option once efficacy is proven + @Deprecated + @Config("hive.orc.lazy-read-small-ranges") + @ConfigDescription("ORC read small disk ranges lazily") + public HiveConfig setOrcLazyReadSmallRanges(boolean orcLazyReadSmallRanges) + { + this.orcLazyReadSmallRanges = orcLazyReadSmallRanges; + return this; + } + + public boolean isOrcBloomFiltersEnabled() + { + return orcBloomFiltersEnabled; + } + + @Config("hive.orc.bloom-filters.enabled") + public HiveConfig setOrcBloomFiltersEnabled(boolean orcBloomFiltersEnabled) + { + this.orcBloomFiltersEnabled = orcBloomFiltersEnabled; + return this; + } + + public double getOrcDefaultBloomFilterFpp() + { + return orcDefaultBloomFilterFpp; + } + + @Config("hive.orc.default-bloom-filter-fpp") + @ConfigDescription("ORC Bloom filter false positive probability") + public HiveConfig setOrcDefaultBloomFilterFpp(double orcDefaultBloomFilterFpp) + { + this.orcDefaultBloomFilterFpp = orcDefaultBloomFilterFpp; + return this; + } + + public boolean isOrcWriteLegacyVersion() + { + return orcWriteLegacyVersion; + } + + @Config("hive.orc.writer.use-legacy-version-number") + @ConfigDescription("Write ORC files with a version number that is readable by Hive 2.0.0 to 2.2.0") + public HiveConfig setOrcWriteLegacyVersion(boolean orcWriteLegacyVersion) + { + this.orcWriteLegacyVersion = orcWriteLegacyVersion; + return this; + } + + @DecimalMin("0.0") + @DecimalMax("100.0") + public double getOrcWriterValidationPercentage() + { + return orcWriterValidationPercentage; + } + + @Config("hive.orc.writer.validation-percentage") + @ConfigDescription("Percentage of ORC files to validate after write by re-reading the whole file") + public HiveConfig setOrcWriterValidationPercentage(double orcWriterValidationPercentage) + { + this.orcWriterValidationPercentage = orcWriterValidationPercentage; + return this; + } + + @NotNull + public OrcWriteValidationMode getOrcWriterValidationMode() + { + return orcWriterValidationMode; + } + + @Config("hive.orc.writer.validation-mode") + @ConfigDescription("Level of detail in ORC validation. 
Lower levels require more memory.") + public HiveConfig setOrcWriterValidationMode(OrcWriteValidationMode orcWriterValidationMode) + { + this.orcWriterValidationMode = orcWriterValidationMode; + return this; + } + + public DateTimeZone getRcfileDateTimeZone() + { + return DateTimeZone.forTimeZone(TimeZone.getTimeZone(rcfileTimeZone)); + } + + @NotNull + public String getRcfileTimeZone() + { + return rcfileTimeZone; + } + + @Config("hive.rcfile.time-zone") + @ConfigDescription("Time zone for RCFile binary read and write") + public HiveConfig setRcfileTimeZone(String rcfileTimeZone) + { + this.rcfileTimeZone = rcfileTimeZone; + return this; + } + + public boolean isRcfileWriterValidate() + { + return rcfileWriterValidate; + } + + @Config("hive.rcfile.writer.validate") + @ConfigDescription("Validate RCFile after write by re-reading the whole file") + public HiveConfig setRcfileWriterValidate(boolean rcfileWriterValidate) + { + this.rcfileWriterValidate = rcfileWriterValidate; + return this; + } + + public boolean isAssumeCanonicalPartitionKeys() + { + return assumeCanonicalPartitionKeys; + } + + @Config("hive.assume-canonical-partition-keys") + public HiveConfig setAssumeCanonicalPartitionKeys(boolean assumeCanonicalPartitionKeys) + { + this.assumeCanonicalPartitionKeys = assumeCanonicalPartitionKeys; + return this; + } + + @MinDataSize("1B") + @MaxDataSize("1GB") + @NotNull + public DataSize getTextMaxLineLength() + { + return textMaxLineLength; + } + + @Config("hive.text.max-line-length") + @ConfigDescription("Maximum line length for text files") + public HiveConfig setTextMaxLineLength(DataSize textMaxLineLength) + { + this.textMaxLineLength = textMaxLineLength; + return this; + } + + public DateTimeZone getOrcLegacyDateTimeZone() + { + return DateTimeZone.forTimeZone(TimeZone.getTimeZone(orcLegacyTimeZone)); + } + + @NotNull + public String getOrcLegacyTimeZone() + { + return orcLegacyTimeZone; + } + + @Config("hive.orc.time-zone") + @ConfigDescription("Time zone for legacy ORC files that do not contain a time zone") + public HiveConfig setOrcLegacyTimeZone(String orcLegacyTimeZone) + { + this.orcLegacyTimeZone = orcLegacyTimeZone; + return this; + } + + public DateTimeZone getParquetDateTimeZone() + { + return DateTimeZone.forTimeZone(TimeZone.getTimeZone(parquetTimeZone)); + } + + @NotNull + public String getParquetTimeZone() + { + return parquetTimeZone; + } + + @Config("hive.parquet.time-zone") + @ConfigDescription("Time zone for Parquet read and write") + public HiveConfig setParquetTimeZone(String parquetTimeZone) + { + this.parquetTimeZone = parquetTimeZone; + return this; + } + + public boolean isUseParquetColumnNames() + { + return useParquetColumnNames; + } + + @Config("hive.parquet.use-column-names") + @ConfigDescription("Access Parquet columns using names from the file") + public HiveConfig setUseParquetColumnNames(boolean useParquetColumnNames) + { + this.useParquetColumnNames = useParquetColumnNames; + return this; + } + + public boolean isFailOnCorruptedParquetStatistics() + { + return failOnCorruptedParquetStatistics; + } + + @Config("hive.parquet.fail-on-corrupted-statistics") + @ConfigDescription("Fail when scanning Parquet files with corrupted statistics") + public HiveConfig setFailOnCorruptedParquetStatistics(boolean failOnCorruptedParquetStatistics) + { + this.failOnCorruptedParquetStatistics = failOnCorruptedParquetStatistics; + return this; + } + + @NotNull + public DataSize getParquetMaxReadBlockSize() + { + return parquetMaxReadBlockSize; + } + + 
@Config("hive.parquet.max-read-block-size") + public HiveConfig setParquetMaxReadBlockSize(DataSize parquetMaxReadBlockSize) + { + this.parquetMaxReadBlockSize = parquetMaxReadBlockSize; + return this; + } + + public boolean isOptimizeMismatchedBucketCount() + { + return optimizeMismatchedBucketCount; + } + + @Config("hive.optimize-mismatched-bucket-count") + public HiveConfig setOptimizeMismatchedBucketCount(boolean optimizeMismatchedBucketCount) + { + this.optimizeMismatchedBucketCount = optimizeMismatchedBucketCount; + return this; + } + + public List getFileStatusCacheTables() + { + return fileStatusCacheTables; + } + + @Config("hive.file-status-cache-tables") + public HiveConfig setFileStatusCacheTables(String fileStatusCacheTables) + { + this.fileStatusCacheTables = SPLITTER.splitToList(fileStatusCacheTables); + return this; + } + + public long getFileStatusCacheMaxSize() + { + return fileStatusCacheMaxSize; + } + + @Config("hive.file-status-cache-size") + public HiveConfig setFileStatusCacheMaxSize(long fileStatusCacheMaxSize) + { + this.fileStatusCacheMaxSize = fileStatusCacheMaxSize; + return this; + } + + public Duration getFileStatusCacheExpireAfterWrite() + { + return fileStatusCacheExpireAfterWrite; + } + + @Config("hive.file-status-cache-expire-time") + public HiveConfig setFileStatusCacheExpireAfterWrite(Duration fileStatusCacheExpireAfterWrite) + { + this.fileStatusCacheExpireAfterWrite = fileStatusCacheExpireAfterWrite; + return this; + } + + public int getMetastoreWriteBatchSize() + { + return hmsWriteBatchSize; + } + + @Config("hive.metastore-write-batch-size") + @ConfigDescription("Batch size for writing to hms") + public HiveConfig setMetastoreWriteBatchSize(int hmsWriteBatchSize) + { + this.hmsWriteBatchSize = hmsWriteBatchSize; + return this; + } + + public enum HiveMetastoreAuthenticationType + { + NONE, + KERBEROS + } + + @NotNull + public HiveMetastoreAuthenticationType getHiveMetastoreAuthenticationType() + { + return hiveMetastoreAuthenticationType; + } + + @Config("hive.metastore.authentication.type") + @ConfigDescription("Hive Metastore authentication type") + public HiveConfig setHiveMetastoreAuthenticationType( + HiveMetastoreAuthenticationType hiveMetastoreAuthenticationType) + { + this.hiveMetastoreAuthenticationType = hiveMetastoreAuthenticationType; + return this; + } + + public enum HdfsAuthenticationType + { + NONE, + KERBEROS, + } + + @NotNull + public HdfsAuthenticationType getHdfsAuthenticationType() + { + return hdfsAuthenticationType; + } + + @Config("hive.hdfs.authentication.type") + @ConfigDescription("HDFS authentication type") + public HiveConfig setHdfsAuthenticationType(HdfsAuthenticationType hdfsAuthenticationType) + { + this.hdfsAuthenticationType = hdfsAuthenticationType; + return this; + } + + public boolean isHdfsImpersonationEnabled() + { + return hdfsImpersonationEnabled; + } + + @Config("hive.hdfs.impersonation.enabled") + @ConfigDescription("Should Presto user be impersonated when communicating with HDFS") + public HiveConfig setHdfsImpersonationEnabled(boolean hdfsImpersonationEnabled) + { + this.hdfsImpersonationEnabled = hdfsImpersonationEnabled; + return this; + } + + public boolean isHdfsWireEncryptionEnabled() + { + return hdfsWireEncryptionEnabled; + } + + @Config("hive.hdfs.wire-encryption.enabled") + @ConfigDescription("Should be turned on when HDFS wire encryption is enabled") + public HiveConfig setHdfsWireEncryptionEnabled(boolean hdfsWireEncryptionEnabled) + { + this.hdfsWireEncryptionEnabled = 
hdfsWireEncryptionEnabled; + return this; + } + + public boolean isSkipDeletionForAlter() + { + return skipDeletionForAlter; + } + + @Config("hive.skip-deletion-for-alter") + @ConfigDescription("Skip deletion of old partition data when a partition is deleted and then inserted in the same transaction") + public HiveConfig setSkipDeletionForAlter(boolean skipDeletionForAlter) + { + this.skipDeletionForAlter = skipDeletionForAlter; + return this; + } + + public boolean isSkipTargetCleanupOnRollback() + { + return skipTargetCleanupOnRollback; + } + + @Config("hive.skip-target-cleanup-on-rollback") + @ConfigDescription("Skip deletion of target directories when a metastore operation fails") + public HiveConfig setSkipTargetCleanupOnRollback(boolean skipTargetCleanupOnRollback) + { + this.skipTargetCleanupOnRollback = skipTargetCleanupOnRollback; + return this; + } + + public boolean isBucketExecutionEnabled() + { + return bucketExecutionEnabled; + } + + @Config("hive.bucket-execution") + @ConfigDescription("Enable bucket-aware execution: only use a single worker per bucket") + public HiveConfig setBucketExecutionEnabled(boolean bucketExecutionEnabled) + { + this.bucketExecutionEnabled = bucketExecutionEnabled; + return this; + } + + public boolean isSortedWritingEnabled() + { + return sortedWritingEnabled; + } + + @Config("hive.sorted-writing") + @ConfigDescription("Enable writing to bucketed sorted tables") + public HiveConfig setSortedWritingEnabled(boolean sortedWritingEnabled) + { + this.sortedWritingEnabled = sortedWritingEnabled; + return this; + } + + public int getFileSystemMaxCacheSize() + { + return fileSystemMaxCacheSize; + } + + @Config("hive.fs.cache.max-size") + @ConfigDescription("Hadoop FileSystem cache size") + public HiveConfig setFileSystemMaxCacheSize(int fileSystemMaxCacheSize) + { + this.fileSystemMaxCacheSize = fileSystemMaxCacheSize; + return this; + } + + @Config("hive.non-managed-table-writes-enabled") + @ConfigDescription("Enable writes to non-managed (external) tables") + public HiveConfig setWritesToNonManagedTablesEnabled(boolean writesToNonManagedTablesEnabled) + { + this.writesToNonManagedTablesEnabled = writesToNonManagedTablesEnabled; + return this; + } + + public boolean getWritesToNonManagedTablesEnabled() + { + return writesToNonManagedTablesEnabled; + } + + @Deprecated + @Config("hive.non-managed-table-creates-enabled") + @ConfigDescription("Enable non-managed (external) table creates") + public HiveConfig setCreatesOfNonManagedTablesEnabled(boolean createsOfNonManagedTablesEnabled) + { + this.createsOfNonManagedTablesEnabled = createsOfNonManagedTablesEnabled; + return this; + } + + @Deprecated + public boolean getCreatesOfNonManagedTablesEnabled() + { + return createsOfNonManagedTablesEnabled; + } + + @Config("hive.table-statistics-enabled") + @ConfigDescription("Enable use of table statistics") + public HiveConfig setTableStatisticsEnabled(boolean tableStatisticsEnabled) + { + this.tableStatisticsEnabled = tableStatisticsEnabled; + return this; + } + + public boolean isTableStatisticsEnabled() + { + return tableStatisticsEnabled; + } + + @Min(1) + public int getPartitionStatisticsSampleSize() + { + return partitionStatisticsSampleSize; + } + + @Config("hive.partition-statistics-sample-size") + @ConfigDescription("Maximum sample size of the partitions column statistics") + public HiveConfig setPartitionStatisticsSampleSize(int partitionStatisticsSampleSize) + { + this.partitionStatisticsSampleSize = partitionStatisticsSampleSize; + return this; + } + + 
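+    // Illustrative note: the statistics-related settings in this class are exposed as catalog
+    // properties. Restating the defaults declared here, a catalog properties file might contain:
+    //   hive.table-statistics-enabled=true
+    //   hive.partition-statistics-sample-size=100
+    //   hive.ignore-corrupted-statistics=false
+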
public boolean isIgnoreCorruptedStatistics() + { + return ignoreCorruptedStatistics; + } + + @Config("hive.ignore-corrupted-statistics") + @ConfigDescription("Ignore corrupted statistics rather than failing") + public HiveConfig setIgnoreCorruptedStatistics(boolean ignoreCorruptedStatistics) + { + this.ignoreCorruptedStatistics = ignoreCorruptedStatistics; + return this; + } + + public boolean isCollectColumnStatisticsOnWrite() + { + return collectColumnStatisticsOnWrite; + } + + @Config("hive.collect-column-statistics-on-write") + @ConfigDescription("Enables automatic column level statistics collection on write") + public HiveConfig setCollectColumnStatisticsOnWrite(boolean collectColumnStatisticsOnWrite) + { + this.collectColumnStatisticsOnWrite = collectColumnStatisticsOnWrite; + return this; + } + + @Config("hive.metastore-recording-path") + public HiveConfig setRecordingPath(String recordingPath) + { + this.recordingPath = recordingPath; + return this; + } + + public String getRecordingPath() + { + return recordingPath; + } + + @Config("hive.replay-metastore-recording") + public HiveConfig setReplay(boolean replay) + { + this.replay = replay; + return this; + } + + public boolean isReplay() + { + return replay; + } + + @Config("hive.metastore-recording-duration") + public HiveConfig setRecordingDuration(Duration recordingDuration) + { + this.recordingDuration = recordingDuration; + return this; + } + + @NotNull + public Duration getRecordingDuration() + { + return recordingDuration; + } + + public boolean isS3SelectPushdownEnabled() + { + return s3SelectPushdownEnabled; + } + + @Config("hive.s3select-pushdown.enabled") + @ConfigDescription("Enable query pushdown to AWS S3 Select service") + public HiveConfig setS3SelectPushdownEnabled(boolean s3SelectPushdownEnabled) + { + this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; + return this; + } + + @Min(1) + public int getS3SelectPushdownMaxConnections() + { + return s3SelectPushdownMaxConnections; + } + + @Config("hive.s3select-pushdown.max-connections") + public HiveConfig setS3SelectPushdownMaxConnections(int s3SelectPushdownMaxConnections) + { + this.s3SelectPushdownMaxConnections = s3SelectPushdownMaxConnections; + return this; + } + + @Config("hive.temporary-staging-directory-enabled") + @ConfigDescription("Should use (if possible) temporary staging directory for write operations") + public HiveConfig setTemporaryStagingDirectoryEnabled(boolean temporaryStagingDirectoryEnabled) + { + this.temporaryStagingDirectoryEnabled = temporaryStagingDirectoryEnabled; + return this; + } + + public boolean isTemporaryStagingDirectoryEnabled() + { + return temporaryStagingDirectoryEnabled; + } + + @Config("hive.temporary-staging-directory-path") + @ConfigDescription("Location of temporary staging directory for write operations. 
Use ${USER} placeholder to use different location for each user.") + public HiveConfig setTemporaryStagingDirectoryPath(String temporaryStagingDirectoryPath) + { + this.temporaryStagingDirectoryPath = temporaryStagingDirectoryPath; + return this; + } + + @NotNull + public String getTemporaryStagingDirectoryPath() + { + return temporaryStagingDirectoryPath; + } + + @NotNull + public boolean isOrcFileTailCacheEnabled() + { + return orcFileTailCacheEnabled; + } + + @Config("hive.orc.file-tail.cache.enabled") + @ConfigDescription("Enable caching of Orc file tail.") + public HiveConfig setOrcFileTailCacheEnabled(boolean orcFileTailCacheEnabled) + { + this.orcFileTailCacheEnabled = orcFileTailCacheEnabled; + return this; + } + + @NotNull + public @MinDuration("0ms") Duration getOrcFileTailCacheTtl() + { + return orcFileTailCacheTtl; + } + + @Config("hive.orc.file-tail.cache.ttl") + @ConfigDescription("Orc file tail cache TTL.") + public HiveConfig setOrcFileTailCacheTtl(Duration orcFileTailCacheTtl) + { + this.orcFileTailCacheTtl = orcFileTailCacheTtl; + return this; + } + + @NotNull + public long getOrcFileTailCacheLimit() + { + return orcFileTailCacheLimit; + } + + @Config("hive.orc.file-tail.cache.limit") + @ConfigDescription("Orc file tail cache limit.") + public HiveConfig setOrcFileTailCacheLimit(long orcFileTailCacheLimit) + { + this.orcFileTailCacheLimit = orcFileTailCacheLimit; + return this; + } + + public boolean isOrcStripeFooterCacheEnabled() + { + return orcStripeFooterCacheEnabled; + } + + @Config("hive.orc.stripe-footer.cache.enabled") + @ConfigDescription("Enable caching of Orc stripe footer.") + public HiveConfig setOrcStripeFooterCacheEnabled(boolean orcStripeFooterCacheEnabled) + { + this.orcStripeFooterCacheEnabled = orcStripeFooterCacheEnabled; + return this; + } + + @MinDuration("0ms") + public Duration getOrcStripeFooterCacheTtl() + { + return orcStripeFooterCacheTtl; + } + + @Config("hive.orc.stripe-footer.cache.ttl") + @ConfigDescription("Orc strip footer cache TTL.") + public HiveConfig setOrcStripeFooterCacheTtl(Duration orcStripeFooterCacheTtl) + { + this.orcStripeFooterCacheTtl = orcStripeFooterCacheTtl; + return this; + } + + @Min(0) + public long getOrcStripeFooterCacheLimit() + { + return orcStripeFooterCacheLimit; + } + + @Config("hive.orc.stripe-footer.cache.limit") + @ConfigDescription("Orc stripe footer cache limit.") + public HiveConfig setOrcStripeFooterCacheLimit(long orcStripeFooterCacheLimit) + { + this.orcStripeFooterCacheLimit = orcStripeFooterCacheLimit; + return this; + } + + public boolean isOrcRowIndexCacheEnabled() + { + return orcRowIndexCacheEnabled; + } + + @Config("hive.orc.row-index.cache.enabled") + @ConfigDescription("Enable caching of Orc row group index.") + public HiveConfig setOrcRowIndexCacheEnabled(boolean orcRowIndexCacheEnabled) + { + this.orcRowIndexCacheEnabled = orcRowIndexCacheEnabled; + return this; + } + + @MinDuration("0ms") + public Duration getOrcRowIndexCacheTtl() + { + return orcRowIndexCacheTtl; + } + + @Config("hive.orc.row-index.cache.ttl") + public HiveConfig setOrcRowIndexCacheTtl(Duration orcRowIndexCacheTtl) + { + this.orcRowIndexCacheTtl = orcRowIndexCacheTtl; + return this; + } + + @Min(0) + public long getOrcRowIndexCacheLimit() + { + return orcRowIndexCacheLimit; + } + + @Config("hive.orc.row-index.cache.limit") + public HiveConfig setOrcRowIndexCacheLimit(long orcRowIndexCacheLimit) + { + this.orcRowIndexCacheLimit = orcRowIndexCacheLimit; + return this; + } + + public boolean isOrcBloomFiltersCacheEnabled() + 
{ + return orcBloomFiltersCacheEnabled; + } + + @Config("hive.orc.bloom-filters.cache.enabled") + @ConfigDescription("Enable caching of Orc bloom filters.") + public HiveConfig setOrcBloomFiltersCacheEnabled(boolean orcBloomFiltersCacheEnabled) + { + this.orcBloomFiltersCacheEnabled = orcBloomFiltersCacheEnabled; + return this; + } + + @MinDuration("0ms") + public Duration getOrcBloomFiltersCacheTtl() + { + return orcBloomFiltersCacheTtl; + } + + @Config("hive.orc.bloom-filters.cache.ttl") + public HiveConfig setOrcBloomFiltersCacheTtl(Duration orcBloomFiltersCacheTtl) + { + this.orcBloomFiltersCacheTtl = orcBloomFiltersCacheTtl; + return this; + } + + @Min(0) + public long getOrcBloomFiltersCacheLimit() + { + return orcBloomFiltersCacheLimit; + } + + @Config("hive.orc.bloom-filters.cache.limit") + public HiveConfig setOrcBloomFiltersCacheLimit(long orcBloomFiltersCacheLimit) + { + this.orcBloomFiltersCacheLimit = orcBloomFiltersCacheLimit; + return this; + } + + public boolean isOrcRowDataCacheEnabled() + { + return orcRowDataCacheEnabled; + } + + @Config("hive.orc.row-data.block.cache.enabled") + @ConfigDescription("Flag to enable caching Orc row data as blocks") + public HiveConfig setOrcRowDataCacheEnabled(boolean orcRowDataCacheEnabled) + { + this.orcRowDataCacheEnabled = orcRowDataCacheEnabled; + return this; + } + + @MinDuration("0ms") + public Duration getOrcRowDataCacheTtl() + { + return orcRowDataCacheTtl; + } + + @Config("hive.orc.row-data.block.cache.ttl") + @ConfigDescription("Orc Row data block cache TTL.") + public HiveConfig setOrcRowDataCacheTtl(Duration orcRowDataCacheTtl) + { + this.orcRowDataCacheTtl = orcRowDataCacheTtl; + return this; + } + + public DataSize getOrcRowDataCacheMaximumWeight() + { + return orcRowDataCacheMaximumWeight; + } + + @Config("hive.orc.row-data.block.cache.max.weight") + @ConfigDescription("Orc Row data block cache max weight.") + public HiveConfig setOrcRowDataCacheMaximumWeight(DataSize orcRowDataCacheMaximumWeight) + { + this.orcRowDataCacheMaximumWeight = orcRowDataCacheMaximumWeight; + return this; + } + + @Config("hive.transaction-heartbeat-interval") + @ConfigDescription("Interval after which heartbeat is sent for open Hive transaction") + public HiveConfig setHiveTransactionHeartbeatInterval(Duration interval) + { + this.hiveTransactionHeartbeatInterval = Optional.ofNullable(interval); + return this; + } + + @NotNull + public Optional getHiveTransactionHeartbeatInterval() + { + return hiveTransactionHeartbeatInterval; + } + + public int getHiveTransactionHeartbeatThreads() + { + return hiveTransactionHeartbeatThreads; + } + + @Config("hive.transaction-heartbeat-threads") + @ConfigDescription("Number of threads to run in the Hive transaction heartbeat service") + public HiveConfig setHiveTransactionHeartbeatThreads(int hiveTransactionHeartbeatThreads) + { + this.hiveTransactionHeartbeatThreads = hiveTransactionHeartbeatThreads; + return this; + } + + @Config("hive.table-creates-with-location-allowed") + @ConfigDescription("Allow setting table location in CREATE TABLE and CREATE TABLE AS SELECT statements") + public HiveConfig setTableCreatesWithLocationAllowed(boolean tableCreatesWithLocationAllowed) + { + this.tableCreatesWithLocationAllowed = tableCreatesWithLocationAllowed; + return this; + } + + public boolean getTableCreatesWithLocationAllowed() + { + return tableCreatesWithLocationAllowed; + } + + public boolean isTlsEnabled() + { + return tlsEnabled; + } + + @Config("hive.metastore.thrift.client.ssl.enabled") + 
@ConfigDescription("Whether TLS security is enabled") + public HiveConfig setTlsEnabled(boolean tlsEnabled) + { + this.tlsEnabled = tlsEnabled; + return this; + } + + @Config("hive.dynamic-filter-partition-filtering") + @ConfigDescription("Filter out hive splits early based on partition value using dynamic filter") + public HiveConfig setDynamicFilterPartitionFilteringEnabled(boolean dynamicFilterPartitionFilteringEnabled) + { + this.dynamicFilterPartitionFilteringEnabled = dynamicFilterPartitionFilteringEnabled; + return this; + } + + public boolean isDynamicFilterPartitionFilteringEnabled() + { + return dynamicFilterPartitionFilteringEnabled; + } + + @Config("hive.dynamic-filtering-row-filtering-threshold") + @ConfigDescription("Filter out hive rows early if the dynamic filter size is below the threshold") + public HiveConfig setDynamicFilteringRowFilteringThreshold(int dynamicFilteringRowFilteringThreshold) + { + this.dynamicFilteringRowFilteringThreshold = dynamicFilteringRowFilteringThreshold; + return this; + } + + @Min(1) + public int getDynamicFilteringRowFilteringThreshold() + { + return dynamicFilteringRowFilteringThreshold; + } + + public boolean isOrcCacheStatsMetricCollectionEnabled() + { + return orcCacheStatsMetricCollectionEnabled; + } + + @Config("hive.orc-cache-stats-metric-collection.enabled") + @ConfigDescription("Whether orc cache stats metric collection is enabled") + public HiveConfig setOrcCacheStatsMetricCollectionEnabled(boolean orcCacheStatsMetricCollectionEnabled) + { + this.orcCacheStatsMetricCollectionEnabled = orcCacheStatsMetricCollectionEnabled; + return this; + } + + @Config("hive.vacuum-cleanup-recheck-interval") + @ConfigDescription("Interval after which vacuum cleanup task will be resubmitted") + public HiveConfig setVacuumCleanupRecheckInterval(Duration interval) + { + this.vacuumCleanupRecheckInterval = interval; + return this; + } + + @NotNull + @MinDuration("5m") + public Duration getVacuumCleanupRecheckInterval() + { + return vacuumCleanupRecheckInterval; + } + + @Config("hive.vacuum-service-threads") + @ConfigDescription("Number of threads to run in the vacuum service") + public HiveConfig setVacuumServiceThreads(int vacuumServiceThreads) + { + this.vacuumServiceThreads = vacuumServiceThreads; + return this; + } + + public int getVacuumServiceThreads() + { + return vacuumServiceThreads; + } + + @Config("hive.metastore-client-service-threads") + @ConfigDescription("Number of threads for metastore client") + public HiveConfig setMetastoreClientServiceThreads(int metastoreClientServiceThreads) + { + this.metastoreClientServiceThreads = metastoreClientServiceThreads; + return this; + } + + public int getMetastoreClientServiceThreads() + { + return metastoreClientServiceThreads; + } + + @Config("hive.vacuum-delta-num-threshold") + @ConfigDescription("Maximum number of delta directories to allow without compacting it") + public HiveConfig setVacuumDeltaNumThreshold(int vacuumDeltaNumThreshold) + { + this.vacuumDeltaNumThreshold = vacuumDeltaNumThreshold; + return this; + } + + @Min(2) + public int getVacuumDeltaNumThreshold() + { + return vacuumDeltaNumThreshold; + } + + @Config("hive.vacuum-delta-percent-threshold") + @ConfigDescription("Maximum percent of delta directories to allow without compacting it") + public HiveConfig setVacuumDeltaPercentThreshold(double vacuumDeltaPercentThreshold) + { + this.vacuumDeltaPercentThreshold = vacuumDeltaPercentThreshold; + return this; + } + + @DecimalMin("0.1") + @DecimalMax("1.0") + public double 
getVacuumDeltaPercentThreshold() + { + return vacuumDeltaPercentThreshold; + } + + @Config("hive.auto-vacuum-enabled") + @ConfigDescription("Enable auto-vacuum on Hive tables") + public HiveConfig setAutoVacuumEnabled(boolean autoVacuumEnabled) + { + this.autoVacuumEnabled = autoVacuumEnabled; + return this; + } + + public boolean getAutoVacuumEnabled() + { + return autoVacuumEnabled; + } + + @Config("hive.orc-predicate-pushdown-enabled") + @ConfigDescription("Enables processing of predicates within ORC reading") + public HiveConfig setOrcPredicatePushdownEnabled(boolean orcPredicatePushdownEnabled) + { + this.orcPredicatePushdownEnabled = orcPredicatePushdownEnabled; + return this; + } + + public boolean isOrcPredicatePushdownEnabled() + { + return orcPredicatePushdownEnabled; + } + + @Config("hive.vacuum-collector-interval") + @ConfigDescription("Interval after which vacuum collector task will be resubmitted") + public HiveConfig setVacuumCollectorInterval(Duration interval) + { + this.vacuumCollectorInterval = Optional.ofNullable(interval); + return this; + } + + @NotNull + public Optional getVacuumCollectorInterval() + { + return vacuumCollectorInterval; + } + + @Min(1) + public int getMaxSplitsToGroup() + { + return maxNumbSplitsToGroup; + } + + @Config("hive.max-splits-to-group") + @ConfigDescription("max number of small splits can be grouped") + public HiveConfig setMaxSplitsToGroup(int maxNumbSplitsToGroup) + { + this.maxNumbSplitsToGroup = maxNumbSplitsToGroup; + return this; + } + + @Config("hive.worker-metastore-cache-enabled") + public HiveConfig setWorkerMetaStoreCacheEnabled(boolean isEnabled) + { + this.workerMetaStoreCacheEnabled = isEnabled; + return this; + } + + public boolean getWorkerMetaStoreCacheEnabled() + { + return this.workerMetaStoreCacheEnabled; + } + + public boolean isOmniDataSslEnabled() + { + return omniDataSslEnabled; + } + + private Optional getNormalizedFilePath(String filePath) + { + if (filePath == null || filePath.isEmpty()) { + return Optional.empty(); + } + String outputPath; + try { + String normalizePath = Normalizer.normalize(filePath, Normalizer.Form.NFKC); + outputPath = new File(normalizePath).getCanonicalPath(); + } + catch (IOException | IllegalArgumentException exception) { + log.error("File path [%s] is invalid, exception %s", filePath, exception.getMessage()); + return Optional.empty(); + } + File file = new File(outputPath); + if (!file.exists()) { + log.error("File [%s] is not exist.", outputPath); + return Optional.empty(); + } + return Optional.of(outputPath); + } + + @Config("omni-data.ssl.enabled") + public HiveConfig setOmniDataSslEnabled(boolean omniDataSslEnabled) + { + this.omniDataSslEnabled = omniDataSslEnabled; + return this; + } + + public Optional getOmniDataSslPkiDir() + { + return omniDataSslPkiDir; + } + + @Config("omni-data.ssl.pki.dir") + @ConfigDescription("Directory of Public Key Infrastructure.") + public HiveConfig setOmniDataSslPkiDir(String omniDataSslPkiDir) + { + this.omniDataSslPkiDir = getNormalizedFilePath(omniDataSslPkiDir); + return this; + } + + public Optional getOmniDataSslClientCertFilePath() + { + return omniDataSslClientCertFilePath; + } + + @Config("omni-data.ssl.client.cert.file.path") + @ConfigDescription("Path to the SSL client certificate file.") + public HiveConfig setOmniDataSslClientCertFilePath(String omniDataSslClientCertFilePath) + { + this.omniDataSslClientCertFilePath = getNormalizedFilePath(omniDataSslClientCertFilePath); + return this; + } + + public Optional 
getOmniDataSslPrivateKeyFilePath() + { + return omniDataSslPrivateKeyFilePath; + } + + @Config("omni-data.ssl.private.key.file.path") + @ConfigDescription("Path to the SSL private key file.") + public HiveConfig setOmniDataSslPrivateKeyFilePath(String omniDataSslPrivateKeyFilePath) + { + this.omniDataSslPrivateKeyFilePath = getNormalizedFilePath(omniDataSslPrivateKeyFilePath); + return this; + } + + public Optional getOmniDataSslTrustCertFilePath() + { + return omniDataSslTrustCertFilePath; + } + + @Config("omni-data.ssl.trust.cert.file.path") + @ConfigDescription("Path to the SSL trust certificate file.") + public HiveConfig setOmniDataSslTrustCertFilePath(String omniDataSslTrustCertFilePath) + { + this.omniDataSslTrustCertFilePath = getNormalizedFilePath(omniDataSslTrustCertFilePath); + return this; + } + + public Optional getOmniDataSslCrlFilePath() + { + return omniDataSslCrlFilePath; + } + + @Config("omni-data.ssl.crl.file.path") + @ConfigDescription("Path to the SSL Certificate Revocation List file.") + public HiveConfig setOmniDataSslCrlFilePath(String omniDataSslCrlFilePath) + { + this.omniDataSslCrlFilePath = getNormalizedFilePath(omniDataSslCrlFilePath); + return this; + } + + @Config("hive.filter-offload-enabled") + @ConfigDescription("Enables offload filter operators to storage device.") + public HiveConfig setFilterOffloadEnabled(boolean filterOffloadEnabled) + { + this.filterOffloadEnabled = filterOffloadEnabled; + return this; + } + + public boolean isFilterOffloadEnabled() + { + return filterOffloadEnabled; + } + + @Config("hive.aggregator-offload-enabled") + @ConfigDescription("Enables offload aggregator operators to storage device.") + public HiveConfig setAggregatorOffloadEnabled(boolean aggregatorOffloadEnabled) + { + this.aggregatorOffloadEnabled = aggregatorOffloadEnabled; + return this; + } + + @Config("hive.omnidata-enabled") + @ConfigDescription("Enables omnidata feature.") + public HiveConfig setOmniDataEnabled(boolean omniDataEnabled) + { + this.omniDataEnabled = omniDataEnabled; + return this; + } + + public boolean isOmniDataEnabled() + { + return omniDataEnabled; + } + + public boolean isAggregatorOffloadEnabled() + { + return aggregatorOffloadEnabled; + } + + @Config("hive.min-filter-offload-factor") + @ConfigDescription("The minimum data filtering threshold for predicate expression offload.") + public HiveConfig setMinFilterOffloadFactor(double minFilterOffloadFactor) + { + this.minFilterOffloadFactor = minFilterOffloadFactor; + return this; + } + + @DecimalMin("0.0") + @DecimalMax("1.0") + public double getMinFilterOffloadFactor() + { + return minFilterOffloadFactor; + } + + @Config("hive.min-aggregator-offload-factor") + @ConfigDescription("The minimum data aggregation threshold for aggregation expression offload.") + public HiveConfig setMinAggregatorOffloadFactor(double minAggregatorOffloadFactor) + { + this.minAggregatorOffloadFactor = minAggregatorOffloadFactor; + return this; + } + + @DecimalMin("0.0") + @DecimalMax("1.0") + public double getMinAggregatorOffloadFactor() + { + return minAggregatorOffloadFactor; + } + + @Config("hive.min-offload-row-number") + @ConfigDescription("The minimum table size for operator offload.") + public HiveConfig setMinOffloadRowNumber(long filterFactor) + { + this.minOffloadRowNumber = filterFactor; + return this; + } + + @Min(1) + public long getMinOffloadRowNumber() + { + return minOffloadRowNumber; + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnector.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnector.java new file mode 100644 index 00000000..dbcf916d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnector.java @@ -0,0 +1,240 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import io.airlift.bootstrap.LifeCycleManager; +import io.airlift.log.Logger; +import io.prestosql.spi.classloader.ThreadContextClassLoader; +import io.prestosql.spi.connector.Connector; +import io.prestosql.spi.connector.ConnectorAccessControl; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorNodePartitioningProvider; +import io.prestosql.spi.connector.ConnectorPageSinkProvider; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorPlanOptimizerProvider; +import io.prestosql.spi.connector.ConnectorSplitManager; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.SystemTable; +import io.prestosql.spi.connector.classloader.ClassLoaderSafeConnectorMetadata; +import io.prestosql.spi.procedure.Procedure; +import io.prestosql.spi.session.PropertyMetadata; +import io.prestosql.spi.transaction.IsolationLevel; + +import java.util.List; +import java.util.Set; +import java.util.function.Supplier; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.spi.transaction.IsolationLevel.READ_UNCOMMITTED; +import static io.prestosql.spi.transaction.IsolationLevel.checkConnectorSupports; +import static java.util.Objects.requireNonNull; + +public class HiveConnector + implements Connector +{ + private static final Logger log = Logger.get(HiveConnector.class); + + private final LifeCycleManager lifeCycleManager; + private final Supplier metadataFactory; + private final ConnectorSplitManager splitManager; + private final ConnectorPageSourceProvider pageSourceProvider; + private final ConnectorPageSinkProvider pageSinkProvider; + private final ConnectorNodePartitioningProvider nodePartitioningProvider; + private final Set systemTables; + private final Set procedures; + private final List> sessionProperties; + private final List> schemaProperties; + private final List> tableProperties; + private final List> analyzeProperties; + + private final ConnectorAccessControl accessControl; + private final ClassLoader classLoader; + private final ConnectorPlanOptimizerProvider planOptimizerProvider; + + private final HiveTransactionManager transactionManager; + + public HiveConnector( + LifeCycleManager lifeCycleManager, + Supplier metadataFactory, + HiveTransactionManager transactionManager, + ConnectorSplitManager splitManager, + 
ConnectorPageSourceProvider pageSourceProvider, + ConnectorPageSinkProvider pageSinkProvider, + ConnectorNodePartitioningProvider nodePartitioningProvider, + Set systemTables, + Set procedures, + List> sessionProperties, + List> schemaProperties, + List> tableProperties, + List> analyzeProperties, + ConnectorAccessControl accessControl, + ConnectorPlanOptimizerProvider planOptimizerProvider, + ClassLoader classLoader) + { + this.lifeCycleManager = requireNonNull(lifeCycleManager, "lifeCycleManager is null"); + this.metadataFactory = requireNonNull(metadataFactory, "metadata is null"); + this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + this.splitManager = requireNonNull(splitManager, "splitManager is null"); + this.pageSourceProvider = requireNonNull(pageSourceProvider, "pageSourceProvider is null"); + this.pageSinkProvider = requireNonNull(pageSinkProvider, "pageSinkProvider is null"); + this.nodePartitioningProvider = requireNonNull(nodePartitioningProvider, "nodePartitioningProvider is null"); + this.systemTables = ImmutableSet.copyOf(requireNonNull(systemTables, "systemTables is null")); + this.procedures = ImmutableSet.copyOf(requireNonNull(procedures, "procedures is null")); + this.sessionProperties = ImmutableList.copyOf(requireNonNull(sessionProperties, "sessionProperties is null")); + this.schemaProperties = ImmutableList.copyOf(requireNonNull(schemaProperties, "schemaProperties is null")); + this.tableProperties = ImmutableList.copyOf(requireNonNull(tableProperties, "tableProperties is null")); + this.analyzeProperties = ImmutableList.copyOf(requireNonNull(analyzeProperties, "analyzeProperties is null")); + this.accessControl = requireNonNull(accessControl, "accessControl is null"); + this.classLoader = requireNonNull(classLoader, "classLoader is null"); + this.planOptimizerProvider = requireNonNull(planOptimizerProvider, "planOptimizerProvider is null"); + } + + @Override + public ConnectorMetadata getMetadata(ConnectorTransactionHandle transaction) + { + ConnectorMetadata metadata = transactionManager.get(transaction); + checkArgument(metadata != null, "no such transaction: %s", transaction); + return new ClassLoaderSafeConnectorMetadata(metadata, classLoader); + } + + @Override + public ConnectorSplitManager getSplitManager() + { + return splitManager; + } + + @Override + public ConnectorPageSourceProvider getPageSourceProvider() + { + return pageSourceProvider; + } + + @Override + public ConnectorPageSinkProvider getPageSinkProvider() + { + return pageSinkProvider; + } + + @Override + public ConnectorNodePartitioningProvider getNodePartitioningProvider() + { + return nodePartitioningProvider; + } + + @Override + public ConnectorPlanOptimizerProvider getConnectorPlanOptimizerProvider() + { + return planOptimizerProvider; + } + + @Override + public Set getSystemTables() + { + return systemTables; + } + + @Override + public Set getProcedures() + { + return procedures; + } + + @Override + public List> getSessionProperties() + { + return sessionProperties; + } + + @Override + public List> getSchemaProperties() + { + return schemaProperties; + } + + @Override + public List> getAnalyzeProperties() + { + return analyzeProperties; + } + + @Override + public List> getTableProperties() + { + return tableProperties; + } + + @Override + public ConnectorAccessControl getAccessControl() + { + return accessControl; + } + + @Override + public boolean isSingleStatementWritesOnly() + { + return false; + } + + @Override + public 
ConnectorTransactionHandle beginTransaction(IsolationLevel isolationLevel, boolean readOnly) + { + checkConnectorSupports(READ_UNCOMMITTED, isolationLevel); + ConnectorTransactionHandle transaction = new HiveTransactionHandle(); + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) { + transactionManager.put(transaction, metadataFactory.get()); + } + return transaction; + } + + @Override + public void commit(ConnectorTransactionHandle transaction) + { + TransactionalMetadata metadata = transactionManager.remove(transaction); + checkArgument(metadata != null, "no such transaction: %s", transaction); + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) { + metadata.commit(); + } + } + + @Override + public void rollback(ConnectorTransactionHandle transaction) + { + TransactionalMetadata metadata = transactionManager.remove(transaction); + checkArgument(metadata != null, "no such transaction: %s", transaction); + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) { + metadata.rollback(); + } + } + + @Override + public final void shutdown() + { + try { + lifeCycleManager.stop(); + } + catch (Exception e) { + log.error(e, "Error shutting down connector"); + } + } + + @Override + public ConnectorMetadata getConnectorMetadata() + { + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) { + return new ClassLoaderSafeConnectorMetadata(metadataFactory.get(), classLoader); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnectorFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnectorFactory.java new file mode 100644 index 00000000..f4b828c6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveConnectorFactory.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
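
The beginTransaction/commit/rollback methods of HiveConnector above are driven by the engine in a fixed order. The sketch below restates that order only for illustration; the harness method and the `connector` parameter are assumptions, not part of the connector code.

    // Illustrative harness (assumed): the engine-side call order against the
    // Connector SPI methods implemented by HiveConnector above.
    static void runInHiveTransaction(Connector connector)
    {
        // READ_UNCOMMITTED matches the level checked in beginTransaction above
        ConnectorTransactionHandle txn = connector.beginTransaction(IsolationLevel.READ_UNCOMMITTED, true);
        try {
            ConnectorMetadata metadata = connector.getMetadata(txn); // ClassLoaderSafeConnectorMetadata wrapper
            // ... plan and execute the query against metadata, splits and page sources ...
            connector.commit(txn);   // removes the transaction and calls TransactionalMetadata.commit()
        }
        catch (RuntimeException e) {
            connector.rollback(txn); // removes the transaction and calls TransactionalMetadata.rollback()
            throw e;
        }
    }
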
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableSet; +import com.google.inject.Injector; +import com.google.inject.Key; +import com.google.inject.TypeLiteral; +import io.airlift.bootstrap.Bootstrap; +import io.airlift.bootstrap.LifeCycleManager; +import io.airlift.event.client.EventModule; +import io.airlift.json.JsonModule; +import io.prestosql.plugin.base.jmx.MBeanServerModule; +import io.prestosql.plugin.hive.authentication.HiveAuthenticationModule; +import io.prestosql.plugin.hive.gcs.HiveGcsModule; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HiveMetastoreModule; +import io.prestosql.plugin.hive.omnidata.OmniDataNodeManager; +import io.prestosql.plugin.hive.rule.HivePushdownUtil; +import io.prestosql.plugin.hive.s3.HiveS3Module; +import io.prestosql.plugin.hive.security.HiveSecurityModule; +import io.prestosql.plugin.hive.security.SystemTableAwareAccessControl; +import io.prestosql.spi.NodeManager; +import io.prestosql.spi.PageIndexerFactory; +import io.prestosql.spi.PageSorter; +import io.prestosql.spi.VersionEmbedder; +import io.prestosql.spi.classloader.ThreadContextClassLoader; +import io.prestosql.spi.connector.Connector; +import io.prestosql.spi.connector.ConnectorAccessControl; +import io.prestosql.spi.connector.ConnectorContext; +import io.prestosql.spi.connector.ConnectorFactory; +import io.prestosql.spi.connector.ConnectorHandleResolver; +import io.prestosql.spi.connector.ConnectorNodePartitioningProvider; +import io.prestosql.spi.connector.ConnectorPageSinkProvider; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorPlanOptimizerProvider; +import io.prestosql.spi.connector.ConnectorSplitManager; +import io.prestosql.spi.connector.classloader.ClassLoaderSafeConnectorPageSinkProvider; +import io.prestosql.spi.connector.classloader.ClassLoaderSafeConnectorPageSourceProvider; +import io.prestosql.spi.connector.classloader.ClassLoaderSafeConnectorSplitManager; +import io.prestosql.spi.connector.classloader.ClassLoaderSafeNodePartitioningProvider; +import io.prestosql.spi.function.FunctionMetadataManager; +import io.prestosql.spi.function.StandardFunctionResolution; +import io.prestosql.spi.heuristicindex.IndexClient; +import io.prestosql.spi.plan.FilterStatsCalculatorService; +import io.prestosql.spi.procedure.Procedure; +import io.prestosql.spi.relation.RowExpressionService; +import io.prestosql.spi.type.TypeManager; +import org.weakref.jmx.guice.MBeanModule; + +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.isNullOrEmpty; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static java.util.Objects.requireNonNull; + +public class HiveConnectorFactory + implements ConnectorFactory +{ + private final String name; + private final ClassLoader classLoader; + private final Optional metastore; + + public HiveConnectorFactory(String name, ClassLoader classLoader, Optional metastore) + { + checkArgument(!isNullOrEmpty(name), "name is null or empty"); + this.name = name; + this.classLoader = requireNonNull(classLoader, "classLoader is null"); + this.metastore = requireNonNull(metastore, "metastore is null"); + } + + @Override + public String getName() + { + return name; + } + + @Override + public ConnectorHandleResolver getHandleResolver() + { + return new HiveHandleResolver(); + 
} + + @Override + public Connector create(String catalogName, Map config, ConnectorContext context) + { + requireNonNull(config, "config is null"); + + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(classLoader)) { + Bootstrap app = new Bootstrap( + new EventModule(), + new MBeanModule(), + new ConnectorObjectNameGeneratorModule(catalogName), + new JsonModule(), + new HiveModule(), + new HiveS3Module(), + new HiveGcsModule(), + new HiveMetastoreModule(metastore), + new HiveSecurityModule(), + new HiveAuthenticationModule(), + new HiveProcedureModule(), + new MBeanServerModule(), + binder -> { + binder.bind(NodeVersion.class).toInstance(new NodeVersion(context.getNodeManager().getCurrentNode().getVersion())); + binder.bind(NodeManager.class).toInstance(context.getNodeManager()); + binder.bind(VersionEmbedder.class).toInstance(context.getVersionEmbedder()); + binder.bind(TypeManager.class).toInstance(context.getTypeManager()); + binder.bind(PageIndexerFactory.class).toInstance(context.getPageIndexerFactory()); + binder.bind(PageSorter.class).toInstance(context.getPageSorter()); + binder.bind(HiveCatalogName.class).toInstance(new HiveCatalogName(catalogName)); + binder.bind(IndexClient.class).toInstance(context.getIndexClient()); + binder.bind(StandardFunctionResolution.class).toInstance(context.getStandardFunctionResolution()); + binder.bind(FunctionMetadataManager.class).toInstance(context.getFunctionMetadataManager()); + binder.bind(FilterStatsCalculatorService.class).toInstance(context.getFilterStatsCalculatorService()); + binder.bind(RowExpressionService.class).toInstance(context.getRowExpressionService()); + }); + + Injector injector = app + .strictConfig() + .doNotInitializeLogging() + .setRequiredConfigurationProperties(config) + .initialize(); + + LifeCycleManager lifeCycleManager = injector.getInstance(LifeCycleManager.class); + HiveMetadataFactory metadataFactory = injector.getInstance(HiveMetadataFactory.class); + HiveTransactionManager transactionManager = injector.getInstance(HiveTransactionManager.class); + ConnectorSplitManager splitManager = injector.getInstance(ConnectorSplitManager.class); + ConnectorPageSourceProvider connectorPageSource = injector.getInstance(ConnectorPageSourceProvider.class); + ConnectorPageSinkProvider pageSinkProvider = injector.getInstance(ConnectorPageSinkProvider.class); + ConnectorNodePartitioningProvider connectorDistributionProvider = injector.getInstance(ConnectorNodePartitioningProvider.class); + HiveSessionProperties hiveSessionProperties = injector.getInstance(HiveSessionProperties.class); + HiveTableProperties hiveTableProperties = injector.getInstance(HiveTableProperties.class); + HiveAnalyzeProperties hiveAnalyzeProperties = injector.getInstance(HiveAnalyzeProperties.class); + ConnectorAccessControl accessControl = new SystemTableAwareAccessControl(injector.getInstance(ConnectorAccessControl.class)); + Set procedures = injector.getInstance(Key.get(new TypeLiteral>() {})); + ConnectorPlanOptimizerProvider planOptimizerProvider = injector.getInstance(ConnectorPlanOptimizerProvider.class); + OmniDataNodeManager nodeManagerInstance = injector.getInstance(OmniDataNodeManager.class); + HivePushdownUtil.setOmniDataNodeManager(nodeManagerInstance); + nodeManagerInstance.startPollingNodeStates(); + + return new HiveConnector( + lifeCycleManager, + metadataFactory, + transactionManager, + new ClassLoaderSafeConnectorSplitManager(splitManager, classLoader), + new ClassLoaderSafeConnectorPageSourceProvider(connectorPageSource, 
classLoader), + new ClassLoaderSafeConnectorPageSinkProvider(pageSinkProvider, classLoader), + new ClassLoaderSafeNodePartitioningProvider(connectorDistributionProvider, classLoader), + ImmutableSet.of(), + procedures, + hiveSessionProperties.getSessionProperties(), + HiveSchemaProperties.SCHEMA_PROPERTIES, + hiveTableProperties.getTableProperties(), + hiveAnalyzeProperties.getAnalyzeProperties(), + accessControl, + planOptimizerProvider, + classLoader); + } + catch (Exception e) { + throwIfUnchecked(e); + throw new RuntimeException(e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDecimalParser.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDecimalParser.java new file mode 100644 index 00000000..8da87b81 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDecimalParser.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.type.DecimalType; + +import java.math.BigDecimal; + +import static io.prestosql.spi.type.Decimals.rescale; +import static java.math.RoundingMode.HALF_UP; +import static java.nio.charset.StandardCharsets.UTF_8; + +public final class HiveDecimalParser +{ + private HiveDecimalParser() {} + + public static BigDecimal parseHiveDecimal(byte[] bytes, int start, int length, DecimalType columnType) + { + BigDecimal parsed = new BigDecimal(new String(bytes, start, length, UTF_8)); + if (parsed.scale() > columnType.getScale()) { + // Hive rounds HALF_UP too + parsed = parsed.setScale(columnType.getScale(), HALF_UP); + } + return rescale(parsed, columnType); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDeleteAsInsertTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDeleteAsInsertTableHandle.java new file mode 100644 index 00000000..766085bc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveDeleteAsInsertTableHandle.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
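
HiveDecimalParser above rounds HALF_UP whenever the parsed scale exceeds the target type's scale. A small test-style fragment makes that concrete; the literal values are arbitrary and the usual imports are assumed.

    // "123.456" has scale 3; the target decimal(10, 2) has scale 2, so parseHiveDecimal
    // rounds HALF_UP to 123.46 and then rescales to the column type.
    byte[] bytes = "123.456".getBytes(UTF_8);
    DecimalType type = DecimalType.createDecimalType(10, 2);
    BigDecimal value = HiveDecimalParser.parseHiveDecimal(bytes, 0, bytes.length, type);
    // value.toPlainString() returns "123.46"
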
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.spi.connector.ConnectorDeleteAsInsertTableHandle; + +import java.util.List; +import java.util.Optional; + +public class HiveDeleteAsInsertTableHandle + extends HiveWritableTableHandle + implements ConnectorDeleteAsInsertTableHandle +{ + @JsonCreator + public HiveDeleteAsInsertTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("inputColumns") List inputColumns, + @JsonProperty("pageSinkMetadata") HivePageSinkMetadata pageSinkMetadata, + @JsonProperty("locationHandle") LocationHandle locationHandle, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("tableStorageFormat") HiveStorageFormat tableStorageFormat, + @JsonProperty("partitionStorageFormat") HiveStorageFormat partitionStorageFormat) + { + super( + schemaName, + tableName, + inputColumns, + pageSinkMetadata, + locationHandle, + bucketProperty, + tableStorageFormat, + partitionStorageFormat, + false); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveErrorCode.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveErrorCode.java new file mode 100644 index 00000000..a1ae6b04 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveErrorCode.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.ErrorCode; +import io.prestosql.spi.ErrorCodeSupplier; +import io.prestosql.spi.ErrorType; + +import static io.prestosql.spi.ErrorType.EXTERNAL; +import static io.prestosql.spi.ErrorType.INTERNAL_ERROR; +import static io.prestosql.spi.ErrorType.USER_ERROR; + +public enum HiveErrorCode + implements ErrorCodeSupplier +{ + HIVE_METASTORE_ERROR(0, EXTERNAL), + HIVE_CURSOR_ERROR(1, EXTERNAL), + HIVE_TABLE_OFFLINE(2, USER_ERROR), + HIVE_CANNOT_OPEN_SPLIT(3, EXTERNAL), + HIVE_FILE_NOT_FOUND(4, EXTERNAL), + HIVE_UNKNOWN_ERROR(5, EXTERNAL), + HIVE_PARTITION_OFFLINE(6, USER_ERROR), + HIVE_BAD_DATA(7, EXTERNAL), + HIVE_PARTITION_SCHEMA_MISMATCH(8, EXTERNAL), + HIVE_MISSING_DATA(9, EXTERNAL), + HIVE_INVALID_PARTITION_VALUE(10, EXTERNAL), + HIVE_TIMEZONE_MISMATCH(11, EXTERNAL), + HIVE_INVALID_METADATA(12, EXTERNAL), + HIVE_INVALID_VIEW_DATA(13, EXTERNAL), + HIVE_DATABASE_LOCATION_ERROR(14, EXTERNAL), + HIVE_PATH_ALREADY_EXISTS(15, EXTERNAL), + HIVE_FILESYSTEM_ERROR(16, EXTERNAL), + // code HIVE_WRITER_ERROR(17) is deprecated + HIVE_SERDE_NOT_FOUND(18, EXTERNAL), + HIVE_UNSUPPORTED_FORMAT(19, EXTERNAL), + HIVE_PARTITION_READ_ONLY(20, USER_ERROR), + HIVE_TOO_MANY_OPEN_PARTITIONS(21, USER_ERROR), + HIVE_CONCURRENT_MODIFICATION_DETECTED(22, EXTERNAL), + HIVE_COLUMN_ORDER_MISMATCH(23, USER_ERROR), + HIVE_FILE_MISSING_COLUMN_NAMES(24, EXTERNAL), + HIVE_WRITER_OPEN_ERROR(25, EXTERNAL), + HIVE_WRITER_CLOSE_ERROR(26, EXTERNAL), + HIVE_WRITER_DATA_ERROR(27, EXTERNAL), + HIVE_INVALID_BUCKET_FILES(28, EXTERNAL), + HIVE_EXCEEDED_PARTITION_LIMIT(29, USER_ERROR), + HIVE_WRITE_VALIDATION_FAILED(30, INTERNAL_ERROR), + HIVE_PARTITION_DROPPED_DURING_QUERY(31, EXTERNAL), + HIVE_TABLE_READ_ONLY(32, USER_ERROR), + HIVE_PARTITION_NOT_READABLE(33, USER_ERROR), + HIVE_TABLE_NOT_READABLE(34, USER_ERROR), + HIVE_TABLE_DROPPED_DURING_QUERY(35, EXTERNAL), + // HIVE_TOO_MANY_BUCKET_SORT_FILES(36) is deprecated + HIVE_CORRUPTED_COLUMN_STATISTICS(37, EXTERNAL), + HIVE_EXCEEDED_SPLIT_BUFFERING_LIMIT(38, USER_ERROR), + HIVE_UNKNOWN_COLUMN_STATISTIC_TYPE(39, INTERNAL_ERROR), + HIVE_TABLE_LOCK_NOT_ACQUIRED(40, EXTERNAL), + HIVE_OPERATOR_OFFLOAD_FAIL(41, EXTERNAL) + /**/; + + private final ErrorCode errorCode; + + HiveErrorCode(int code, ErrorType type) + { + errorCode = new ErrorCode(code + 0x0100_0000, name(), type); + } + + @Override + public ErrorCode toErrorCode() + { + return errorCode; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveEventClient.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveEventClient.java new file mode 100644 index 00000000..6c53ce6c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveEventClient.java @@ -0,0 +1,40 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
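
Every constant in HiveErrorCode above adds a fixed base of 0x0100_0000 (16,777,216) to its declared code when the ErrorCode is built. A representative call site is sketched below; `path` stands in for whatever split is being read and is not taken from the connector code.

    // HIVE_CANNOT_OPEN_SPLIT carries code 3, so its ErrorCode value is
    // 3 + 0x0100_0000 = 16,777,219 and its type is EXTERNAL.
    throw new PrestoException(
            HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT,
            String.format("Error opening Hive split %s", path));
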
+ */ +package io.prestosql.plugin.hive; + +import io.airlift.event.client.AbstractEventClient; +import io.airlift.log.Logger; + +public class HiveEventClient + extends AbstractEventClient +{ + private static final Logger log = Logger.get(HiveEventClient.class); + + @Override + public void postEvent(T event) + { + if (!(event instanceof WriteCompletedEvent)) { + return; + } + WriteCompletedEvent writeCompletedEvent = (WriteCompletedEvent) event; + log.debug("File created: query: %s, schema: %s, table: %s, partition: '%s', format: %s, size: %s, path: %s", + writeCompletedEvent.getQueryId(), + writeCompletedEvent.getSchemaName(), + writeCompletedEvent.getTableName(), + writeCompletedEvent.getPartitionName(), + writeCompletedEvent.getStorageFormat(), + writeCompletedEvent.getBytes(), + writeCompletedEvent.getPath()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriter.java new file mode 100644 index 00000000..ad008884 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriter.java @@ -0,0 +1,55 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.Page; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.util.Optional; + +public interface HiveFileWriter +{ + long getWrittenBytes(); + + long getSystemMemoryUsage(); + + void appendRows(Page dataPage); + + void commit(); + + void rollback(); + + long getValidationCpuNanos(); + + default Optional getVerificationTask() + { + return Optional.empty(); + } + + default void initWriter(boolean isAcid, Path path, FileSystem fileSystem) + { + } + + default ImmutableList getExtraPartitionFiles() + { + return ImmutableList.of(); + } + + default ImmutableList getMiscData() + { + return ImmutableList.of(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriterFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriterFactory.java new file mode 100644 index 00000000..b4fdbfce --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveFileWriterFactory.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
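
The HiveFileWriter interface above has six abstract methods; the four default methods (getVerificationTask, initWriter, getExtraPartitionFiles, getMiscData) can be inherited unchanged. The skeleton below is only a sketch of a custom writer: the class name and the bookkeeping inside it are placeholders.

    import io.prestosql.spi.Page;

    public class FooFileWriter
            implements HiveFileWriter
    {
        private long writtenBytes;

        @Override
        public long getWrittenBytes()
        {
            return writtenBytes;
        }

        @Override
        public long getSystemMemoryUsage()
        {
            return 0; // report retained buffer sizes here
        }

        @Override
        public void appendRows(Page dataPage)
        {
            writtenBytes += dataPage.getSizeInBytes(); // encode and buffer the page here
        }

        @Override
        public void commit()
        {
            // flush buffers and close the underlying file
        }

        @Override
        public void rollback()
        {
            // delete any partially written output
        }

        @Override
        public long getValidationCpuNanos()
        {
            return 0;
        }
    }
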
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.spi.connector.ConnectorSession; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.mapred.JobConf; + +import java.util.List; +import java.util.Optional; +import java.util.Properties; + +public interface HiveFileWriterFactory +{ + Optional createFileWriter( + Path path, + List inputColumnNames, + StorageFormat storageFormat, + Properties schema, + JobConf conf, + ConnectorSession session, + Optional acidOptions, + Optional acidWriteType); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHandleResolver.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHandleResolver.java new file mode 100644 index 00000000..2fb1d7c1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHandleResolver.java @@ -0,0 +1,90 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorDeleteAsInsertTableHandle; +import io.prestosql.spi.connector.ConnectorHandleResolver; +import io.prestosql.spi.connector.ConnectorInsertTableHandle; +import io.prestosql.spi.connector.ConnectorOutputTableHandle; +import io.prestosql.spi.connector.ConnectorPartitioningHandle; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.ConnectorUpdateTableHandle; +import io.prestosql.spi.connector.ConnectorVacuumTableHandle; + +public class HiveHandleResolver + implements ConnectorHandleResolver +{ + @Override + public Class getTableHandleClass() + { + return HiveTableHandle.class; + } + + @Override + public Class getColumnHandleClass() + { + return HiveColumnHandle.class; + } + + @Override + public Class getSplitClass() + { + return HiveSplitWrapper.class; + } + + @Override + public Class getOutputTableHandleClass() + { + return HiveOutputTableHandle.class; + } + + @Override + public Class getInsertTableHandleClass() + { + return HiveInsertTableHandle.class; + } + + @Override + public Class getUpdateTableHandleClass() + { + return HiveUpdateTableHandle.class; + } + + @Override + public Class getDeleteAsInsertTableHandleClass() + { + return HiveDeleteAsInsertTableHandle.class; + } + + @Override + public Class getVacuumTableHandleClass() + { + return HiveVacuumTableHandle.class; + } + + @Override + public Class getTransactionHandleClass() + { + return HiveTransactionHandle.class; + } + + @Override + public Class getPartitioningHandleClass() + { + return HivePartitioningHandle.class; + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHdfsConfiguration.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHdfsConfiguration.java new file mode 100644 index 00000000..32000095 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveHdfsConfiguration.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.util.ConfigurationUtils; +import org.apache.hadoop.conf.Configuration; + +import javax.inject.Inject; + +import java.net.URI; +import java.util.Set; + +import static java.util.Objects.requireNonNull; + +public class HiveHdfsConfiguration + implements HdfsConfiguration +{ + private static final Configuration INITIAL_CONFIGURATION = ConfigurationUtils.getInitialConfiguration(); + + @SuppressWarnings("ThreadLocalNotStaticFinal") + private final ThreadLocal hadoopConfiguration = new ThreadLocal() + { + @Override + protected Configuration initialValue() + { + Configuration configuration = new Configuration(false); + ConfigurationUtils.copy(INITIAL_CONFIGURATION, configuration); + initializer.initializeConfiguration(configuration); + return configuration; + } + }; + + private final HdfsConfigurationInitializer initializer; + private final Set dynamicProviders; + + @Inject + public HiveHdfsConfiguration(HdfsConfigurationInitializer initializer, Set dynamicProviders) + { + this.initializer = requireNonNull(initializer, "initializer is null"); + this.dynamicProviders = ImmutableSet.copyOf(requireNonNull(dynamicProviders, "dynamicProviders is null")); + } + + @Override + public Configuration getConfiguration(HdfsContext context, URI uri) + { + if (dynamicProviders.isEmpty()) { + // use the same configuration for everything + return hadoopConfiguration.get(); + } + Configuration config = ConfigurationUtils.copy(hadoopConfiguration.get()); + for (DynamicConfigurationProvider provider : dynamicProviders) { + provider.updateConfiguration(config, context, uri); + } + return config; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInputInfo.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInputInfo.java new file mode 100644 index 00000000..b62b8644 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInputInfo.java @@ -0,0 +1,48 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.List; + +public class HiveInputInfo +{ + private final List partitionIds; + // Code that serialize HiveInputInfo into log would often need the ability to limit the length of log entries. + // This boolean field allows such code to mark the log entry as length limited. + private final boolean truncated; + + @JsonCreator + public HiveInputInfo( + @JsonProperty("partitionIds") List partitionIds, + @JsonProperty("truncated") boolean truncated) + { + this.partitionIds = partitionIds; + this.truncated = truncated; + } + + @JsonProperty + public List getPartitionIds() + { + return partitionIds; + } + + @JsonProperty + public boolean isTruncated() + { + return truncated; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInsertTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInsertTableHandle.java new file mode 100644 index 00000000..c74642a9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveInsertTableHandle.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
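
Because HiveInputInfo above is annotated with @JsonCreator and @JsonProperty, it round-trips through a plain Jackson mapper. The fragment below is illustrative only: the partition id is an arbitrary example and checked-exception handling is omitted.

    ObjectMapper mapper = new ObjectMapper();
    HiveInputInfo info = new HiveInputInfo(ImmutableList.of("ds=2022-04-14"), false);
    String json = mapper.writeValueAsString(info);
    // json contains "partitionIds":["ds=2022-04-14"] and "truncated":false
    HiveInputInfo copy = mapper.readValue(json, HiveInputInfo.class);
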
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.spi.connector.ConnectorInsertTableHandle; + +import java.util.List; +import java.util.Optional; + +public class HiveInsertTableHandle + extends HiveWritableTableHandle + implements ConnectorInsertTableHandle +{ + @JsonCreator + public HiveInsertTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("inputColumns") List inputColumns, + @JsonProperty("pageSinkMetadata") HivePageSinkMetadata pageSinkMetadata, + @JsonProperty("locationHandle") LocationHandle locationHandle, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("tableStorageFormat") HiveStorageFormat tableStorageFormat, + @JsonProperty("partitionStorageFormat") HiveStorageFormat partitionStorageFormat, + @JsonProperty("isOverwrite") boolean isOverwrite) + { + super( + schemaName, + tableName, + inputColumns, + pageSinkMetadata, + locationHandle, + bucketProperty, + tableStorageFormat, + partitionStorageFormat, + isOverwrite); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveLocationService.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveLocationService.java new file mode 100644 index 00000000..f415684d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveLocationService.java @@ -0,0 +1,149 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.LocationHandle.WriteMode; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import org.apache.hadoop.fs.Path; + +import javax.inject.Inject; + +import java.util.Optional; + +import static io.prestosql.plugin.hive.HiveSessionProperties.isTemporaryStagingDirectoryEnabled; +import static io.prestosql.plugin.hive.HiveWriteUtils.createTemporaryPath; +import static io.prestosql.plugin.hive.HiveWriteUtils.getTableDefaultLocation; +import static io.prestosql.plugin.hive.HiveWriteUtils.isHdfsEncrypted; +import static io.prestosql.plugin.hive.HiveWriteUtils.isS3FileSystem; +import static io.prestosql.plugin.hive.HiveWriteUtils.pathExists; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_NEW_DIRECTORY; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.STAGE_AND_MOVE_TO_TARGET_DIRECTORY; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HiveLocationService + implements LocationService +{ + private final HdfsEnvironment hdfsEnvironment; + + @Inject + public HiveLocationService(HdfsEnvironment hdfsEnvironment) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + } + + @Override + public LocationHandle forNewTable(SemiTransactionalHiveMetastore metastore, ConnectorSession session, String schemaName, String tableName, Optional writeIdInfo, Optional tablePath, HiveWriteUtils.OpertionType opertionType) + { + Path targetPath; + HdfsContext context = new HdfsContext(session, schemaName, tableName); + if (tablePath.isPresent()) { + targetPath = tablePath.get(); + } + else { + targetPath = getTableDefaultLocation(context, metastore, hdfsEnvironment, schemaName, tableName); + } + + // verify the target directory for the table + if (pathExists(context, hdfsEnvironment, targetPath)) { + throw new PrestoException(HiveErrorCode.HIVE_PATH_ALREADY_EXISTS, format("Target directory for table '%s.%s' already exists: %s", schemaName, tableName, targetPath)); + } + + if (shouldUseTemporaryDirectory(session, context, targetPath) && (opertionType == HiveWriteUtils.OpertionType.CREATE_TABLE_AS)) { + Path writePath = createTemporaryPath(session, context, hdfsEnvironment, targetPath, opertionType); + return new LocationHandle(targetPath, writePath, false, STAGE_AND_MOVE_TO_TARGET_DIRECTORY, writeIdInfo); + } + else { + return new LocationHandle(targetPath, targetPath, false, DIRECT_TO_TARGET_NEW_DIRECTORY, writeIdInfo); + } + } + + @Override + public LocationHandle forExistingTable(SemiTransactionalHiveMetastore metastore, ConnectorSession session, Table table, Optional writeIdInfo, HiveWriteUtils.OpertionType opertionType) + { + HdfsContext context = new HdfsContext(session, table.getDatabaseName(), table.getTableName()); + Path targetPath = new Path(table.getStorage().getLocation()); + + if (shouldUseTemporaryDirectory(session, context, targetPath)) { + Path writePath = createTemporaryPath(session, context, hdfsEnvironment, targetPath, opertionType); + return new 
LocationHandle(targetPath, writePath, true, STAGE_AND_MOVE_TO_TARGET_DIRECTORY, writeIdInfo); + } + else { + return new LocationHandle(targetPath, targetPath, true, DIRECT_TO_TARGET_EXISTING_DIRECTORY, writeIdInfo); + } + } + + private boolean shouldUseTemporaryDirectory(ConnectorSession session, HdfsContext context, Path path) + { + return isTemporaryStagingDirectoryEnabled(session) + // skip using temporary directory for S3 + && !isS3FileSystem(context, hdfsEnvironment, path) + // skip using temporary directory if destination is encrypted; it's not possible to move a file between encryption zones + && !isHdfsEncrypted(context, hdfsEnvironment, path); + } + + @Override + public WriteInfo getQueryWriteInfo(LocationHandle locationHandle) + { + return new WriteInfo(locationHandle.getTargetPath(), locationHandle.getWritePath(), locationHandle.getWriteMode()); + } + + @Override + public WriteInfo getTableWriteInfo(LocationHandle locationHandle, boolean overwrite) + { + if (overwrite && locationHandle.getWriteMode() != STAGE_AND_MOVE_TO_TARGET_DIRECTORY) { + throw new PrestoException(NOT_SUPPORTED, "Overwriting unpartitioned table not supported when writing directly to target directory"); + } + return new WriteInfo(locationHandle.getTargetPath(), locationHandle.getWritePath(), locationHandle.getWriteMode()); + } + + @Override + public WriteInfo getPartitionWriteInfo(LocationHandle locationHandle, Optional partition, String partitionName) + { + if (partition.isPresent()) { + // existing partition + WriteMode writeMode = locationHandle.getWriteMode(); + Path targetPath = new Path(partition.get().getStorage().getLocation()); + + Path writePath; + switch (writeMode) { + case STAGE_AND_MOVE_TO_TARGET_DIRECTORY: + writePath = new Path(locationHandle.getWritePath(), partitionName); + break; + case DIRECT_TO_TARGET_EXISTING_DIRECTORY: + writePath = targetPath; + break; + case DIRECT_TO_TARGET_NEW_DIRECTORY: + default: + throw new UnsupportedOperationException(format("inserting into existing partition is not supported for %s", writeMode)); + } + + return new WriteInfo(targetPath, writePath, writeMode); + } + else { + // new partition + return new WriteInfo( + new Path(locationHandle.getTargetPath(), partitionName), + new Path(locationHandle.getWritePath(), partitionName), + locationHandle.getWriteMode()); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java new file mode 100644 index 00000000..3ad30119 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadata.java @@ -0,0 +1,3236 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
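
HiveLocationService above spreads the staging decision across forNewTable, forExistingTable and shouldUseTemporaryDirectory. Condensed into one place, the rule is roughly the sketch below (a simplification: for new tables the code above additionally requires the CREATE_TABLE_AS operation type before staging is used).

    // Rough, condensed restatement of the write-mode choice made above; not connector code.
    static LocationHandle.WriteMode chooseWriteMode(boolean stagingEnabled, boolean onS3, boolean hdfsEncrypted, boolean tableExists)
    {
        // staging is skipped for S3 targets and for targets inside HDFS encryption zones,
        // because the final move cannot cross encryption zones
        if (stagingEnabled && !onS3 && !hdfsEncrypted) {
            return LocationHandle.WriteMode.STAGE_AND_MOVE_TO_TARGET_DIRECTORY;
        }
        return tableExists
                ? LocationHandle.WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY
                : LocationHandle.WriteMode.DIRECT_TO_TARGET_NEW_DIRECTORY;
    }
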
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.base.Suppliers; +import com.google.common.base.VerifyException; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedMap; +import com.google.common.collect.Iterables; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import io.airlift.json.JsonCodec; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.security.AccessControlMetadata; +import io.prestosql.plugin.hive.statistics.HiveStatisticsProvider; +import io.prestosql.plugin.hive.util.ConfigurationUtils; +import io.prestosql.plugin.hive.util.Statistics; +import io.prestosql.spi.PartialAndFinalAggregationType; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorDeleteAsInsertTableHandle; +import io.prestosql.spi.connector.ConnectorInsertTableHandle; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorNewTableLayout; +import io.prestosql.spi.connector.ConnectorOutputMetadata; +import io.prestosql.spi.connector.ConnectorOutputTableHandle; +import io.prestosql.spi.connector.ConnectorPartitioningHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.connector.ConnectorTablePartitioning; +import io.prestosql.spi.connector.ConnectorTableProperties; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.ConnectorUpdateTableHandle; +import io.prestosql.spi.connector.ConnectorVacuumTableHandle; +import io.prestosql.spi.connector.ConnectorVacuumTableInfo; +import io.prestosql.spi.connector.ConnectorViewDefinition; +import io.prestosql.spi.connector.Constraint; +import io.prestosql.spi.connector.ConstraintApplicationResult; +import io.prestosql.spi.connector.DiscretePredicates; +import io.prestosql.spi.connector.InMemoryRecordSet; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.SchemaTablePrefix; +import io.prestosql.spi.connector.SystemTable; +import 
io.prestosql.spi.connector.TableAlreadyExistsException; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.connector.ViewNotFoundException; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.security.GrantInfo; +import io.prestosql.spi.security.PrestoPrincipal; +import io.prestosql.spi.security.Privilege; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticMetadata; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.statistics.ComputedStatistics; +import io.prestosql.spi.statistics.TableStatisticType; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.statistics.TableStatisticsMetadata; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.OpenCSVSerde; +import org.apache.hadoop.mapred.JobConf; + +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.OptionalLong; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.ScheduledExecutorService; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import static com.google.common.base.MoreObjects.firstNonNull; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.Iterables.concat; +import static com.google.common.collect.Streams.stream; +import static io.prestosql.plugin.hive.HiveBucketing.bucketedOnTimestamp; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; +import static io.prestosql.plugin.hive.HiveStorageFormat.ORC; +import static io.prestosql.plugin.hive.HiveTableProperties.IS_EXTERNAL_TABLE; +import static io.prestosql.plugin.hive.HiveTableProperties.LOCATION_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.NON_INHERITABLE_PROPERTIES; +import static io.prestosql.plugin.hive.HiveTableProperties.TRANSACTIONAL; +import static io.prestosql.plugin.hive.HiveTableProperties.getExternalLocation; +import static io.prestosql.plugin.hive.HiveTableProperties.getHiveStorageFormat; +import static io.prestosql.plugin.hive.HiveTableProperties.getLocation; +import static 
io.prestosql.plugin.hive.HiveTableProperties.getPartitionedBy; +import static io.prestosql.plugin.hive.HiveTableProperties.getTransactionalValue; +import static io.prestosql.plugin.hive.HiveTableProperties.isExternalTable; +import static io.prestosql.plugin.hive.HiveUtil.PRESTO_VIEW_FLAG; +import static io.prestosql.plugin.hive.HiveUtil.columnExtraInfo; +import static io.prestosql.plugin.hive.HiveUtil.decodeViewData; +import static io.prestosql.plugin.hive.HiveUtil.encodeViewData; +import static io.prestosql.plugin.hive.HiveUtil.getPartitionKeyColumnHandles; +import static io.prestosql.plugin.hive.HiveUtil.hiveColumnHandles; +import static io.prestosql.plugin.hive.HiveUtil.isPrestoView; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static io.prestosql.plugin.hive.HiveUtil.verifyPartitionTypeSupported; +import static io.prestosql.plugin.hive.HiveWriteUtils.isS3FileSystem; +import static io.prestosql.plugin.hive.HiveWriterFactory.getSnapshotSubFileIndex; +import static io.prestosql.plugin.hive.HiveWriterFactory.isSnapshotFile; +import static io.prestosql.plugin.hive.HiveWriterFactory.isSnapshotSubFile; +import static io.prestosql.plugin.hive.HiveWriterFactory.removeSnapshotFileName; +import static io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; +import static io.prestosql.spi.StandardErrorCode.GENERIC_USER_ERROR; +import static io.prestosql.spi.StandardErrorCode.INVALID_ANALYZE_PROPERTY; +import static io.prestosql.spi.StandardErrorCode.INVALID_SCHEMA_PROPERTY; +import static io.prestosql.spi.StandardErrorCode.INVALID_TABLE_PROPERTY; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.StandardErrorCode.SCHEMA_NOT_EMPTY; +import static io.prestosql.spi.predicate.TupleDomain.withColumnDomains; +import static io.prestosql.spi.security.PrincipalType.USER; +import static io.prestosql.spi.statistics.TableStatisticType.ROW_COUNT; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static java.lang.String.format; +import static java.util.Collections.emptyList; +import static java.util.Collections.emptyMap; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; +import static java.util.stream.Collectors.toMap; +import static java.util.stream.Collectors.toSet; +import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE; +import static org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE; + +public class HiveMetadata + implements TransactionalMetadata +{ + private static final Logger log = Logger.get(HiveMetadata.class); + + public static final String PRESTO_VERSION_NAME = "presto_version"; + public static final String PRESTO_QUERY_ID_NAME = "presto_query_id"; + public static final String BUCKETING_VERSION = "bucketing_version"; + public static final String TABLE_COMMENT = "comment"; + + public static final String STORAGE_FORMAT = "storage_format"; + + private static final String ORC_BLOOM_FILTER_COLUMNS_KEY = "orc.bloom.filter.columns"; + private static final String ORC_BLOOM_FILTER_FPP_KEY = "orc.bloom.filter.fpp"; + + private static final String TEXT_SKIP_HEADER_COUNT_KEY = "skip.header.line.count"; + private static final String 
TEXT_SKIP_FOOTER_COUNT_KEY = "skip.footer.line.count"; + + public static final String AVRO_SCHEMA_URL_KEY = "avro.schema.url"; + + private static final String CSV_SEPARATOR_KEY = OpenCSVSerde.SEPARATORCHAR; + private static final String CSV_QUOTE_KEY = OpenCSVSerde.QUOTECHAR; + private static final String CSV_ESCAPE_KEY = OpenCSVSerde.ESCAPECHAR; + + protected final SemiTransactionalHiveMetastore metastore; + protected final HdfsEnvironment hdfsEnvironment; + private final HivePartitionManager partitionManager; + protected final TypeManager typeManager; + protected final LocationService locationService; + private final JsonCodec partitionUpdateCodec; + private final boolean writesToNonManagedTablesEnabled; + private final boolean createsOfNonManagedTablesEnabled; + protected final TypeTranslator typeTranslator; + protected final String prestoVersion; + private final HiveStatisticsProvider hiveStatisticsProvider; + private final AccessControlMetadata accessControlMetadata; + protected final boolean tableCreatesWithLocationAllowed; + + private final int vacuumDeltaNumThreshold; + private final double vacuumDeltaPercentThreshold; + private final boolean autoVacuumEnabled; + protected final ScheduledExecutorService vacuumExecutorService; + protected final ScheduledExecutorService hiveMetastoreClientService; + private final long vacuumCollectorInterval; + + private boolean externalTable; + + public HiveMetadata( + SemiTransactionalHiveMetastore metastore, + HdfsEnvironment hdfsEnvironment, + HivePartitionManager partitionManager, + boolean writesToNonManagedTablesEnabled, + boolean createsOfNonManagedTablesEnabled, + boolean tableCreatesWithLocationAllowed, + TypeManager typeManager, + LocationService locationService, + JsonCodec partitionUpdateCodec, + TypeTranslator typeTranslator, + String prestoVersion, + HiveStatisticsProvider hiveStatisticsProvider, + AccessControlMetadata accessControlMetadata, + boolean autoVacuumEnabled, + int vacuumDeltaNumThreshold, + double vacuumDeltaPercentThreshold, + ScheduledExecutorService vacuumExecutorService, + Optional vacuumCollectorInterval, + ScheduledExecutorService hiveMetastoreClientService) + { + this.metastore = requireNonNull(metastore, "metastore is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.partitionManager = requireNonNull(partitionManager, "partitionManager is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.locationService = requireNonNull(locationService, "locationService is null"); + this.partitionUpdateCodec = requireNonNull(partitionUpdateCodec, "partitionUpdateCodec is null"); + this.writesToNonManagedTablesEnabled = writesToNonManagedTablesEnabled; + this.createsOfNonManagedTablesEnabled = createsOfNonManagedTablesEnabled; + this.tableCreatesWithLocationAllowed = tableCreatesWithLocationAllowed; + this.typeTranslator = requireNonNull(typeTranslator, "typeTranslator is null"); + this.prestoVersion = requireNonNull(prestoVersion, "prestoVersion is null"); + this.hiveStatisticsProvider = requireNonNull(hiveStatisticsProvider, "hiveStatisticsProvider is null"); + this.accessControlMetadata = requireNonNull(accessControlMetadata, "accessControlMetadata is null"); + this.externalTable = false; + + this.vacuumDeltaNumThreshold = vacuumDeltaNumThreshold; + this.vacuumDeltaPercentThreshold = vacuumDeltaPercentThreshold; + this.autoVacuumEnabled = autoVacuumEnabled; + this.vacuumExecutorService = vacuumExecutorService; + this.vacuumCollectorInterval = 
vacuumCollectorInterval.map(Duration::toMillis) + .orElseThrow(() -> new PrestoException(GENERIC_INTERNAL_ERROR, "Vacuum collector interval is not set correctly")); + this.hiveMetastoreClientService = hiveMetastoreClientService; + } + + public SemiTransactionalHiveMetastore getMetastore() + { + return metastore; + } + + @Override + public List listSchemaNames(ConnectorSession session) + { + return metastore.getAllDatabases(); + } + + @Override + public HiveTableHandle getTableHandle(ConnectorSession session, SchemaTableName tableName) + { + requireNonNull(tableName, "tableName is null"); + Optional table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()); + if (!table.isPresent()) { + return null; + } + + // we must not allow system tables due to how permissions are checked in SystemTableAwareAccessControl + if (getSourceTableNameFromSystemTable(tableName).isPresent()) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Unexpected table present in Hive metastore: " + tableName); + } + + MetastoreUtil.verifyOnline(tableName, Optional.empty(), MetastoreUtil.getProtectMode(table.get()), table.get().getParameters()); + + Map parameters = new HashMap<>(); + parameters.putAll(table.get().getParameters()); + + String format = table.get().getStorage().getStorageFormat().getOutputFormatNullable(); + if (format != null) { + parameters.put(STORAGE_FORMAT, format); + } + + return new HiveTableHandle( + tableName.getSchemaName(), + tableName.getTableName(), + parameters, + getPartitionKeyColumnHandles(table.get()), + HiveBucketing.getHiveBucketHandle(table.get())); + } + + @Override + public ConnectorTableHandle getTableHandleForStatisticsCollection(ConnectorSession session, SchemaTableName tableName, Map analyzeProperties) + { + HiveTableHandle handle = getTableHandle(session, tableName); + if (handle == null) { + return null; + } + Optional>> partitionValuesList = HiveAnalyzeProperties.getPartitionList(analyzeProperties); + ConnectorTableMetadata tableMetadata = getTableMetadata(session, handle.getSchemaTableName()); + handle = handle.withAnalyzePartitionValues(partitionValuesList); + + List partitionedBy = getPartitionedBy(tableMetadata.getProperties()); + + partitionValuesList.ifPresent(list -> { + if (partitionedBy.isEmpty()) { + throw new PrestoException(INVALID_ANALYZE_PROPERTY, "Partition list provided but table is not partitioned"); + } + for (List values : list) { + if (values.size() != partitionedBy.size()) { + throw new PrestoException(INVALID_ANALYZE_PROPERTY, "Partition value count does not match partition column count"); + } + } + }); + + HiveTableHandle table = handle; + return partitionValuesList + .map(values -> partitionManager.getPartitions(table, values)) + .map(result -> partitionManager.applyPartitionResult(table, result)) + .orElse(table); + } + + @Override + public Optional getSystemTable(ConnectorSession session, SchemaTableName tableName) + { + if (SystemTableHandler.PARTITIONS.matches(tableName)) { + return getPartitionsSystemTable(session, tableName, SystemTableHandler.PARTITIONS.getSourceTableName(tableName)); + } + if (SystemTableHandler.PROPERTIES.matches(tableName)) { + return getPropertiesSystemTable(session, tableName, SystemTableHandler.PROPERTIES.getSourceTableName(tableName)); + } + return Optional.empty(); + } + + private Optional getPropertiesSystemTable(ConnectorSession session, SchemaTableName tableName, SchemaTableName sourceTableName) + { + Optional
table = metastore.getTable(new HiveIdentity(session), sourceTableName.getSchemaName(), sourceTableName.getTableName()); + if (!table.isPresent() || table.get().getTableType().equals(TableType.VIRTUAL_VIEW.name())) { + throw new TableNotFoundException(tableName); + } + Map sortedTableParameters = ImmutableSortedMap.copyOf(table.get().getParameters()); + List columns = sortedTableParameters.keySet().stream() + .map(key -> new ColumnMetadata(key, VarcharType.VARCHAR)) + .collect(toImmutableList()); + List types = columns.stream() + .map(ColumnMetadata::getType) + .collect(toImmutableList()); + Iterable> propertyValues = ImmutableList.of(ImmutableList.copyOf(sortedTableParameters.values())); + + return Optional.of(createSystemTable(new ConnectorTableMetadata(sourceTableName, columns), constraint -> new InMemoryRecordSet(types, propertyValues).cursor())); + } + + private Optional getPartitionsSystemTable(ConnectorSession session, SchemaTableName tableName, SchemaTableName sourceTableName) + { + HiveTableHandle sourceTableHandle = getTableHandle(session, sourceTableName); + + if (sourceTableHandle == null) { + return Optional.empty(); + } + + SchemaTableName schemaTableName = sourceTableHandle.getSchemaTableName(); + + Table table = metastore.getTable(new HiveIdentity(session), schemaTableName.getSchemaName(), schemaTableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(schemaTableName)); + + List partitionColumns = sourceTableHandle.getPartitionColumns(); + if (partitionColumns.isEmpty()) { + return Optional.empty(); + } + + List partitionColumnTypes = partitionColumns.stream() + .map(HiveColumnHandle::getTypeSignature) + .map(typeManager::getType) + .collect(toImmutableList()); + + List partitionSystemTableColumns = partitionColumns.stream() + .map(column -> new ColumnMetadata( + column.getName(), + typeManager.getType(column.getTypeSignature()), + column.getComment().orElse(null), + column.isHidden())) + .collect(toImmutableList()); + + Map fieldIdToColumnHandle = + IntStream.range(0, partitionColumns.size()) + .boxed() + .collect(toImmutableMap(identity(), partitionColumns::get)); + + return Optional.of(createSystemTable( + new ConnectorTableMetadata(tableName, partitionSystemTableColumns), + constraint -> { + TupleDomain targetTupleDomain = constraint.transform(fieldIdToColumnHandle::get); + Predicate> targetPredicate = convertToPredicate(targetTupleDomain); + Constraint targetConstraint = new Constraint(targetTupleDomain, targetPredicate); + Iterable> records = () -> + stream(partitionManager.getPartitions(metastore, new HiveIdentity(session), sourceTableHandle, targetConstraint, table).getPartitions()) + .map(hivePartition -> + IntStream.range(0, partitionColumns.size()) + .mapToObj(fieldIdToColumnHandle::get) + .map(columnHandle -> hivePartition.getKeys().get(columnHandle).getValue()) + .collect(toList())) + .iterator(); + + return new InMemoryRecordSet(partitionColumnTypes, records).cursor(); + })); + } + + @Override + public ConnectorTableMetadata getTableMetadata(ConnectorSession session, ConnectorTableHandle tableHandle) + { + return getTableMetadata(session, ((HiveTableHandle) tableHandle).getSchemaTableName()); + } + + private ConnectorTableMetadata getTableMetadata(ConnectorSession session, SchemaTableName tableName) + { + try { + return doGetTableMetadata(session, tableName); + } + catch (PrestoException e) { + throw e; + } + catch (RuntimeException e) { + // Errors related to invalid or unsupported information in the Metastore should be handled explicitly (eg. 
as PrestoException(HIVE_INVALID_METADATA)). + // This is just a catch-all solution so that we have any actionable information when e.g. SELECT * FROM information_schema.columns fails. + throw new RuntimeException("Failed to construct table metadata for table " + tableName, e); + } + } + + protected ConnectorTableMetadata doGetTableMetadata(ConnectorSession session, SchemaTableName tableName) + { + Optional<Table>
table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()); + if (!table.isPresent() || table.get().getTableType().equals(TableType.VIRTUAL_VIEW.name())) { + throw new TableNotFoundException(tableName); + } + + Function metadataGetter = columnMetadataGetter(table.get(), typeManager); + ImmutableList.Builder columns = ImmutableList.builder(); + for (HiveColumnHandle columnHandle : hiveColumnHandles(table.get())) { + columns.add(metadataGetter.apply(columnHandle)); + } + + // External location property + ImmutableMap.Builder properties = ImmutableMap.builder(); + properties.put(LOCATION_PROPERTY, table.get().getStorage().getLocation()); + properties.put(IS_EXTERNAL_TABLE, table.get().getTableType().equals(EXTERNAL_TABLE.name())); + + // Storage format property + try { + HiveStorageFormat format = extractHiveStorageFormat(table.get()); + properties.put(HiveTableProperties.STORAGE_FORMAT_PROPERTY, format); + } + catch (PrestoException ignored) { + // todo fail if format is not known + } + + // Partitioning property + List partitionedBy = table.get().getPartitionColumns().stream() + .map(Column::getName) + .collect(toList()); + if (!partitionedBy.isEmpty()) { + properties.put(HiveTableProperties.PARTITIONED_BY_PROPERTY, partitionedBy); + } + + // Bucket properties + table.get().getStorage().getBucketProperty().ifPresent(property -> { + properties.put(BUCKETING_VERSION, property.getBucketingVersion().getVersion()); + properties.put(HiveTableProperties.BUCKET_COUNT_PROPERTY, property.getBucketCount()); + properties.put(HiveTableProperties.BUCKETED_BY_PROPERTY, property.getBucketedBy()); + properties.put(HiveTableProperties.SORTED_BY_PROPERTY, property.getSortedBy()); + }); + + // Is transactional table + if (Boolean.valueOf(table.get().getParameters().get(TRANSACTIONAL))) { + properties.put(TRANSACTIONAL, true); + } + + // ORC format specific properties + String orcBloomFilterColumns = table.get().getParameters().get(ORC_BLOOM_FILTER_COLUMNS_KEY); + if (orcBloomFilterColumns != null) { + properties.put(HiveTableProperties.ORC_BLOOM_FILTER_COLUMNS, Splitter.on(',').trimResults().omitEmptyStrings().splitToList(orcBloomFilterColumns)); + } + String orcBloomFilterFfp = table.get().getParameters().get(ORC_BLOOM_FILTER_FPP_KEY); + if (orcBloomFilterFfp != null) { + properties.put(HiveTableProperties.ORC_BLOOM_FILTER_FPP, Double.parseDouble(orcBloomFilterFfp)); + } + + // Avro specific property + String avroSchemaUrl = table.get().getParameters().get(AVRO_SCHEMA_URL_KEY); + if (avroSchemaUrl != null) { + properties.put(HiveTableProperties.AVRO_SCHEMA_URL, avroSchemaUrl); + } + + // Textfile specific property + String textSkipHeaderCount = table.get().getParameters().get(TEXT_SKIP_HEADER_COUNT_KEY); + if (textSkipHeaderCount != null) { + properties.put(HiveTableProperties.TEXTFILE_SKIP_HEADER_LINE_COUNT, Integer.valueOf(textSkipHeaderCount)); + } + String textSkipFooterCount = table.get().getParameters().get(TEXT_SKIP_FOOTER_COUNT_KEY); + if (textSkipFooterCount != null) { + properties.put(HiveTableProperties.TEXTFILE_SKIP_FOOTER_LINE_COUNT, Integer.valueOf(textSkipFooterCount)); + } + + // CSV specific property + getCsvSerdeProperty(table.get(), CSV_SEPARATOR_KEY) + .ifPresent(csvSeparator -> properties.put(HiveTableProperties.CSV_SEPARATOR, csvSeparator)); + getCsvSerdeProperty(table.get(), CSV_QUOTE_KEY) + .ifPresent(csvQuote -> properties.put(HiveTableProperties.CSV_QUOTE, csvQuote)); + getCsvSerdeProperty(table.get(), CSV_ESCAPE_KEY) + 
.ifPresent(csvEscape -> properties.put(HiveTableProperties.CSV_ESCAPE, csvEscape)); + + Optional comment = Optional.ofNullable(table.get().getParameters().get(TABLE_COMMENT)); + + // add partitioned columns and bucketed columns into immutableColumns + ImmutableList.Builder immutableColumns = ImmutableList.builder(); + List bucketedColumns = new ArrayList<>(); + table.get().getStorage().getBucketProperty().ifPresent(property -> { + bucketedColumns.addAll(property.getBucketedBy()); + }); + + for (HiveColumnHandle columnHandle : hiveColumnHandles(table.get())) { + if (columnHandle.getColumnType().equals(HiveColumnHandle.ColumnType.PARTITION_KEY)) { + immutableColumns.add(metadataGetter.apply(columnHandle)); + } + if (bucketedColumns.contains(columnHandle.getColumnName())) { + immutableColumns.add(metadataGetter.apply(columnHandle)); + } + } + + return new ConnectorTableMetadata(tableName, columns.build(), properties.build(), comment, Optional.of(immutableColumns.build()), Optional.of(NON_INHERITABLE_PROPERTIES)); + } + + private static Optional getCsvSerdeProperty(Table table, String key) + { + return getSerdeProperty(table, key).map(csvSerdeProperty -> { + if (csvSerdeProperty.length() > 1) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Only single character can be set for property: " + key); + } + return csvSerdeProperty; + }); + } + + private static Optional getSerdeProperty(Table table, String key) + { + String serdePropertyValue = table.getStorage().getSerdeParameters().get(key); + String tablePropertyValue = table.getParameters().get(key); + if (serdePropertyValue != null && tablePropertyValue != null && !tablePropertyValue.equals(serdePropertyValue)) { + // in Hive one can set conflicting values for the same property, in such case it looks like table properties are used + throw new PrestoException( + HiveErrorCode.HIVE_INVALID_METADATA, + format("Different values for '%s' set in serde properties and table properties: '%s' and '%s'", key, serdePropertyValue, tablePropertyValue)); + } + return firstNonNullable(tablePropertyValue, serdePropertyValue); + } + + @Override + public Optional getInfo(ConnectorTableHandle table) + { + return ((HiveTableHandle) table).getPartitions() + .map(partitions -> new HiveInputInfo( + partitions.stream() + .map(HivePartition::getPartitionId) + .collect(toImmutableList()), + false)); + } + + @Override + public boolean isHeuristicIndexSupported() + { + return true; + } + + @Override + public boolean isPreAggregationSupported(ConnectorSession session) + { + return true; + } + + @Override + public List listTables(ConnectorSession session, Optional optionalSchemaName) + { + ImmutableList.Builder tableNames = ImmutableList.builder(); + for (String schemaName : listSchemas(session, optionalSchemaName)) { + for (String tableName : metastore.getAllTables(schemaName).orElse(emptyList())) { + tableNames.add(new SchemaTableName(schemaName, tableName)); + } + } + return tableNames.build(); + } + + private List listSchemas(ConnectorSession session, Optional schemaName) + { + if (schemaName.isPresent()) { + return ImmutableList.of(schemaName.get()); + } + return listSchemaNames(session); + } + + @Override + public Map getColumnHandles(ConnectorSession session, ConnectorTableHandle tableHandle) + { + SchemaTableName tableName = ((HiveTableHandle) tableHandle).getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new 
TableNotFoundException(tableName)); + return hiveColumnHandles(table).stream() + .collect(toImmutableMap(HiveColumnHandle::getName, identity())); + } + + private Map getColumnHandles(Table table) + { + return hiveColumnHandles(table).stream() + .collect(toImmutableMap(HiveColumnHandle::getName, identity())); + } + + @Override + public long getTableModificationTime(ConnectorSession session, ConnectorTableHandle tableHandle) + { + SchemaTableName tableName = ((HiveTableHandle) tableHandle).getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + String tableLocation = table.getStorage().getLocation(); + Path tablePath = new Path(tableLocation); + try { + FileSystem fileSystem = this.hdfsEnvironment.getFileSystem(new HdfsContext(session, tableName.getSchemaName()), tablePath); + // We use the directory modification time to represent the table modification time + // since HDFS is append-only and any table modification will trigger directory update. + return fileSystem.getFileStatus(tablePath).getModificationTime(); + } + // We want to make sure the query doesn't fail because of star-tree not being able to get last modified time + catch (Exception e) { + log.error("Exception thrown while trying to get modified time", e); + return -1L; + } + } + + @SuppressWarnings("TryWithIdenticalCatches") + @Override + public Map> listTableColumns(ConnectorSession session, SchemaTablePrefix prefix) + { + requireNonNull(prefix, "prefix is null"); + ImmutableMap.Builder> columns = ImmutableMap.builder(); + for (SchemaTableName tableName : listTables(session, prefix)) { + try { + columns.put(tableName, getTableMetadata(session, tableName).getColumns()); + } + catch (HiveViewNotSupportedException e) { + // view is not supported + } + catch (TableNotFoundException e) { + // table disappeared during listing operation + } + } + return columns.build(); + } + + @Override + public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTableHandle tableHandle, Constraint constraint, boolean includeColumnStatistics) + { + if (!HiveSessionProperties.isStatisticsEnabled(session)) { + return TableStatistics.empty(); + } + SchemaTableName tableName = ((HiveTableHandle) tableHandle).getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + Map columns = getColumnHandles(table) + .entrySet().stream() + .filter(entry -> !((HiveColumnHandle) entry.getValue()).isHidden()) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); + Map columnTypes = columns.entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, entry -> getColumnMetadata(session, tableHandle, entry.getValue()).getType())); + HivePartitionResult partitionResult = partitionManager.getPartitions(metastore, new HiveIdentity(session), tableHandle, constraint, table); + List partitions = partitionManager.getPartitionsAsList(partitionResult); + return hiveStatisticsProvider.getTableStatistics(session, ((HiveTableHandle) tableHandle).getSchemaTableName(), columns, columnTypes, partitions, includeColumnStatistics, table); + } + + private List listTables(ConnectorSession session, SchemaTablePrefix prefix) + { + if (!prefix.getTable().isPresent()) { + return listTables(session, prefix.getSchema()); + } + return ImmutableList.of(prefix.toSchemaTableName()); + } 
+ + /** + * NOTE: This method does not return column comment + */ + @Override + public ColumnMetadata getColumnMetadata(ConnectorSession session, ConnectorTableHandle tableHandle, ColumnHandle columnHandle) + { + return ((HiveColumnHandle) columnHandle).getColumnMetadata(typeManager); + } + + @Override + public void createSchema(ConnectorSession session, String schemaName, Map properties) + { + Optional location = HiveSchemaProperties.getLocation(properties).map(locationUri -> { + try { + hdfsEnvironment.getFileSystem(new HdfsContext(session, schemaName), new Path(locationUri)); + } + catch (IOException e) { + throw new PrestoException(INVALID_SCHEMA_PROPERTY, "Invalid location URI: " + locationUri, e); + } + return locationUri; + }); + + Database database = Database.builder() + .setDatabaseName(schemaName) + .setLocation(location) + .setOwnerType(USER) + .setOwnerName(session.getUser()) + .build(); + + metastore.createDatabase(new HiveIdentity(session), database); + } + + @Override + public void dropSchema(ConnectorSession session, String schemaName) + { + // basic sanity check to provide a better error message + if (!listTables(session, Optional.of(schemaName)).isEmpty() || + !listViews(session, Optional.of(schemaName)).isEmpty()) { + throw new PrestoException(SCHEMA_NOT_EMPTY, "Schema not empty: " + schemaName); + } + metastore.dropDatabase(new HiveIdentity(session), schemaName); + } + + @Override + public void renameSchema(ConnectorSession session, String source, String target) + { + metastore.renameDatabase(new HiveIdentity(session), source, target); + } + + @Override + public void createTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, boolean ignoreExisting) + { + SchemaTableName schemaTableName = tableMetadata.getTable(); + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + List partitionedBy = getPartitionedBy(tableMetadata.getProperties()); + Optional bucketProperty = HiveTableProperties.getBucketProperty(tableMetadata.getProperties()); + + if ((bucketProperty.isPresent() || !partitionedBy.isEmpty()) && HiveTableProperties.getAvroSchemaUrl(tableMetadata.getProperties()) != null) { + throw new PrestoException(NOT_SUPPORTED, "Bucketing/Partitioning columns not supported when Avro schema url is set"); + } + + List columnHandles = getColumnHandles(tableMetadata, ImmutableSet.copyOf(partitionedBy), typeTranslator); + HiveStorageFormat hiveStorageFormat = HiveTableProperties.getHiveStorageFormat(tableMetadata.getProperties()); + Map tableProperties = getEmptyTableProperties(tableMetadata, bucketProperty, new HdfsContext(session, schemaName, tableName)); + + hiveStorageFormat.validateColumns(columnHandles); + + Map columnHandlesByName = Maps.uniqueIndex(columnHandles, HiveColumnHandle::getName); + List partitionColumns = partitionedBy.stream() + .map(columnHandlesByName::get) + .map(column -> new Column(column.getName(), column.getHiveType(), column.getComment())) + .collect(toList()); + checkPartitionTypesSupported(partitionColumns); + + boolean external = isExternalTable(tableMetadata.getProperties()); + String externalLocation = getExternalLocation(tableMetadata.getProperties()); + if ((external || (externalLocation != null)) && !createsOfNonManagedTablesEnabled) { + throw new PrestoException(NOT_SUPPORTED, "Cannot create non-managed Hive table"); + } + + Path targetPath; + Optional location = getLocation(tableMetadata.getProperties()); + // User specifies the location property + if (location.isPresent()) { 
+ if (!tableCreatesWithLocationAllowed) { + throw new PrestoException(NOT_SUPPORTED, format("Setting %s property is not allowed", LOCATION_PROPERTY)); + } + targetPath = getPath(new HdfsContext(session, schemaName, tableName), location.get(), external); + } + else { + // User specifies external property, but location property is absent + if (external) { + throw new PrestoException(NOT_SUPPORTED, format("Cannot create external Hive table without location. Set it through '%s' property", LOCATION_PROPERTY)); + } + + // User specifies the external location property + if (externalLocation != null) { + external = true; + targetPath = getPath(new HdfsContext(session, schemaName, tableName), externalLocation, true); + } + + // Default option + else { + external = false; + LocationHandle locationHandle = locationService.forNewTable(metastore, session, schemaName, tableName, Optional.empty(), Optional.empty(), HiveWriteUtils.OpertionType.CREATE_TABLE); + targetPath = locationService.getQueryWriteInfo(locationHandle).getTargetPath(); + } + } + + Table table = buildTableObject( + session.getQueryId(), + schemaName, + tableName, + session.getUser(), + columnHandles, + hiveStorageFormat, + partitionedBy, + bucketProperty, + tableProperties, + targetPath, + external, + prestoVersion); + PrincipalPrivileges principalPrivileges = MetastoreUtil.buildInitialPrivilegeSet(table.getOwner()); + HiveBasicStatistics basicStatistics = table.getPartitionColumns().isEmpty() ? HiveBasicStatistics.createZeroStatistics() : HiveBasicStatistics.createEmptyStatistics(); + metastore.createTable( + session, + table, + principalPrivileges, + Optional.empty(), + ignoreExisting, + new PartitionStatistics(basicStatistics, ImmutableMap.of())); + } + + protected Map getEmptyTableProperties(ConnectorTableMetadata tableMetadata, Optional bucketProperty, HdfsContext hdfsContext) + { + HiveStorageFormat hiveStorageFormat = HiveTableProperties.getHiveStorageFormat(tableMetadata.getProperties()); + ImmutableMap.Builder tableProperties = ImmutableMap.builder(); + + bucketProperty.ifPresent(hiveBucketProperty -> + tableProperties.put(BUCKETING_VERSION, Integer.toString(hiveBucketProperty.getBucketingVersion().getVersion()))); + + // ORC format specific properties + if (getTransactionalValue(tableMetadata.getProperties())) { + if (!hiveStorageFormat.equals(HiveStorageFormat.ORC)) { + // only ORC storage format support ACID + throw new PrestoException(NOT_SUPPORTED, "Only ORC storage format supports creating transactional table."); + } + + // set transactional property. 
+ tableProperties.put(TRANSACTIONAL, Boolean.toString(getTransactionalValue(tableMetadata.getProperties()))); + } + List columns = HiveTableProperties.getOrcBloomFilterColumns(tableMetadata.getProperties()); + if (columns != null && !columns.isEmpty()) { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.ORC, HiveTableProperties.ORC_BLOOM_FILTER_COLUMNS); + tableProperties.put(ORC_BLOOM_FILTER_COLUMNS_KEY, Joiner.on(",").join(columns)); + tableProperties.put(ORC_BLOOM_FILTER_FPP_KEY, String.valueOf(HiveTableProperties.getOrcBloomFilterFpp(tableMetadata.getProperties()))); + } + + // Avro specific properties + String avroSchemaUrl = HiveTableProperties.getAvroSchemaUrl(tableMetadata.getProperties()); + if (avroSchemaUrl != null) { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.AVRO, HiveTableProperties.AVRO_SCHEMA_URL); + tableProperties.put(AVRO_SCHEMA_URL_KEY, validateAndNormalizeAvroSchemaUrl(avroSchemaUrl, hdfsContext)); + } + + // Textfile specific properties + HiveTableProperties.getTextHeaderSkipCount(tableMetadata.getProperties()).ifPresent(headerSkipCount -> { + if (headerSkipCount > 0) { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.TEXTFILE, HiveTableProperties.TEXTFILE_SKIP_HEADER_LINE_COUNT); + tableProperties.put(TEXT_SKIP_HEADER_COUNT_KEY, String.valueOf(headerSkipCount)); + } + if (headerSkipCount < 0) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, String.format("Invalid value for %s property: %s", HiveTableProperties.TEXTFILE_SKIP_HEADER_LINE_COUNT, headerSkipCount)); + } + }); + + HiveTableProperties.getTextFooterSkipCount(tableMetadata.getProperties()).ifPresent(footerSkipCount -> { + if (footerSkipCount > 0) { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.TEXTFILE, HiveTableProperties.TEXTFILE_SKIP_FOOTER_LINE_COUNT); + tableProperties.put(TEXT_SKIP_FOOTER_COUNT_KEY, String.valueOf(footerSkipCount)); + } + if (footerSkipCount < 0) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, String.format("Invalid value for %s property: %s", HiveTableProperties.TEXTFILE_SKIP_FOOTER_LINE_COUNT, footerSkipCount)); + } + }); + + // CSV specific properties + HiveTableProperties.getCsvProperty(tableMetadata.getProperties(), HiveTableProperties.CSV_ESCAPE) + .ifPresent(escape -> { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.CSV, HiveTableProperties.CSV_ESCAPE); + tableProperties.put(CSV_ESCAPE_KEY, escape.toString()); + }); + HiveTableProperties.getCsvProperty(tableMetadata.getProperties(), HiveTableProperties.CSV_QUOTE) + .ifPresent(quote -> { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.CSV, HiveTableProperties.CSV_QUOTE); + tableProperties.put(CSV_QUOTE_KEY, quote.toString()); + }); + HiveTableProperties.getCsvProperty(tableMetadata.getProperties(), HiveTableProperties.CSV_SEPARATOR) + .ifPresent(separator -> { + checkFormatForProperty(hiveStorageFormat, HiveStorageFormat.CSV, HiveTableProperties.CSV_SEPARATOR); + tableProperties.put(CSV_SEPARATOR_KEY, separator.toString()); + }); + + // Table comment property + tableMetadata.getComment().ifPresent(value -> tableProperties.put(TABLE_COMMENT, value)); + + return tableProperties.build(); + } + + private static void checkFormatForProperty(HiveStorageFormat actualStorageFormat, HiveStorageFormat expectedStorageFormat, String propertyName) + { + if (actualStorageFormat != expectedStorageFormat) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("Cannot specify %s table property for storage format: %s", 
propertyName, actualStorageFormat)); + } + } + + private String validateAndNormalizeAvroSchemaUrl(String url, HdfsContext context) + { + try { + new URL(url).openStream().close(); + return url; + } + catch (MalformedURLException e) { + // try locally + if (new File(url).exists()) { + // hive needs url to have a protocol + return new File(url).toURI().toString(); + } + // try hdfs + try { + if (!hdfsEnvironment.getFileSystem(context, new Path(url)).exists(new Path(url))) { + throw new PrestoException(INVALID_TABLE_PROPERTY, "Cannot locate Avro schema file: " + url); + } + return url; + } + catch (IOException ex) { + throw new PrestoException(INVALID_TABLE_PROPERTY, "Avro schema file is not a valid file system URI: " + url, ex); + } + } + catch (IOException e) { + throw new PrestoException(INVALID_TABLE_PROPERTY, "Cannot open Avro schema file: " + url, e); + } + } + + protected Path getPath(HdfsContext context, String location, Boolean external) + { + try { + Path path = new Path(location); + if (!isS3FileSystem(context, hdfsEnvironment, path)) { + if (!hdfsEnvironment.getFileSystem(context, path).isDirectory(path)) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("Location is not a directory: %s", location)); + } + } + + if (!external) { + return new Path(path, context.getTableName().get()); + } + + return path; + } + catch (IllegalArgumentException | IOException e) { + throw new PrestoException(INVALID_TABLE_PROPERTY, "Location is not a valid file system URI", e); + } + } + + protected void checkPartitionTypesSupported(List partitionColumns) + { + for (Column partitionColumn : partitionColumns) { + Type partitionType = typeManager.getType(partitionColumn.getType().getTypeSignature()); + verifyPartitionTypeSupported(partitionColumn.getName(), partitionType); + } + } + + protected static Table buildTableObject( + String queryId, + String schemaName, + String tableName, + String tableOwner, + List columnHandles, + BaseStorageFormat hiveStorageFormat, + List partitionedBy, + Optional bucketProperty, + Map additionalTableParameters, + Path targetPath, + boolean external, + String prestoVersion) + { + return buildTableObject( + queryId, + schemaName, + tableName, + tableOwner, + columnHandles, + hiveStorageFormat, + partitionedBy, + bucketProperty, + additionalTableParameters, + targetPath, + external, + prestoVersion, + null); + } + + protected static Table buildTableObject( + String queryId, + String schemaName, + String tableName, + String tableOwner, + List columnHandles, + BaseStorageFormat hiveStorageFormat, + List partitionedBy, + Optional bucketProperty, + Map additionalTableParameters, + Path targetPath, + boolean external, + String prestoVersion, + Map serdeParameters) + { + Map columnHandlesByName = Maps.uniqueIndex(columnHandles, HiveColumnHandle::getName); + List partitionColumns = partitionedBy.stream() + .map(columnHandlesByName::get) + .map(column -> new Column(column.getName(), column.getHiveType(), column.getComment())) + .collect(toList()); + + Set partitionColumnNames = ImmutableSet.copyOf(partitionedBy); + + ImmutableList.Builder columns = ImmutableList.builder(); + for (HiveColumnHandle columnHandle : columnHandles) { + String name = columnHandle.getName(); + HiveType type = columnHandle.getHiveType(); + if (!partitionColumnNames.contains(name)) { + verify(!columnHandle.isPartitionKey(), "Column handles are not consistent with partitioned by property"); + columns.add(new Column(name, type, columnHandle.getComment())); + } + else { + 
verify(columnHandle.isPartitionKey(), "Column handles are not consistent with partitioned by property"); + } + } + + ImmutableMap.Builder<String, String> tableParameters = ImmutableMap.builder() + .put(PRESTO_VERSION_NAME, prestoVersion) + .put(PRESTO_QUERY_ID_NAME, queryId) + .putAll(additionalTableParameters); + + if (external) { + tableParameters.put("EXTERNAL", "TRUE"); + } + + Table.Builder tableBuilder = Table.builder() + .setDatabaseName(schemaName) + .setTableName(tableName) + .setOwner(tableOwner) + .setTableType((external ? EXTERNAL_TABLE : MANAGED_TABLE).name()) + .setDataColumns(columns.build()) + .setPartitionColumns(partitionColumns) + .setParameters(tableParameters.build()); + + tableBuilder.getStorageBuilder() + .setStorageFormat(StorageFormat.fromHiveStorageFormat(hiveStorageFormat)) + .setSerdeParameters(ImmutableMap.of(serdeConstants.SERIALIZATION_FORMAT, "1")) + .setBucketProperty(bucketProperty) + .setLocation(targetPath.toString()); + + if (null != serdeParameters) { + tableBuilder.getStorageBuilder().setSerdeParameters(serdeParameters); + } + + return tableBuilder.build(); + } + + @Override + public void addColumn(ConnectorSession session, ConnectorTableHandle tableHandle, ColumnMetadata column) + { + HiveTableHandle handle = (HiveTableHandle) tableHandle; + failIfAvroSchemaIsSet(session, handle); + HiveIdentity hiveIdentity = new HiveIdentity(session); + + Optional<Table>
table = metastore.getTable(hiveIdentity, handle.getSchemaName(), handle.getTableName()); + if (!table.isPresent()) { + throw new TableNotFoundException(handle.getSchemaTableName()); + } + verifyStorageFormatForCatalog(table.get().getStorage().getStorageFormat()); + metastore.addColumn(hiveIdentity, handle.getSchemaName(), handle.getTableName(), column.getName(), HiveType.toHiveType(typeTranslator, column.getType()), column.getComment()); + } + + @Override + public void renameColumn(ConnectorSession session, ConnectorTableHandle tableHandle, ColumnHandle source, String target) + { + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + failIfAvroSchemaIsSet(session, hiveTableHandle); + HiveColumnHandle sourceHandle = (HiveColumnHandle) source; + HiveIdentity hiveIdentity = new HiveIdentity(session); + Optional<Table>
table = metastore.getTable(hiveIdentity, hiveTableHandle.getSchemaName(), hiveTableHandle.getTableName()); + if (!table.isPresent()) { + throw new TableNotFoundException(hiveTableHandle.getSchemaTableName()); + } + verifyStorageFormatForCatalog(table.get().getStorage().getStorageFormat()); + metastore.renameColumn(hiveIdentity, hiveTableHandle.getSchemaName(), hiveTableHandle.getTableName(), sourceHandle.getName(), target); + } + + @Override + public void dropColumn(ConnectorSession session, ConnectorTableHandle tableHandle, ColumnHandle column) + { + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + failIfAvroSchemaIsSet(session, hiveTableHandle); + HiveColumnHandle columnHandle = (HiveColumnHandle) column; + HiveIdentity hiveIdentity = new HiveIdentity(session); + Optional<Table>
table = metastore.getTable(hiveIdentity, hiveTableHandle.getSchemaName(), hiveTableHandle.getTableName()); + if (!table.isPresent()) { + throw new TableNotFoundException(hiveTableHandle.getSchemaTableName()); + } + verifyStorageFormatForCatalog(table.get().getStorage().getStorageFormat()); + metastore.dropColumn(hiveIdentity, hiveTableHandle.getSchemaName(), hiveTableHandle.getTableName(), columnHandle.getName()); + } + + private void failIfAvroSchemaIsSet(ConnectorSession session, HiveTableHandle handle) + { + Table table = metastore.getTable(new HiveIdentity(session), handle.getSchemaName(), handle.getTableName()) + .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName())); + if (table.getParameters().containsKey(AVRO_SCHEMA_URL_KEY) || table.getStorage().getSerdeParameters().containsKey(AVRO_SCHEMA_URL_KEY)) { + throw new PrestoException(NOT_SUPPORTED, "ALTER TABLE not supported when Avro schema url is set"); + } + } + + @Override + public void renameTable(ConnectorSession session, ConnectorTableHandle tableHandle, SchemaTableName newTableName) + { + HiveTableHandle handle = (HiveTableHandle) tableHandle; + HiveIdentity hiveIdentity = new HiveIdentity(session); + Optional<Table>
target = metastore.getTable(hiveIdentity, handle.getSchemaName(), handle.getTableName()); + if (!target.isPresent()) { + throw new TableNotFoundException(handle.getSchemaTableName()); + } + verifyStorageFormatForCatalog(target.get().getStorage().getStorageFormat()); + metastore.renameTable(hiveIdentity, handle.getSchemaName(), handle.getTableName(), newTableName.getSchemaName(), newTableName.getTableName()); + } + + @Override + public void setTableComment(ConnectorSession session, ConnectorTableHandle tableHandle, Optional<String> comment) + { + HiveTableHandle handle = (HiveTableHandle) tableHandle; + HiveIdentity hiveIdentity = new HiveIdentity(session); + Optional<Table>
target = metastore.getTable(hiveIdentity, handle.getSchemaName(), handle.getTableName()); + if (!target.isPresent()) { + throw new TableNotFoundException(handle.getSchemaTableName()); + } + verifyStorageFormatForCatalog(target.get().getStorage().getStorageFormat()); + metastore.commentTable(hiveIdentity, handle.getSchemaName(), handle.getTableName(), comment); + } + + @Override + public void dropTable(ConnectorSession session, ConnectorTableHandle tableHandle) + { + HiveTableHandle handle = (HiveTableHandle) tableHandle; + Optional<Table>
target = metastore.getTable(new HiveIdentity(session), handle.getSchemaName(), handle.getTableName()); + if (!target.isPresent()) { + throw new TableNotFoundException(handle.getSchemaTableName()); + } + verifyStorageFormatForCatalog(target.get().getStorage().getStorageFormat()); + metastore.dropTable(session, handle.getSchemaName(), handle.getTableName()); + } + + @Override + public ConnectorTableHandle beginStatisticsCollection(ConnectorSession session, ConnectorTableHandle tableHandle) + { + SchemaTableName tableName = ((HiveTableHandle) tableHandle).getSchemaTableName(); + metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + return tableHandle; + } + + @Override + public void finishStatisticsCollection(ConnectorSession session, ConnectorTableHandle tableHandle, Collection computedStatistics) + { + HiveIdentity identity = new HiveIdentity(session); + HiveTableHandle handle = (HiveTableHandle) tableHandle; + SchemaTableName tableName = handle.getSchemaTableName(); + Table table = metastore.getTable(identity, tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName())); + + List partitionColumns = table.getPartitionColumns(); + List partitionColumnNames = partitionColumns.stream() + .map(Column::getName) + .collect(toImmutableList()); + List hiveColumnHandles = hiveColumnHandles(table); + Map columnTypes = hiveColumnHandles.stream() + .filter(columnHandle -> !columnHandle.isHidden()) + .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager))); + + Map, ComputedStatistics> computedStatisticsMap = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, partitionColumnNames, columnTypes); + + if (partitionColumns.isEmpty()) { + // commit analyze to unpartitioned table + metastore.setTableStatistics(identity, table, createPartitionStatistics(session, columnTypes, computedStatisticsMap.get(ImmutableList.of()))); + } + else { + List> partitionValuesList; + if (handle.getAnalyzePartitionValues().isPresent()) { + partitionValuesList = handle.getAnalyzePartitionValues().get(); + } + else { + partitionValuesList = metastore.getPartitionNames(identity, handle.getSchemaName(), handle.getTableName()) + .orElseThrow(() -> new TableNotFoundException(((HiveTableHandle) tableHandle).getSchemaTableName())) + .stream() + .map(HiveUtil::toPartitionValues) + .collect(toImmutableList()); + } + + ImmutableMap.Builder, PartitionStatistics> partitionStatistics = ImmutableMap.builder(); + Map> columnStatisticTypes = hiveColumnHandles.stream() + .filter(columnHandle -> !partitionColumnNames.contains(columnHandle.getName())) + .filter(column -> !column.isHidden()) + .collect(toImmutableMap(HiveColumnHandle::getName, column -> ImmutableSet.copyOf(metastore.getSupportedColumnStatistics(typeManager.getType(column.getTypeSignature()))))); + Supplier emptyPartitionStatistics = Suppliers.memoize(() -> Statistics.createEmptyPartitionStatistics(columnTypes, columnStatisticTypes)); + + int usedComputedStatistics = 0; + for (List partitionValues : partitionValuesList) { + ComputedStatistics collectedStatistics = computedStatisticsMap.get(partitionValues); + if (collectedStatistics == null) { + partitionStatistics.put(partitionValues, emptyPartitionStatistics.get()); + } + else { + usedComputedStatistics++; + partitionStatistics.put(partitionValues, createPartitionStatistics(session, 
columnTypes, collectedStatistics)); + } + } + verify(usedComputedStatistics == computedStatistics.size(), "All computed statistics must be used"); + metastore.setPartitionStatistics(identity, table, partitionStatistics.build()); + } + } + + @Override + public HiveOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional layout) + { + if (getExternalLocation(tableMetadata.getProperties()) != null || isExternalTable(tableMetadata.getProperties())) { + throw new PrestoException(NOT_SUPPORTED, "External tables cannot be created using CREATE TABLE AS"); + } + + if (HiveTableProperties.getAvroSchemaUrl(tableMetadata.getProperties()) != null) { + throw new PrestoException(NOT_SUPPORTED, "CREATE TABLE AS not supported when Avro schema url is set"); + } + + HiveStorageFormat tableStorageFormat = HiveTableProperties.getHiveStorageFormat(tableMetadata.getProperties()); + List partitionedBy = getPartitionedBy(tableMetadata.getProperties()); + Optional bucketProperty = HiveTableProperties.getBucketProperty(tableMetadata.getProperties()); + + // get the root directory for the database + SchemaTableName schemaTableName = tableMetadata.getTable(); + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + + Map tableProperties = getEmptyTableProperties(tableMetadata, bucketProperty, new HdfsContext(session, schemaName, tableName)); + List columnHandles = getColumnHandles(tableMetadata, ImmutableSet.copyOf(partitionedBy), typeTranslator); + HiveStorageFormat partitionStorageFormat = HiveSessionProperties.isRespectTableFormat(session) ? tableStorageFormat : HiveSessionProperties.getHiveStorageFormat(session); + + // unpartitioned tables ignore the partition storage format + HiveStorageFormat actualStorageFormat = partitionedBy.isEmpty() ? tableStorageFormat : partitionStorageFormat; + actualStorageFormat.validateColumns(columnHandles); + + Map columnHandlesByName = Maps.uniqueIndex(columnHandles, HiveColumnHandle::getName); + List partitionColumns = partitionedBy.stream() + .map(columnHandlesByName::get) + .map(column -> new Column(column.getName(), column.getHiveType(), column.getComment())) + .collect(toList()); + checkPartitionTypesSupported(partitionColumns); + + Optional location = getLocation(tableMetadata.getProperties()); + if (location.isPresent() && !tableCreatesWithLocationAllowed) { + throw new PrestoException(NOT_SUPPORTED, format("Setting %s property is not allowed", LOCATION_PROPERTY)); + } + + Optional writeIdInfo = Optional.empty(); + if (AcidUtils.isTransactionalTable(tableProperties)) { + //Create the HiveTableHandle for just to obtain writeIds. 
+ List partitionColumnHandles = partitionedBy.stream() + .map(columnHandlesByName::get) + .collect(toList()); + HiveTableHandle tableHandle = new HiveTableHandle(schemaName, + tableName, tableProperties, partitionColumnHandles, Optional.empty()); + Optional writeId = metastore.getTableWriteId(session, tableHandle, HiveACIDWriteType.INSERT); + if (!writeId.isPresent()) { + throw new IllegalStateException("No validWriteIds present"); + } + writeIdInfo = Optional.of(new WriteIdInfo(writeId.get(), writeId.get(), 0)); + } + + LocationHandle locationHandle; + if (location.isPresent()) { + Path path = getPath(new HdfsContext(session, schemaName, tableName), location.get(), false); + locationHandle = locationService.forNewTable(metastore, session, schemaName, tableName, writeIdInfo, Optional.of(path), HiveWriteUtils.OpertionType.CREATE_TABLE_AS); + } + else { + locationHandle = locationService.forNewTable(metastore, session, schemaName, tableName, writeIdInfo, Optional.empty(), HiveWriteUtils.OpertionType.CREATE_TABLE_AS); + } + HiveOutputTableHandle result = new HiveOutputTableHandle( + schemaName, + tableName, + columnHandles, + metastore.generatePageSinkMetadata(new HiveIdentity(session), schemaTableName), + locationHandle, + tableStorageFormat, + partitionStorageFormat, + partitionedBy, + bucketProperty, + session.getUser(), + tableProperties); + + LocationService.WriteInfo writeInfo = locationService.getQueryWriteInfo(locationHandle); + metastore.declareIntentionToWrite(session, writeInfo.getWriteMode(), writeInfo.getWritePath(), schemaTableName); + + return result; + } + + @Override + public Optional finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection fragments, Collection computedStatistics) + { + return finishCreateTable(session, tableHandle, fragments, computedStatistics, null); + } + + public Optional finishCreateTable( + ConnectorSession session, + ConnectorOutputTableHandle tableHandle, + Collection fragments, + Collection computedStatistics, + Map serdeParameters) + { + HiveOutputTableHandle handle = (HiveOutputTableHandle) tableHandle; + + List partitionUpdates = fragments.stream() + .map(Slice::getBytes) + .map(partitionUpdateCodec::fromJson) + .collect(toList()); + + LocationService.WriteInfo writeInfo = locationService.getQueryWriteInfo(handle.getLocationHandle()); + Table table = buildTableObject( + session.getQueryId(), + handle.getSchemaName(), + handle.getTableName(), + handle.getTableOwner(), + handle.getInputColumns(), + handle.getTableStorageFormat(), + handle.getPartitionedBy(), + handle.getBucketProperty(), + handle.getAdditionalTableParameters(), + writeInfo.getTargetPath(), + externalTable, + prestoVersion, + serdeParameters); + PrincipalPrivileges principalPrivileges = MetastoreUtil.buildInitialPrivilegeSet(handle.getTableOwner()); + + partitionUpdates = PartitionUpdate.mergePartitionUpdates(partitionUpdates); + + if (session.isSnapshotEnabled()) { + Set mergedFileNames = collectMergedFileNames(partitionUpdates); + updateSnapshotFiles(session, handle, false, mergedFileNames, OptionalLong.empty()); + // Remove suffix from file names in partition updates + partitionUpdates = updateSnapshotFileNames(partitionUpdates, session.getQueryId()); + } + + if (handle.getBucketProperty().isPresent() && HiveSessionProperties.isCreateEmptyBucketFiles(session)) { + List partitionUpdatesForMissingBuckets = computePartitionUpdatesForMissingBuckets(session, handle, table, partitionUpdates); + // replace partitionUpdates before creating the empty 
files so that those files will be cleaned up if we end up rollback + partitionUpdates = PartitionUpdate.mergePartitionUpdates(concat(partitionUpdates, partitionUpdatesForMissingBuckets)); + for (PartitionUpdate partitionUpdate : partitionUpdatesForMissingBuckets) { + Optional partition = table.getPartitionColumns().isEmpty() ? Optional.empty() : Optional.of(buildPartitionObject(session, table, partitionUpdate)); + createEmptyFiles(session, partitionUpdate.getWritePath(), table, partition, partitionUpdate.getFileNames()); + } + } + + Map columnTypes = handle.getInputColumns().stream() + .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager))); + Map, ComputedStatistics> partitionComputedStatistics = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, handle.getPartitionedBy(), columnTypes); + + PartitionStatistics tableStatistics; + if (table.getPartitionColumns().isEmpty()) { + HiveBasicStatistics basicStatistics = partitionUpdates.stream() + .map(PartitionUpdate::getStatistics) + .reduce((first, second) -> Statistics.reduce(first, second, Statistics.ReduceOperator.ADD)) + .orElse(HiveBasicStatistics.createZeroStatistics()); + tableStatistics = createPartitionStatistics(session, basicStatistics, columnTypes, getColumnStatistics(partitionComputedStatistics, ImmutableList.of())); + } + else { + tableStatistics = new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of()); + } + + metastore.createTable(session, table, principalPrivileges, Optional.of(writeInfo.getWritePath()), false, tableStatistics); + + if (!handle.getPartitionedBy().isEmpty()) { + if (HiveSessionProperties.isRespectTableFormat(session)) { + verify(handle.getPartitionStorageFormat() == handle.getTableStorageFormat()); + } + List> futures = partitionUpdates.stream().map(update -> + hiveMetastoreClientService.submit(() -> { + Partition partition = buildPartitionObject(session, table, update); + PartitionStatistics partitionStatistics = createPartitionStatistics( + session, + update.getStatistics(), + columnTypes, + getColumnStatistics(partitionComputedStatistics, partition.getValues())); + metastore.addPartition( + session, + handle.getSchemaName(), + handle.getTableName(), + buildPartitionObject(session, table, update), + update.getWritePath(), + partitionStatistics, + HiveACIDWriteType.NONE); + })).collect(toList()); + futures.forEach(future -> { + try { + future.get(); + } + catch (InterruptedException | ExecutionException ignore) { + } + }); + } + + return Optional.of(new HiveWrittenPartitions( + partitionUpdates.stream() + .map(PartitionUpdate::getName) + .collect(toList()))); + } + + private List computePartitionUpdatesForMissingBuckets( + ConnectorSession session, + HiveWritableTableHandle handle, + Table table, + List partitionUpdates) + { + ImmutableList.Builder partitionUpdatesForMissingBucketsBuilder = ImmutableList.builder(); + HiveStorageFormat storageFormat = table.getPartitionColumns().isEmpty() ? 
handle.getTableStorageFormat() : handle.getPartitionStorageFormat(); + for (PartitionUpdate partitionUpdate : partitionUpdates) { + int bucketCount = handle.getBucketProperty().get().getBucketCount(); + + List fileNamesForMissingBuckets = computeFileNamesForMissingBuckets( + session, + table, + storageFormat, + partitionUpdate.getTargetPath(), + bucketCount, + partitionUpdate); + partitionUpdatesForMissingBucketsBuilder.add(new PartitionUpdate( + partitionUpdate.getName(), + partitionUpdate.getUpdateMode(), + partitionUpdate.getWritePath(), + partitionUpdate.getTargetPath(), + fileNamesForMissingBuckets, + 0, + 0, + 0, + partitionUpdate.getMiscData())); + } + return partitionUpdatesForMissingBucketsBuilder.build(); + } + + private List computeFileNamesForMissingBuckets( + ConnectorSession session, + Table table, + HiveStorageFormat storageFormat, + Path targetPath, + int bucketCount, + PartitionUpdate partitionUpdate) + { + if (partitionUpdate.getFileNames().size() == bucketCount) { + // fast path for common case + return ImmutableList.of(); + } + HdfsContext hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName()); + JobConf conf = ConfigurationUtils.toJobConf(hdfsEnvironment.getConfiguration(hdfsContext, targetPath)); + String fileExtension = HiveWriterFactory.getFileExtension(conf, StorageFormat.fromHiveStorageFormat(storageFormat)); + Set fileNames = ImmutableSet.copyOf(partitionUpdate.getFileNames()); + ImmutableList.Builder missingFileNamesBuilder = ImmutableList.builder(); + for (int i = 0; i < bucketCount; i++) { + String fileName = HiveWriterFactory.computeBucketedFileName(session.getQueryId(), i) + fileExtension; + if (!fileNames.contains(fileName)) { + missingFileNamesBuilder.add(fileName); + } + } + List missingFileNames = missingFileNamesBuilder.build(); + verify(fileNames.size() + missingFileNames.size() == bucketCount); + return missingFileNames; + } + + private void createEmptyFiles(ConnectorSession session, Path path, Table table, Optional partition, List fileNames) + { + JobConf conf = ConfigurationUtils.toJobConf(hdfsEnvironment.getConfiguration(new HdfsContext(session, table.getDatabaseName(), table.getTableName()), path)); + + Properties schema; + StorageFormat format; + if (partition.isPresent()) { + schema = MetastoreUtil.getHiveSchema(partition.get(), table); + format = partition.get().getStorage().getStorageFormat(); + } + else { + schema = MetastoreUtil.getHiveSchema(table); + format = table.getStorage().getStorageFormat(); + } + hdfsEnvironment.doAs(session.getUser(), () -> { + for (String fileName : fileNames) { + writeEmptyFile(session, new Path(path, fileName), conf, schema, format.getSerDe(), format.getOutputFormat()); + } + }); + } + + private static void writeEmptyFile(ConnectorSession session, Path target, JobConf conf, Properties properties, String serDe, String outputFormatName) + { + // Some serializers such as Avro set a property in the schema. + HiveWriteUtils.initializeSerializer(conf, properties, serDe); + + // The code below is not a try with resources because RecordWriter is not Closeable. 
+ FileSinkOperator.RecordWriter recordWriter = HiveWriteUtils.createRecordWriter(target, conf, properties, outputFormatName, session); + try { + recordWriter.close(false); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_CLOSE_ERROR, "Error write empty file to Hive", e); + } + } + + @Override + public HiveInsertTableHandle beginInsert(ConnectorSession session, ConnectorTableHandle tableHandle) + { + return beginInsertUpdateInternal(session, tableHandle, Optional.empty(), HiveACIDWriteType.INSERT); + } + + @Override + public HiveInsertTableHandle beginInsert(ConnectorSession session, ConnectorTableHandle tableHandle, boolean isOverwrite) + { + return beginInsertUpdateInternal(session, tableHandle, Optional.empty(), HiveACIDWriteType.INSERT_OVERWRITE); + } + + private HiveInsertTableHandle beginInsertUpdateInternal(ConnectorSession session, ConnectorTableHandle tableHandle, + Optional partition, HiveACIDWriteType writeType) + { + HiveIdentity identity = new HiveIdentity(session); + SchemaTableName tableName = ((HiveTableHandle) tableHandle).getSchemaTableName(); + Table table = metastore.getTable(identity, tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + + verifyStorageFormatForCatalog(table.getStorage().getStorageFormat()); + + HiveWriteUtils.checkTableIsWritable(table, writesToNonManagedTablesEnabled, writeType); + + for (Column column : table.getDataColumns()) { + if (!HiveWriteUtils.isWritableType(column.getType())) { + throw new PrestoException(NOT_SUPPORTED, String.format("Inserting into Hive table %s with column type %s not supported", tableName, column.getType())); + } + } + + List handles = hiveColumnHandles(table).stream() + .filter(columnHandle -> !columnHandle.isHidden()) + .collect(toList()); + + if (partition.isPresent() && table.getPartitionColumns().isEmpty()) { + throw new PrestoException(GENERIC_USER_ERROR, String.format("Table %s not partitioned", tableName)); + } + + HiveStorageFormat tableStorageFormat = extractHiveStorageFormat(table); + if (tableStorageFormat == HiveStorageFormat.TEXTFILE) { + if (table.getParameters().containsKey(TEXT_SKIP_HEADER_COUNT_KEY)) { + throw new PrestoException(NOT_SUPPORTED, format("Inserting into Hive table with %s property not supported", TEXT_SKIP_HEADER_COUNT_KEY)); + } + if (table.getParameters().containsKey(TEXT_SKIP_FOOTER_COUNT_KEY)) { + throw new PrestoException(NOT_SUPPORTED, format("Inserting into Hive table with %s property not supported", TEXT_SKIP_FOOTER_COUNT_KEY)); + } + } + + Optional writeIdInfo = Optional.empty(); + if (AcidUtils.isTransactionalTable(((HiveTableHandle) tableHandle) + .getTableParameters().orElseThrow(() -> new IllegalStateException("tableParameters missing")))) { + Optional writeId = metastore.getTableWriteId(session, (HiveTableHandle) tableHandle, writeType); + if (!writeId.isPresent()) { + throw new IllegalStateException("No validWriteIds present"); + } + writeIdInfo = Optional.of(new WriteIdInfo(writeId.get(), writeId.get(), 0)); + } + + HiveWriteUtils.OpertionType operationType = HiveWriteUtils.OpertionType.INSERT; + boolean isInsertExistingPartitionsOverwrite = HiveSessionProperties.getInsertExistingPartitionsBehavior(session) == + HiveSessionProperties.InsertExistingPartitionsBehavior.OVERWRITE; + if (isInsertExistingPartitionsOverwrite || writeType == HiveACIDWriteType.INSERT_OVERWRITE) { + operationType = HiveWriteUtils.OpertionType.INSERT_OVERWRITE; + } + LocationHandle locationHandle = 
locationService.forExistingTable(metastore, session, table, writeIdInfo, operationType); + HiveInsertTableHandle result = new HiveInsertTableHandle( + tableName.getSchemaName(), + tableName.getTableName(), + handles, + metastore.generatePageSinkMetadata(identity, tableName), + locationHandle, + table.getStorage().getBucketProperty(), + tableStorageFormat, + HiveSessionProperties.isRespectTableFormat(session) ? tableStorageFormat : + HiveSessionProperties.getHiveStorageFormat(session), + writeType == HiveACIDWriteType.INSERT_OVERWRITE); + + LocationService.WriteInfo writeInfo = locationService.getQueryWriteInfo(locationHandle); + metastore.declareIntentionToWrite(session, writeInfo.getWriteMode(), writeInfo.getWritePath(), tableName); + return result; + } + + @Override + public Optional finishInsert(ConnectorSession session, + ConnectorInsertTableHandle insertHandle, + Collection fragments, + Collection computedStatistics) + { + return finishInsertInternal(session, insertHandle, fragments, computedStatistics, null, HiveACIDWriteType.INSERT); + } + + public Optional finishInsert(ConnectorSession session, + ConnectorInsertTableHandle insertHandle, + Collection fragments, + Collection computedStatistics, + List partitions) + { + return finishInsertInternal(session, insertHandle, fragments, computedStatistics, partitions, HiveACIDWriteType.INSERT); + } + + private Optional finishInsertInternal(ConnectorSession session, + ConnectorInsertTableHandle insertHandle, + Collection fragments, + Collection computedStatistics, + List partitions, + HiveACIDWriteType hiveACIDWriteType) + { + HiveInsertTableHandle handle = (HiveInsertTableHandle) insertHandle; + + List partitionUpdates = fragments.stream() + .map(Slice::getBytes) + .map(partitionUpdateCodec::fromJson) + .sorted(Comparator.comparing(PartitionUpdate::getName)) //sort partition updates to ensure same sequence of rename in case of + .collect(toList()); + + HiveStorageFormat tableStorageFormat = handle.getTableStorageFormat(); + partitionUpdates = PartitionUpdate.mergePartitionUpdates(partitionUpdates); + + if (session.isSnapshotEnabled()) { + Set mergedFileNames = collectMergedFileNames(partitionUpdates); + updateSnapshotFiles(session, handle, false, mergedFileNames, OptionalLong.empty()); + // Remove suffix from file names in partition updates + partitionUpdates = updateSnapshotFileNames(partitionUpdates, session.getQueryId()); + } + + Table table = metastore.getTable(new HiveIdentity(session), handle.getSchemaName(), handle.getTableName()) + .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName())); + if (!table.getStorage().getStorageFormat().getInputFormat().equals(tableStorageFormat.getInputFormat()) && HiveSessionProperties.isRespectTableFormat(session)) { + throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert"); + } + + if (handle.getBucketProperty().isPresent() && HiveSessionProperties.isCreateEmptyBucketFiles(session)) { + List partitionUpdatesForMissingBuckets = computePartitionUpdatesForMissingBuckets(session, handle, table, partitionUpdates); + // replace partitionUpdates before creating the empty files so that those files will be cleaned up if we end up rollback + partitionUpdates = PartitionUpdate.mergePartitionUpdates(concat(partitionUpdates, partitionUpdatesForMissingBuckets)); + for (PartitionUpdate partitionUpdate : partitionUpdatesForMissingBuckets) { + Optional partition = table.getPartitionColumns().isEmpty() ? 
Optional.empty() : Optional.of(buildPartitionObject(session, table, partitionUpdate)); + createEmptyFiles(session, partitionUpdate.getWritePath(), table, partition, partitionUpdate.getFileNames()); + } + } + + List partitionedBy = table.getPartitionColumns().stream() + .map(Column::getName) + .collect(toImmutableList()); + Map columnTypes = handle.getInputColumns().stream() + .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager))); + Map, ComputedStatistics> partitionComputedStatistics = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, partitionedBy, columnTypes); + + for (PartitionUpdate partitionUpdate : partitionUpdates) { + if (partitionUpdate.getName().isEmpty()) { + // insert into unpartitioned table + if (!table.getStorage().getStorageFormat().getInputFormat().equals(handle.getPartitionStorageFormat().getInputFormat()) && HiveSessionProperties.isRespectTableFormat(session)) { + throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert"); + } + + PartitionStatistics partitionStatistics = createPartitionStatistics( + session, + partitionUpdate.getStatistics(), + columnTypes, + getColumnStatistics(partitionComputedStatistics, ImmutableList.of())); + + if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.OVERWRITE) { + finishInsertOverwrite(session, handle, table, partitionUpdate, partitionStatistics); + } + else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.NEW || partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.APPEND) { + // insert into unpartitioned table + metastore.finishInsertIntoExistingTable( + session, + handle.getSchemaName(), + handle.getTableName(), + partitionUpdate.getWritePath(), + partitionUpdate.getFileNames(), + partitionStatistics, + hiveACIDWriteType); + } + else { + throw new IllegalArgumentException("Unsupported update mode: " + partitionUpdate.getUpdateMode()); + } + } + else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.APPEND) { + // insert into existing partition + List partitionValues = toPartitionValues(partitionUpdate.getName()); + PartitionStatistics partitionStatistics = createPartitionStatistics( + session, + partitionUpdate.getStatistics(), + columnTypes, + getColumnStatistics(partitionComputedStatistics, partitionValues)); + metastore.finishInsertIntoExistingPartition( + session, + handle.getSchemaName(), + handle.getTableName(), + partitionValues, + partitionUpdate.getWritePath(), + partitionUpdate.getFileNames(), + partitionStatistics, + hiveACIDWriteType); + } + else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.NEW || partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.OVERWRITE) { + finishInsertInNewPartition(session, handle, table, columnTypes, partitionUpdate, partitionComputedStatistics, hiveACIDWriteType); + } + else { + throw new IllegalArgumentException(format("Unsupported update mode: %s", partitionUpdate.getUpdateMode())); + } + } + + if (partitions != null) { + partitions.addAll(partitionUpdates); + } + + return Optional.of(new HiveWrittenPartitions( + partitionUpdates.stream() + .map(PartitionUpdate::getName) + .collect(toList()))); + } + + @Override + public HiveUpdateTableHandle beginUpdateAsInsert(ConnectorSession session, ConnectorTableHandle tableHandle) + { + HiveInsertTableHandle insertTableHandle = beginInsertUpdateInternal(session, tableHandle, Optional.empty(), HiveACIDWriteType.UPDATE); + return new 
HiveUpdateTableHandle(insertTableHandle.getSchemaName(), insertTableHandle.getTableName(), + insertTableHandle.getInputColumns(), insertTableHandle.getPageSinkMetadata(), + insertTableHandle.getLocationHandle(), insertTableHandle.getBucketProperty(), + insertTableHandle.getTableStorageFormat(), insertTableHandle.getPartitionStorageFormat()); + } + + @Override + public HiveDeleteAsInsertTableHandle beginDeletesAsInsert(ConnectorSession session, ConnectorTableHandle tableHandle) + { + HiveInsertTableHandle insertTableHandle = beginInsertUpdateInternal(session, tableHandle, Optional.empty(), HiveACIDWriteType.DELETE); + //Delete needs only partitionColumn and bucketing columns data + List inputColumns = insertTableHandle.getInputColumns().stream().filter(HiveColumnHandle::isRequired).collect(toList()); + return new HiveDeleteAsInsertTableHandle(insertTableHandle.getSchemaName(), insertTableHandle.getTableName(), + inputColumns, insertTableHandle.getPageSinkMetadata(), + insertTableHandle.getLocationHandle(), insertTableHandle.getBucketProperty(), + insertTableHandle.getTableStorageFormat(), insertTableHandle.getPartitionStorageFormat()); + } + + @Override + public Optional finishUpdateAsInsert(ConnectorSession session, ConnectorUpdateTableHandle updateHandle, Collection fragments, Collection computedStatistics) + { + HiveUpdateTableHandle updateTableHandle = (HiveUpdateTableHandle) updateHandle; + HiveInsertTableHandle insertTableHandle = new HiveInsertTableHandle(updateTableHandle.getSchemaName(), updateTableHandle.getTableName(), + updateTableHandle.getInputColumns(), updateTableHandle.getPageSinkMetadata(), + updateTableHandle.getLocationHandle(), updateTableHandle.getBucketProperty(), + updateTableHandle.getTableStorageFormat(), updateTableHandle.getPartitionStorageFormat(), false); + return finishInsertInternal(session, insertTableHandle, fragments, computedStatistics, null, HiveACIDWriteType.UPDATE); + } + + @Override + public Optional finishDeleteAsInsert(ConnectorSession session, ConnectorDeleteAsInsertTableHandle deleteHandle, Collection fragments, Collection computedStatistics) + { + HiveDeleteAsInsertTableHandle deleteTableHandle = (HiveDeleteAsInsertTableHandle) deleteHandle; + HiveInsertTableHandle insertTableHandle = new HiveInsertTableHandle(deleteTableHandle.getSchemaName(), deleteTableHandle.getTableName(), + deleteTableHandle.getInputColumns(), deleteTableHandle.getPageSinkMetadata(), + deleteTableHandle.getLocationHandle(), deleteTableHandle.getBucketProperty(), + deleteTableHandle.getTableStorageFormat(), deleteTableHandle.getPartitionStorageFormat(), false); + return finishInsertInternal(session, insertTableHandle, fragments, computedStatistics, null, HiveACIDWriteType.DELETE); + } + + @Override + public ConnectorVacuumTableHandle beginVacuum(ConnectorSession session, ConnectorTableHandle tableHandle, boolean full, boolean unify, Optional partition) + { + HiveInsertTableHandle insertTableHandle = beginInsertUpdateInternal(session, tableHandle, partition, unify ? 
HiveACIDWriteType.VACUUM_UNIFY : HiveACIDWriteType.VACUUM); + if ((!session.getSource().get().isEmpty()) && + session.getSource().get().equals("auto-vacuum")) { + metastore.setVacuumTableHandle((HiveTableHandle) tableHandle); + } + return new HiveVacuumTableHandle(insertTableHandle.getSchemaName(), insertTableHandle.getTableName(), + insertTableHandle.getInputColumns(), insertTableHandle.getPageSinkMetadata(), + insertTableHandle.getLocationHandle(), insertTableHandle.getBucketProperty(), + insertTableHandle.getTableStorageFormat(), insertTableHandle.getPartitionStorageFormat(), full, unify, null); + } + + @Override + public Optional finishVacuum(ConnectorSession session, ConnectorVacuumTableHandle handle, Collection fragments, Collection computedStatistics) + { + HiveVacuumTableHandle vacuumTableHandle = (HiveVacuumTableHandle) handle; + HiveInsertTableHandle insertTableHandle = new HiveInsertTableHandle(vacuumTableHandle.getSchemaName(), vacuumTableHandle.getTableName(), + vacuumTableHandle.getInputColumns(), vacuumTableHandle.getPageSinkMetadata(), + vacuumTableHandle.getLocationHandle(), vacuumTableHandle.getBucketProperty(), + vacuumTableHandle.getTableStorageFormat(), vacuumTableHandle.getPartitionStorageFormat(), false); + List partitionUpdates = new ArrayList<>(); + Optional connectorOutputMetadata = + finishInsertInternal(session, insertTableHandle, fragments, computedStatistics, partitionUpdates, + vacuumTableHandle.isUnifyVacuum() ? HiveACIDWriteType.VACUUM_UNIFY : HiveACIDWriteType.VACUUM); + + metastore.initiateVacuumCleanupTasks(vacuumTableHandle, session, partitionUpdates); + return connectorOutputMetadata; + } + + protected Partition buildPartitionObject(ConnectorSession session, Table table, PartitionUpdate partitionUpdate) + { + return Partition.builder() + .setDatabaseName(table.getDatabaseName()) + .setTableName(table.getTableName()) + .setColumns(table.getDataColumns()) + .setValues(HivePartitionManager.extractPartitionValues(partitionUpdate.getName())) + .setParameters(ImmutableMap.builder() + .put(PRESTO_VERSION_NAME, prestoVersion) + .put(PRESTO_QUERY_ID_NAME, session.getQueryId()) + .build()) + .withStorage(storage -> storage + .setStorageFormat(HiveSessionProperties.isRespectTableFormat(session) ? 
+ table.getStorage().getStorageFormat() : + StorageFormat.fromHiveStorageFormat(HiveSessionProperties.getHiveStorageFormat(session))) + .setLocation(partitionUpdate.getTargetPath().toString()) + .setBucketProperty(table.getStorage().getBucketProperty()) + .setSerdeParameters(table.getStorage().getSerdeParameters())) + .build(); + } + + private PartitionStatistics createPartitionStatistics( + ConnectorSession session, + Map columnTypes, + ComputedStatistics computedStatistics) + { + Map computedColumnStatistics = computedStatistics.getColumnStatistics(); + + Block rowCountBlock = Optional.ofNullable(computedStatistics.getTableStatistics().get(ROW_COUNT)) + .orElseThrow(() -> new VerifyException("rowCount not present")); + verify(!rowCountBlock.isNull(0), "rowCount must never be null"); + long rowCount = BIGINT.getLong(rowCountBlock, 0); + HiveBasicStatistics rowCountOnlyBasicStatistics = new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(rowCount), OptionalLong.empty(), OptionalLong.empty()); + return createPartitionStatistics(session, rowCountOnlyBasicStatistics, columnTypes, computedColumnStatistics); + } + + protected PartitionStatistics createPartitionStatistics( + ConnectorSession session, + HiveBasicStatistics basicStatistics, + Map columnTypes, + Map computedColumnStatistics) + { + long rowCount = basicStatistics.getRowCount().orElseThrow(() -> new IllegalArgumentException("rowCount not present")); + Map columnStatistics = Statistics.fromComputedStatistics( + session, + computedColumnStatistics, + columnTypes, + rowCount); + return new PartitionStatistics(basicStatistics, columnStatistics); + } + + protected static Map getColumnStatistics(Map, ComputedStatistics> statistics, List partitionValues) + { + return Optional.ofNullable(statistics.get(partitionValues)) + .map(ComputedStatistics::getColumnStatistics) + .orElse(ImmutableMap.of()); + } + + @Override + public void createView(ConnectorSession session, SchemaTableName viewName, ConnectorViewDefinition definition, boolean replace) + { + HiveIdentity identity = new HiveIdentity(session); + Map properties = ImmutableMap.builder() + .put(TABLE_COMMENT, "Presto View") + .put(PRESTO_VIEW_FLAG, "true") + .put(PRESTO_VERSION_NAME, prestoVersion) + .put(PRESTO_QUERY_ID_NAME, session.getQueryId()) + .build(); + + Column dummyColumn = new Column("dummy", HiveType.HIVE_STRING, Optional.empty()); + + Table.Builder tableBuilder = Table.builder() + .setDatabaseName(viewName.getSchemaName()) + .setTableName(viewName.getTableName()) + .setOwner(session.getUser()) + .setTableType(TableType.VIRTUAL_VIEW.name()) + .setDataColumns(ImmutableList.of(dummyColumn)) + .setPartitionColumns(ImmutableList.of()) + .setParameters(properties) + .setViewOriginalText(Optional.of(encodeViewData(definition))) + .setViewExpandedText(Optional.of("/* Presto View */")); + + tableBuilder.getStorageBuilder() + .setStorageFormat(StorageFormat.VIEW_STORAGE_FORMAT) + .setLocation(""); + Table table = tableBuilder.build(); + PrincipalPrivileges principalPrivileges = MetastoreUtil.buildInitialPrivilegeSet(session.getUser()); + + Optional
existing = metastore.getTable(identity, viewName.getSchemaName(), viewName.getTableName()); + if (existing.isPresent()) { + if (!replace || !HiveUtil.isPrestoView(existing.get())) { + throw new ViewAlreadyExistsException(viewName); + } + + metastore.replaceView(identity, viewName.getSchemaName(), viewName.getTableName(), table, principalPrivileges); + return; + } + + try { + metastore.createTable(session, table, principalPrivileges, Optional.empty(), false, new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of())); + } + catch (TableAlreadyExistsException e) { + throw new ViewAlreadyExistsException(e.getTableName()); + } + } + + @Override + public void dropView(ConnectorSession session, SchemaTableName viewName) + { + ConnectorViewDefinition view = getView(session, viewName) + .orElseThrow(() -> new ViewNotFoundException(viewName)); + + try { + metastore.dropTable(session, viewName.getSchemaName(), viewName.getTableName()); + } + catch (TableNotFoundException e) { + throw new ViewNotFoundException(e.getTableName()); + } + } + + @Override + public List listViews(ConnectorSession session, Optional optionalSchemaName) + { + ImmutableList.Builder tableNames = ImmutableList.builder(); + for (String schemaName : listSchemas(session, optionalSchemaName)) { + for (String tableName : metastore.getAllViews(schemaName).orElse(emptyList())) { + tableNames.add(new SchemaTableName(schemaName, tableName)); + } + } + return tableNames.build(); + } + + @Override + public Optional getView(ConnectorSession session, SchemaTableName viewName) + { + // support presto view and hive view + return metastore.getTable(new HiveIdentity(session), viewName.getSchemaName(), viewName.getTableName()) + .filter(HiveUtil::isView) + .map(view -> { + ConnectorViewDefinition definition; + if (isPrestoView(view)) { + definition = processPrestoView(view, viewName); + } + else { + // if type is hive view + definition = processHiveView(session, view, viewName); + } + + return definition; + }); + } + + private ConnectorViewDefinition processPrestoView(Table view, SchemaTableName viewName) + { + ConnectorViewDefinition definition = decodeViewData(view.getViewOriginalText() + .orElseThrow(() -> new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "No view original text: " + viewName))); + // use owner from table metadata if it exists + if (view.getOwner() != null && !definition.isRunAsInvoker()) { + definition = new ConnectorViewDefinition( + definition.getOriginalSql(), + definition.getCatalog(), + definition.getSchema(), + definition.getColumns(), + Optional.of(view.getOwner()), + false); + } + + return definition; + } + + // support hive view without no HQL and hive UDF + private ConnectorViewDefinition processHiveView(ConnectorSession session, Table view, SchemaTableName viewName) + { + // if the table has not set the schema,the viewOriginalText has no the schema,so we should read the viewExpandedText + String hiveViewQuery = view.getViewExpandedText() + .orElseThrow(() -> new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "No view original text: " + viewName)); + hiveViewQuery = hiveViewQuery.replace('`', '"'); + Optional owner; + if (view.getOwner() != null) { + String fullNameOwner = view.getOwner(); + int domainIndex = fullNameOwner.indexOf('@'); + owner = Optional.of((domainIndex < 0) ? 
fullNameOwner : fullNameOwner.substring(0, domainIndex)); + } + else { + owner = Optional.empty(); + } + // get column type from view + List viewColumns = new ArrayList<>(); + for (Column item : view.getDataColumns()) { + ConnectorViewDefinition.ViewColumn vc = new ConnectorViewDefinition.ViewColumn(item.getName(), item.getType().getTypeSignature()); + viewColumns.add(vc); + } + return new ConnectorViewDefinition( + hiveViewQuery, + Optional.of(getCatalogName(session)), + Optional.of(view.getDatabaseName()), + viewColumns, + owner, + !owner.isPresent()); + } + + private String getCatalogName(ConnectorSession session) + { + if (session.getCatalog().isPresent()) { + return session.getCatalog().get(); + } + return "hive"; + } + + @Override + public ConnectorTableHandle beginDelete(ConnectorSession session, ConnectorTableHandle tableHandle) + { + HiveTableHandle handle = (HiveTableHandle) tableHandle; + if (AcidUtils.isInsertOnlyTable(handle.getTableParameters().get())) { + throw new PrestoException(NOT_SUPPORTED, "Attempt to do delete on table " + handle.getTableName() + + " that is insert-only transactional"); + } + else { + throw new PrestoException(NOT_SUPPORTED, "This connector only supports delete where one or more partitions" + + " are deleted entirely for Non-Transactional tables"); + } + } + + @Override + public ColumnHandle getDeleteRowIdColumnHandle(ConnectorSession session, ConnectorTableHandle tableHandle) + { + return HiveColumnHandle.updateRowIdHandle(); + } + + @Override + public ColumnHandle getUpdateRowIdColumnHandle(ConnectorSession session, ConnectorTableHandle tableHandle, List updatedColumns) + { + return HiveColumnHandle.updateRowIdHandle(); + } + + @Override + public Optional applyDelete(ConnectorSession session, ConnectorTableHandle handle) + { + return Optional.of(handle); + } + + @Override + public Optional applyDelete(ConnectorSession session, ConnectorTableHandle handle, Constraint constraint) + { + HiveTableHandle hiveTableHandle = (HiveTableHandle) handle; + if (constraint == null) { + return Optional.of(handle); + } + SchemaTableName tableName = hiveTableHandle.getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + HiveIdentity identity = new HiveIdentity(session); + HivePartitionResult partitionResult = partitionManager.getPartitions(metastore, identity, handle, constraint, table); + HiveTableHandle newHandle = partitionManager.applyPartitionResult(hiveTableHandle, partitionResult); + return Optional.of(newHandle); + } + + @Override + public OptionalLong executeDelete(ConnectorSession session, ConnectorTableHandle deleteHandle) + { + HiveIdentity identity = new HiveIdentity(session); + HiveTableHandle handle = (HiveTableHandle) deleteHandle; + + Optional
table = metastore.getTable(identity, handle.getSchemaName(), handle.getTableName()); + if (!table.isPresent()) { + throw new TableNotFoundException(handle.getSchemaTableName()); + } + + if (table.get().getPartitionColumns().isEmpty()) { + metastore.truncateUnpartitionedTable(session, handle.getSchemaName(), handle.getTableName()); + } + else { + for (HivePartition hivePartition : partitionManager.getOrLoadPartitions(session, metastore, identity, handle)) { + metastore.dropPartition(session, handle.getSchemaName(), handle.getTableName(), toPartitionValues(hivePartition.getPartitionId())); + } + } + // it is too expensive to determine the exact number of deleted rows + return OptionalLong.empty(); + } + + @VisibleForTesting + static Predicate> convertToPredicate(TupleDomain tupleDomain) + { + return bindings -> tupleDomain.contains(TupleDomain.fromFixedValues(bindings)); + } + + @Override + public boolean usesLegacyTableLayouts() + { + return false; + } + + @Override + public ConnectorTableProperties getTableProperties(ConnectorSession session, ConnectorTableHandle table) + { + HiveIdentity identity = new HiveIdentity(session); + HiveTableHandle hiveTable = (HiveTableHandle) table; + + List partitionColumns = ImmutableList.copyOf(hiveTable.getPartitionColumns()); + List partitions = partitionManager.getOrLoadPartitions(session, metastore, identity, hiveTable); + + TupleDomain predicate = createPredicate(partitionColumns, partitions); + + if (hiveTable.isSuitableToPush()) { + Table hmsTable = metastore.getTable(identity, hiveTable.getSchemaName(), hiveTable.getTableName()) + .orElseThrow(() -> new TableNotFoundException(hiveTable.getSchemaTableName())); + + ImmutableMap.Builder pushedDown = ImmutableMap.builder(); + pushedDown.putAll(hiveTable.getCompactEffectivePredicate().getDomains().get().entrySet().stream() + .collect(toMap(e -> (ColumnHandle) e.getKey(), e -> e.getValue()))); + + predicate = predicate.intersect(withColumnDomains(pushedDown.build())); + } + + Optional discretePredicates = Optional.empty(); + if (!partitionColumns.isEmpty()) { + // Do not create tuple domains for every partition at the same time! + // There can be a huge number of partitions so use an iterable so + // all domains do not need to be in memory at the same time. 
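+ // Iterables.transform returns a lazy view, so each partition's TupleDomain is only built while the
+ // iterable is being consumed.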
+ Iterable> partitionDomains = Iterables.transform(partitions, (hivePartition) -> TupleDomain.fromFixedValues(hivePartition.getKeys())); + discretePredicates = Optional.of(new DiscretePredicates(partitionColumns, partitionDomains)); + } + + Optional tablePartitioning = Optional.empty(); + if (HiveSessionProperties.isBucketExecutionEnabled(session) && hiveTable.getBucketHandle().isPresent()) { + tablePartitioning = hiveTable.getBucketHandle().map(bucketing -> new ConnectorTablePartitioning( + new HivePartitioningHandle( + bucketing.getBucketingVersion(), + bucketing.getReadBucketCount(), + bucketing.getColumns().stream() + .map(HiveColumnHandle::getHiveType) + .collect(toImmutableList()), + OptionalInt.empty()), + bucketing.getColumns().stream() + .map(ColumnHandle.class::cast) + .collect(toList()))); + } + + return new ConnectorTableProperties( + predicate, + tablePartitioning, + Optional.empty(), + discretePredicates, + ImmutableList.of()); + } + + @Override + public Optional> applyFilter(ConnectorSession session, ConnectorTableHandle tableHandle, Constraint constraint) + { + return applyFilter(session, tableHandle, constraint, ImmutableList.of(), ImmutableSet.of(), false); + } + + @Override + public Optional> applyFilter(ConnectorSession session, ConnectorTableHandle tableHandle, + Constraint constraint, List disjuctConstaints, + Set allColumnHandles, + boolean pushPartitionsOnly) + { + HiveIdentity identity = new HiveIdentity(session); + HiveTableHandle handle = (HiveTableHandle) tableHandle; + checkArgument(!handle.getAnalyzePartitionValues().isPresent() || constraint.getSummary().isAll(), "Analyze should not have a constraint"); + + SchemaTableName tableName = handle.getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + + HivePartitionResult partitionResult = partitionManager.getPartitions(metastore, identity, handle, constraint, table); + + HiveTableHandle newHandle = partitionManager.applyPartitionResult(handle, partitionResult); + + // the goal here is to pushdown all the constraints/predicates to HivePageSourceProvider + // in case some pre-filtering can be done using the heuristic-index + // however, during scheduling we can't be sure a column will have a heuristic-index. 
+ // therefore, filtering should still be done using the filter operator, + // hence the unenforced constraints below includes all constraints (minus partitions) + ImmutableMap.Builder pushedDown = ImmutableMap.builder(); + pushedDown.putAll(partitionResult.getUnenforcedConstraint().getDomains().get().entrySet().stream() + .collect(toMap(e -> (HiveColumnHandle) e.getKey(), e -> e.getValue()))); + + TupleDomain newEffectivePredicate = newHandle.getCompactEffectivePredicate() + .intersect(handle.getCompactEffectivePredicate()) + .intersect(withColumnDomains(pushedDown.build())); + + ImmutableList.Builder> builder = ImmutableList.builder(); + disjuctConstaints.stream().forEach(c -> { + TupleDomain newSubDomain = withColumnDomains(c.getSummary() + .getDomains().get().entrySet() + .stream().collect(toMap(e -> (HiveColumnHandle) e.getKey(), e -> e.getValue()))) + .subtract(newEffectivePredicate); + if (!newSubDomain.isNone()) { + builder.add(newSubDomain); + } + }); + + // Get list of all columns involved in predicate + Set predicateColumnNames = new HashSet<>(); + newEffectivePredicate.getDomains().get().keySet().stream() + .map(HiveColumnHandle::getColumnName) + .forEach(predicateColumnNames::add); + + List> newEffectivePredicates = null; + boolean isSuitableToPush = false; + if (HiveSessionProperties.isOrcPredicatePushdownEnabled(session)) { + isSuitableToPush = checkIfSuitableToPush(allColumnHandles, tableHandle, session); + } + + if (isSuitableToPush && HiveSessionProperties.isOrcDisjunctPredicatePushdownEnabled(session)) { + newEffectivePredicates = builder.build(); + + newEffectivePredicates.stream().forEach(nfp -> + nfp.getDomains().get().keySet().stream() + .map(HiveColumnHandle::getColumnName) + .forEach(predicateColumnNames::add)); + } + + if (isSuitableToPush + && partitionResult.getEnforcedConstraint().equals(newEffectivePredicate) + && (newEffectivePredicates == null || newEffectivePredicates.size() == 0)) { + isSuitableToPush = false; + } + + // Get column handle + Map columnHandles = getColumnHandles(table); + + // map predicate columns to hive column handles + Map predicateColumns = predicateColumnNames.stream() + .map(columnHandles::get) + .map(HiveColumnHandle.class::cast) + .filter(HiveColumnHandle::isRegular) + .collect(toImmutableMap(HiveColumnHandle::getName, identity())); + + newHandle = new HiveTableHandle( + newHandle.getSchemaName(), + newHandle.getTableName(), + newHandle.getTableParameters(), + newHandle.getPartitionColumns(), + newHandle.getPartitions(), + newEffectivePredicate, + newHandle.getEnforcedConstraint(), + newHandle.getBucketHandle(), + newHandle.getBucketFilter(), + newHandle.getAnalyzePartitionValues(), + predicateColumns, + Optional.ofNullable(newEffectivePredicates), + isSuitableToPush, + newHandle.getOffloadExpression()); + + if (pushPartitionsOnly && handle.getPartitions().equals(newHandle.getPartitions()) && + handle.getCompactEffectivePredicate().equals(newHandle.getCompactEffectivePredicate()) && + handle.getBucketFilter().equals(newHandle.getBucketFilter())) { + return Optional.empty(); + } + + if (!pushPartitionsOnly && isSuitableToPush) { + return Optional.of(new ConstraintApplicationResult<>(newHandle, TupleDomain.all())); + } + + // note here that all unenforced constraints will still be applied using the filter operator + return Optional.of(new ConstraintApplicationResult<>(newHandle, partitionResult.getUnenforcedConstraint())); + } + + /** + * This function will be called only user enabled pushdown (i.e. 
orc_predicate_pushdown_enabled=true). + * Then further check if pushdown can be supported by connector. It support iff below all condition satisfies. + * 1. Storage Format should be only ORC. + * 2. Table to be scanned is not transactional table (so effectively DELETE/UPDATE also not supported). + * 3. Also columns part of the scan are of any primitive data-type except byte. + * NOTE: This should be adjusted as we continue to support additional functionality. + * @param allColumnHandles set of all column handles being part of scan. + * @param tableHandle table handle + * @param session current session handler + * @return return true if current scan can support pushdown otherwise false. + */ + protected boolean checkIfSuitableToPush(Set allColumnHandles, ConnectorTableHandle tableHandle, ConnectorSession session) + { + // We allow predicate pushdown only for non-transaction table of HIVE ORC storage format. + if (getHiveStorageFormat(getTableMetadata(session, tableHandle).getProperties()) != ORC + || getTransactionalValue(getTableMetadata(session, tableHandle).getProperties())) { + return false; + } + + for (ColumnHandle handle : allColumnHandles) { + HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) handle; + // Non-primitive data type(e.g. STRUCT, MAP, LIST) and BYTE are not supported to pushdown. + // UPDATE/DELETE which has explicit column $rowId of STRUCT Type, will be not allowed to pushdown. + // NOTE: Incase STRUCT type supported, support of UPDATE/DELETE should be checked or should be handled here. + if (hiveColumnHandle.getHiveType().getCategory().equals(PRIMITIVE) == false + || hiveColumnHandle.getHiveType().equals(HiveType.HIVE_BYTE)) { + return false; + } + } + + return true; + } + + @Override + public Optional getCommonPartitioningHandle(ConnectorSession session, ConnectorPartitioningHandle left, ConnectorPartitioningHandle right) + { + HivePartitioningHandle leftHandle = (HivePartitioningHandle) left; + HivePartitioningHandle rightHandle = (HivePartitioningHandle) right; + + if (!leftHandle.getHiveTypes().equals(rightHandle.getHiveTypes())) { + return Optional.empty(); + } + if (leftHandle.getBucketingVersion() != rightHandle.getBucketingVersion()) { + return Optional.empty(); + } + if (leftHandle.getBucketCount() == rightHandle.getBucketCount()) { + return Optional.of(leftHandle); + } + if (!HiveSessionProperties.isOptimizedMismatchedBucketCount(session)) { + return Optional.empty(); + } + + int largerBucketCount = Math.max(leftHandle.getBucketCount(), rightHandle.getBucketCount()); + int smallerBucketCount = Math.min(leftHandle.getBucketCount(), rightHandle.getBucketCount()); + if (largerBucketCount % smallerBucketCount != 0) { + // must be evenly divisible + return Optional.empty(); + } + if (Integer.bitCount(largerBucketCount / smallerBucketCount) != 1) { + // ratio must be power of two + return Optional.empty(); + } + + OptionalInt maxCompatibleBucketCount = min(leftHandle.getMaxCompatibleBucketCount(), rightHandle.getMaxCompatibleBucketCount()); + if (maxCompatibleBucketCount.isPresent() && maxCompatibleBucketCount.getAsInt() < smallerBucketCount) { + // maxCompatibleBucketCount must be larger than or equal to smallerBucketCount + // because the current code uses the smallerBucketCount as the common partitioning handle. 
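+ // Illustrative example: with bucket counts 32 and 8 the smaller count is 8, so a declared
+ // maxCompatibleBucketCount of 4 would rule out a common partitioning handle at this point.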
+ return Optional.empty(); + } + + return Optional.of(new HivePartitioningHandle( + leftHandle.getBucketingVersion(), // same as rightHandle.getBucketingVersion() + smallerBucketCount, + leftHandle.getHiveTypes(), + maxCompatibleBucketCount)); + } + + private static OptionalInt min(OptionalInt left, OptionalInt right) + { + if (!left.isPresent()) { + return right; + } + if (!right.isPresent()) { + return left; + } + return OptionalInt.of(Math.min(left.getAsInt(), right.getAsInt())); + } + + @Override + public ConnectorTableHandle makeCompatiblePartitioning(ConnectorSession session, ConnectorTableHandle tableHandle, ConnectorPartitioningHandle partitioningHandle) + { + HiveTableHandle hiveTable = (HiveTableHandle) tableHandle; + HivePartitioningHandle hivePartitioningHandle = (HivePartitioningHandle) partitioningHandle; + + checkArgument(hiveTable.getBucketHandle().isPresent(), "Hive connector only provides alternative layout for bucketed table"); + HiveBucketHandle bucketHandle = hiveTable.getBucketHandle().get(); + ImmutableList bucketTypes = bucketHandle.getColumns().stream().map(HiveColumnHandle::getHiveType).collect(toImmutableList()); + checkArgument( + hivePartitioningHandle.getHiveTypes().equals(bucketTypes), + "Types from the new PartitioningHandle (%s) does not match the TableHandle (%s)", + hivePartitioningHandle.getHiveTypes(), + bucketTypes); + int largerBucketCount = Math.max(bucketHandle.getTableBucketCount(), hivePartitioningHandle.getBucketCount()); + int smallerBucketCount = Math.min(bucketHandle.getTableBucketCount(), hivePartitioningHandle.getBucketCount()); + checkArgument( + largerBucketCount % smallerBucketCount == 0 && Integer.bitCount(largerBucketCount / smallerBucketCount) == 1, + "The requested partitioning is not a valid alternative for the table layout"); + + return new HiveTableHandle( + hiveTable.getSchemaName(), + hiveTable.getTableName(), + hiveTable.getTableParameters(), + hiveTable.getPartitionColumns(), + hiveTable.getPartitions(), + hiveTable.getCompactEffectivePredicate(), + hiveTable.getEnforcedConstraint(), + Optional.of(new HiveBucketHandle( + bucketHandle.getColumns(), + bucketHandle.getBucketingVersion(), + bucketHandle.getTableBucketCount(), + hivePartitioningHandle.getBucketCount())), + hiveTable.getBucketFilter(), + hiveTable.getAnalyzePartitionValues(), + hiveTable.getPredicateColumns(), + hiveTable.getDisjunctCompactEffectivePredicate(), + hiveTable.isSuitableToPush(), + hiveTable.getOffloadExpression()); + } + + @VisibleForTesting + static TupleDomain createPredicate(List partitionColumns, List partitions) + { + if (partitions.isEmpty()) { + return TupleDomain.none(); + } + + return withColumnDomains( + partitionColumns.stream() + .collect(toMap(identity(), column -> buildColumnDomain(column, partitions)))); + } + + private static Domain buildColumnDomain(ColumnHandle column, List partitions) + { + checkArgument(!partitions.isEmpty(), "partitions cannot be empty"); + + boolean hasNull = false; + List nonNullValues = new ArrayList<>(); + Type type = null; + + for (HivePartition partition : partitions) { + NullableValue value = partition.getKeys().get(column); + if (value == null) { + throw new PrestoException(HiveErrorCode.HIVE_UNKNOWN_ERROR, format("Partition %s does not have a value for partition column %s", partition, column)); + } + + if (value.isNull()) { + hasNull = true; + } + else { + nonNullValues.add(value.getValue()); + } + + if (type == null) { + type = value.getType(); + } + } + + if (!nonNullValues.isEmpty()) { + Domain domain = 
Domain.multipleValues(type, nonNullValues); + if (hasNull) { + return domain.union(Domain.onlyNull(type)); + } + + return domain; + } + + return Domain.onlyNull(type); + } + + @Override + public Optional getInsertLayout(ConnectorSession session, ConnectorTableHandle tableHandle) + { + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + SchemaTableName tableName = hiveTableHandle.getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + + return getInsertTableLayoutInternal(session, table); + } + + private Optional getInsertTableLayoutInternal(ConnectorSession session, Table table) + { + if (table.getStorage().getBucketProperty().isPresent()) { + if (bucketedOnTimestamp(table.getStorage().getBucketProperty().get(), table)) { + throw new PrestoException(NOT_SUPPORTED, "Writing to tables bucketed on timestamp not supported"); + } + } + + Optional hiveBucketHandle = HiveBucketing.getHiveBucketHandle(table); + if (!hiveBucketHandle.isPresent()) { + // return preferred layout which is partitioned by partition columns + List partitionColumns = table.getPartitionColumns(); + if (partitionColumns.isEmpty() || !HiveSessionProperties.isWritePartitionDistributionEnabled(session)) { + return Optional.empty(); + } + + return Optional.of(new ConnectorNewTableLayout( + partitionColumns.stream() + .map(Column::getName) + .collect(toImmutableList()))); + } + HiveBucketProperty bucketProperty = table.getStorage().getBucketProperty() + .orElseThrow(() -> new NoSuchElementException("Bucket property should be set")); + if (!bucketProperty.getSortedBy().isEmpty() && !HiveSessionProperties.isSortedWritingEnabled(session)) { + throw new PrestoException(NOT_SUPPORTED, "Writing to bucketed sorted Hive tables is disabled"); + } + + HivePartitioningHandle partitioningHandle = new HivePartitioningHandle( + hiveBucketHandle.get().getBucketingVersion(), + hiveBucketHandle.get().getTableBucketCount(), + hiveBucketHandle.get().getColumns().stream() + .map(HiveColumnHandle::getHiveType) + .collect(toList()), + OptionalInt.of(hiveBucketHandle.get().getTableBucketCount())); + List partitionColumns = hiveBucketHandle.get().getColumns().stream() + .map(HiveColumnHandle::getName) + .collect(toList()); + return Optional.of(new ConnectorNewTableLayout(partitioningHandle, partitionColumns)); + } + + @Override + public Optional getUpdateLayout(ConnectorSession session, ConnectorTableHandle tableHandle) + { + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + SchemaTableName tableName = hiveTableHandle.getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + + //When bucketing is not enabled, do the bucketing based on the RowId handle for update/delete + //In case of partitioned table, include the partition columns also for the parallel partitioning of writes. 
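+ // The preferred update layout therefore partitions on every table partition column followed by the
+ // synthetic row-id column appended below.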
+ List tablePartitionColumns = table.getPartitionColumns(); + List partitionColumnNames = new ArrayList<>(); + List partitionColumnTypes = new ArrayList<>(); + tablePartitionColumns.forEach(column -> { + partitionColumnNames.add(column.getName()); + partitionColumnTypes.add(column.getType()); + }); + partitionColumnNames.add(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME.toLowerCase(ENGLISH)); + partitionColumnTypes.add(HiveColumnHandle.updateRowIdHandle().getHiveType()); + HivePartitioningHandle partitioningHandle = new HivePartitioningHandle( + BucketingVersion.BUCKETING_V2, + HiveBucketing.MAX_BUCKET_NUMBER, + partitionColumnTypes, + OptionalInt.empty(), + true); + return Optional.of(new ConnectorNewTableLayout(partitioningHandle, partitionColumnNames)); + } + + @Override + public Optional getNewTableLayout(ConnectorSession session, ConnectorTableMetadata tableMetadata) + { + validatePartitionColumns(tableMetadata); + validateBucketColumns(tableMetadata); + validateCsvColumns(tableMetadata); + Optional bucketProperty = HiveTableProperties.getBucketProperty(tableMetadata.getProperties()); + if (!bucketProperty.isPresent()) { + // return preferred layout which is partitioned by partition columns + List partitionedBy = getPartitionedBy(tableMetadata.getProperties()); + if (partitionedBy.isEmpty() || !HiveSessionProperties.isWritePartitionDistributionEnabled(session)) { + return Optional.empty(); + } + + return Optional.of(new ConnectorNewTableLayout(partitionedBy)); + } + if (!bucketProperty.get().getSortedBy().isEmpty() && !HiveSessionProperties.isSortedWritingEnabled(session)) { + throw new PrestoException(NOT_SUPPORTED, "Writing to bucketed sorted Hive tables is disabled"); + } + + List bucketedBy = bucketProperty.get().getBucketedBy(); + Map hiveTypeMap = tableMetadata.getColumns().stream() + .collect(toMap(ColumnMetadata::getName, column -> HiveType.toHiveType(typeTranslator, column.getType()))); + return Optional.of(new ConnectorNewTableLayout( + new HivePartitioningHandle( + bucketProperty.get().getBucketingVersion(), + bucketProperty.get().getBucketCount(), + bucketedBy.stream() + .map(hiveTypeMap::get) + .collect(toList()), + OptionalInt.of(bucketProperty.get().getBucketCount())), + bucketedBy)); + } + + @Override + public TableStatisticsMetadata getStatisticsCollectionMetadataForWrite(ConnectorSession session, ConnectorTableMetadata tableMetadata) + { + if (!HiveSessionProperties.isCollectColumnStatisticsOnWrite(session)) { + return TableStatisticsMetadata.empty(); + } + List partitionedBy = firstNonNull(getPartitionedBy(tableMetadata.getProperties()), ImmutableList.of()); + return getStatisticsCollectionMetadata(tableMetadata.getColumns(), partitionedBy, false); + } + + @Override + public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession session, ConnectorTableMetadata tableMetadata) + { + List partitionedBy = firstNonNull(getPartitionedBy(tableMetadata.getProperties()), ImmutableList.of()); + return getStatisticsCollectionMetadata(tableMetadata.getColumns(), partitionedBy, true); + } + + private TableStatisticsMetadata getStatisticsCollectionMetadata(List columns, List partitionedBy, boolean includeRowCount) + { + Set columnStatistics = columns.stream() + .filter(column -> !partitionedBy.contains(column.getName())) + .filter(column -> !column.isHidden()) + .map(this::getColumnStatisticMetadata) + .flatMap(List::stream) + .collect(toImmutableSet()); + + Set tableStatistics = includeRowCount ? 
ImmutableSet.of(ROW_COUNT) : ImmutableSet.of(); + return new TableStatisticsMetadata(columnStatistics, tableStatistics, partitionedBy); + } + + private List getColumnStatisticMetadata(ColumnMetadata columnMetadata) + { + return getColumnStatisticMetadata(columnMetadata.getName(), metastore.getSupportedColumnStatistics(columnMetadata.getType())); + } + + private List getColumnStatisticMetadata(String columnName, Set statisticTypes) + { + return statisticTypes.stream() + .map(type -> new ColumnStatisticMetadata(columnName, type)) + .collect(toImmutableList()); + } + + @Override + public void createRole(ConnectorSession session, String role, Optional grantor) + { + accessControlMetadata.createRole(session, role, grantor.map(HivePrincipal::from)); + } + + @Override + public void dropRole(ConnectorSession session, String role) + { + accessControlMetadata.dropRole(session, role); + } + + @Override + public Set listRoles(ConnectorSession session) + { + return accessControlMetadata.listRoles(session); + } + + @Override + public Set listRoleGrants(ConnectorSession session, PrestoPrincipal principal) + { + return ImmutableSet.copyOf(accessControlMetadata.listRoleGrants(session, HivePrincipal.from(principal))); + } + + @Override + public void grantRoles(ConnectorSession session, Set roles, Set grantees, boolean withAdminOption, Optional grantor) + { + accessControlMetadata.grantRoles(session, roles, HivePrincipal.from(grantees), withAdminOption, grantor.map(HivePrincipal::from)); + } + + @Override + public void revokeRoles(ConnectorSession session, Set roles, Set grantees, boolean adminOptionFor, Optional grantor) + { + accessControlMetadata.revokeRoles(session, roles, HivePrincipal.from(grantees), adminOptionFor, grantor.map(HivePrincipal::from)); + } + + @Override + public Set listApplicableRoles(ConnectorSession session, PrestoPrincipal principal) + { + return accessControlMetadata.listApplicableRoles(session, HivePrincipal.from(principal)); + } + + @Override + public Set listEnabledRoles(ConnectorSession session) + { + return accessControlMetadata.listEnabledRoles(session); + } + + @Override + public void grantTablePrivileges(ConnectorSession session, SchemaTableName schemaTableName, Set privileges, PrestoPrincipal grantee, boolean grantOption) + { + accessControlMetadata.grantTablePrivileges(session, schemaTableName, privileges, HivePrincipal.from(grantee), grantOption); + } + + @Override + public void revokeTablePrivileges(ConnectorSession session, SchemaTableName schemaTableName, Set privileges, PrestoPrincipal grantee, boolean grantOption) + { + accessControlMetadata.revokeTablePrivileges(session, schemaTableName, privileges, HivePrincipal.from(grantee), grantOption); + } + + @Override + public List listTablePrivileges(ConnectorSession session, SchemaTablePrefix schemaTablePrefix) + { + return accessControlMetadata.listTablePrivileges(session, listTables(session, schemaTablePrefix)); + } + + public static HiveStorageFormat extractHiveStorageFormat(Table table) + { + StorageFormat storageFormat = table.getStorage().getStorageFormat(); + String outputFormat = storageFormat.getOutputFormat(); + String serde = storageFormat.getSerDe(); + for (HiveStorageFormat format : HiveStorageFormat.values()) { + if (format.getOutputFormat().equals(outputFormat) && format.getSerDe().equals(serde)) { + return format; + } + } + throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, format("Output format %s with SerDe %s is not supported", outputFormat, serde)); + } + + protected void 
validateBucketColumns(ConnectorTableMetadata tableMetadata) + { + Optional bucketProperty = HiveTableProperties.getBucketProperty(tableMetadata.getProperties()); + if (!bucketProperty.isPresent()) { + return; + } + Set allColumns = tableMetadata.getColumns().stream() + .map(ColumnMetadata::getName) + .collect(toSet()); + + List bucketedBy = bucketProperty.get().getBucketedBy(); + if (!allColumns.containsAll(bucketedBy)) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("Bucketing columns %s not present in schema", Sets.difference(ImmutableSet.copyOf(bucketedBy), ImmutableSet.copyOf(allColumns)))); + } + + List sortedBy = bucketProperty.get().getSortedBy().stream() + .map(SortingColumn::getColumnName) + .collect(toImmutableList()); + if (!allColumns.containsAll(sortedBy)) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("Sorting columns %s not present in schema", Sets.difference(ImmutableSet.copyOf(sortedBy), ImmutableSet.copyOf(allColumns)))); + } + } + + private static void validatePartitionColumns(ConnectorTableMetadata tableMetadata) + { + List partitionedBy = getPartitionedBy(tableMetadata.getProperties()); + + List allColumns = tableMetadata.getColumns().stream() + .map(ColumnMetadata::getName) + .collect(toList()); + + if (!allColumns.containsAll(partitionedBy)) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("Partition columns %s not present in schema", Sets.difference(ImmutableSet.copyOf(partitionedBy), ImmutableSet.copyOf(allColumns)))); + } + + if (allColumns.size() == partitionedBy.size()) { + throw new PrestoException(INVALID_TABLE_PROPERTY, "Table contains only partition columns"); + } + + if (!allColumns.subList(allColumns.size() - partitionedBy.size(), allColumns.size()).equals(partitionedBy)) { + throw new PrestoException(HiveErrorCode.HIVE_COLUMN_ORDER_MISMATCH, "Partition keys must be the last columns in the table and in the same order as the table properties: " + partitionedBy); + } + } + + protected List getColumnHandles(ConnectorTableMetadata tableMetadata, Set partitionColumnNames, TypeTranslator typeTranslator) + { + validatePartitionColumns(tableMetadata); + validateBucketColumns(tableMetadata); + validateCsvColumns(tableMetadata); + + ImmutableList.Builder columnHandles = ImmutableList.builder(); + int ordinal = 0; + for (ColumnMetadata column : tableMetadata.getColumns()) { + HiveColumnHandle.ColumnType columnType; + if (partitionColumnNames.contains(column.getName())) { + columnType = HiveColumnHandle.ColumnType.PARTITION_KEY; + } + else if (column.isHidden()) { + columnType = HiveColumnHandle.ColumnType.SYNTHESIZED; + } + else { + columnType = HiveColumnHandle.ColumnType.REGULAR; + } + columnHandles.add(new HiveColumnHandle( + column.getName(), + HiveType.toHiveType(typeTranslator, column.getType()), + column.getType().getTypeSignature(), + ordinal, + columnType, + Optional.ofNullable(column.getComment()))); + ordinal++; + } + + return columnHandles.build(); + } + + protected void validateCsvColumns(ConnectorTableMetadata tableMetadata) + { + if (HiveTableProperties.getHiveStorageFormat(tableMetadata.getProperties()) != HiveStorageFormat.CSV) { + return; + } + + Set partitionedBy = ImmutableSet.copyOf(getPartitionedBy(tableMetadata.getProperties())); + List unsupportedColumns = tableMetadata.getColumns().stream() + .filter(columnMetadata -> !partitionedBy.contains(columnMetadata.getName())) + .filter(columnMetadata -> !columnMetadata.getType().equals(createUnboundedVarcharType())) + .collect(toImmutableList()); + + if 
(!unsupportedColumns.isEmpty()) { + String joinedUnsupportedColumns = unsupportedColumns.stream() + .map(columnMetadata -> format("%s %s", columnMetadata.getName(), columnMetadata.getType())) + .collect(joining(", ")); + throw new PrestoException(NOT_SUPPORTED, "Hive CSV storage format only supports VARCHAR (unbounded). Unsupported columns: " + joinedUnsupportedColumns); + } + } + + protected static Function columnMetadataGetter(Table table, TypeManager typeManager) + { + ImmutableList.Builder columnNames = ImmutableList.builder(); + table.getPartitionColumns().stream().map(Column::getName).forEach(columnNames::add); + table.getDataColumns().stream().map(Column::getName).forEach(columnNames::add); + List allColumnNames = columnNames.build(); + if (allColumnNames.size() > Sets.newHashSet(allColumnNames).size()) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, + format("Hive metadata for table %s is invalid: Table descriptor contains duplicate columns", table.getTableName())); + } + + List tableColumns = table.getDataColumns(); + ImmutableMap.Builder> builder = ImmutableMap.builder(); + for (Column field : concat(tableColumns, table.getPartitionColumns())) { + if (field.getComment().isPresent() && !field.getComment().get().equals("from deserializer")) { + builder.put(field.getName(), field.getComment()); + } + else { + builder.put(field.getName(), Optional.empty()); + } + } + // add hidden columns + builder.put(HiveColumnHandle.PATH_COLUMN_NAME, Optional.empty()); + if (table.getStorage().getBucketProperty().isPresent()) { + builder.put(HiveColumnHandle.BUCKET_COLUMN_NAME, Optional.empty()); + } + + Map> columnComment = builder.build(); + + return handle -> new ColumnMetadata( + handle.getName(), + typeManager.getType(handle.getTypeSignature()), + true, + columnComment.get(handle.getName()).orElse(null), + columnExtraInfo(handle.isPartitionKey()), + handle.isHidden(), + emptyMap(), + handle.isRequired()); + } + + @Override + public void rollback() + { + metastore.rollback(); + } + + @Override + public void commit() + { + metastore.commit(); + metastore.submitCleanupTasks(); + } + + @Override + public void beginQuery(ConnectorSession session) + { + metastore.beginQuery(session); + } + + @Override + public void cleanupQuery(ConnectorSession session) + { + metastore.cleanupQuery(session); + } + + public static Optional getSourceTableNameFromSystemTable(SchemaTableName tableName) + { + return Stream.of(SystemTableHandler.values()) + .filter(handler -> handler.matches(tableName)) + .map(handler -> handler.getSourceTableName(tableName)) + .findAny(); + } + + private static SystemTable createSystemTable(ConnectorTableMetadata metadata, Function, RecordCursor> cursor) + { + return new SystemTable() + { + @Override + public Distribution getDistribution() + { + return Distribution.SINGLE_COORDINATOR; + } + + @Override + public ConnectorTableMetadata getTableMetadata() + { + return metadata; + } + + @Override + public RecordCursor cursor(ConnectorTransactionHandle transactionHandle, ConnectorSession session, TupleDomain constraint) + { + return cursor.apply(constraint); + } + }; + } + + private enum SystemTableHandler + { + PARTITIONS, PROPERTIES; + + private final String suffix; + + SystemTableHandler() + { + this.suffix = "$" + name().toLowerCase(ENGLISH); + } + + boolean matches(SchemaTableName table) + { + return table.getTableName().endsWith(suffix) && + (table.getTableName().length() > suffix.length()); + } + + SchemaTableName getSourceTableName(SchemaTableName table) + { + 
return new SchemaTableName( + table.getSchemaName(), + table.getTableName().substring(0, table.getTableName().length() - suffix.length())); + } + } + + private static Optional firstNonNullable(T... values) + { + for (T value : values) { + if (value != null) { + return Optional.of(value); + } + } + return Optional.empty(); + } + + /** + * Presto can only cache execution plans for supported connectors. + * This method overrides {@link ConnectorMetadata} returns true to indicate + * execution plan caching is enabled for Hive connectors. + * + * @param session Presto session + * @param handle Connector specific table handle + */ + @Override + public boolean isExecutionPlanCacheSupported(ConnectorSession session, ConnectorTableHandle handle) + { + return true; + } + + @Override + public boolean isSnapshotSupportedAsInput(ConnectorSession session, ConnectorTableHandle handle) + { + return true; + } + + @Override + public boolean isSnapshotSupportedAsOutput(ConnectorSession session, ConnectorTableHandle handle) + { + HiveTableHandle tableHandle = (HiveTableHandle) handle; + Optional
table = metastore.getTable( + new HiveIdentity(session), + tableHandle.getSchemaName(), + tableHandle.getTableName()); + return table.isPresent() && HiveMetadata.extractHiveStorageFormat(table.get()) == ORC; + } + + @Override + public boolean isSnapshotSupportedAsNewTable(ConnectorSession session, Map tableProperties) + { + return getHiveStorageFormat(tableProperties) == ORC; + } + + @Override + public void resetInsertForRerun(ConnectorSession session, ConnectorInsertTableHandle tableHandle, OptionalLong snapshotIndex) + { + updateSnapshotFiles(session, (HiveWritableTableHandle) tableHandle, true, null, snapshotIndex); + } + + @Override + public void resetCreateForRerun(ConnectorSession session, ConnectorOutputTableHandle tableHandle, OptionalLong snapshotIndex) + { + updateSnapshotFiles(session, (HiveWritableTableHandle) tableHandle, true, null, snapshotIndex); + } + + // Visit table writePath recursively. + // resume=true: (for rerun) remove all files with snapshot identifier + // resume=false: (for finish) remove sub-files; rename merged-files + private void updateSnapshotFiles(ConnectorSession session, HiveWritableTableHandle tableHandle, boolean resume, Set mergedFileNames, OptionalLong snapshotIndex) + { + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem( + new HdfsContext(session, tableHandle.getSchemaName(), tableHandle.getTableName()), + tableHandle.getLocationHandle().getWritePath()); + updatePreviousFiles(fileSystem, tableHandle.getLocationHandle().getWritePath(), session.getQueryId(), resume, mergedFileNames, snapshotIndex.orElse(0)); + } + catch (IOException e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, "Failed to update Hive files for " + tableHandle.getSchemaName() + "." + tableHandle.getTableName(), e); + } + } + + private void updatePreviousFiles(FileSystem fileSystem, Path folder, String queryId, boolean resume, Set mergedFileNames, long snapshotIndex) + throws IOException + { + if (fileSystem.exists(folder)) { + for (FileStatus status : fileSystem.listStatus(folder)) { + if (status.isDirectory()) { + updatePreviousFiles(fileSystem, status.getPath(), queryId, resume, mergedFileNames, snapshotIndex); + } + else if (isSnapshotFile(status.getPath().getName(), queryId)) { + String fileName = status.getPath().getName(); + if (resume) { + long subFileIndex = getSnapshotSubFileIndex(fileName, queryId); + // Remove any merged files and subfiles that are after the snapshot being resumed to + if (subFileIndex < 0 || subFileIndex >= snapshotIndex) { + log.debug("Deleting file resume=true: %s", fileName); + fileSystem.delete(status.getPath()); + } + } + else { + if (isSnapshotSubFile(fileName, queryId)) { + log.debug("Deleting sub file resume=false: %s", fileName); + fileSystem.delete(status.getPath()); + } + else { + // Rename snapshot-altered file names (_snapshot_) to original name, if: + // For normal tables, the file is part of the output file list (mergedFileNames) + // For transaqctional tables, the file's parent folder is part of the output file list + if (mergedFileNames.contains(fileName) || mergedFileNames.contains(status.getPath().getParent().getName())) { + String newName = removeSnapshotFileName(fileName, queryId); + log.debug("Renaming merged file resume=false: %s to %s", fileName, newName); + fileSystem.rename(status.getPath(), new Path(folder, newName)); + } + else { + // Remove files that are not part of the final output files. (e.g. those produced by abandoned tasks.) 
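+ // Such files carry this query's snapshot marker but were never reported in mergedFileNames,
+ // so they are leftovers that can be removed safely.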
+ log.debug("Deleting old merged file resume=false: %s", fileName); + fileSystem.delete(status.getPath()); + } + } + } + } + } + + Path acidVersionFile = AcidUtils.OrcAcidVersion.getVersionFilePath(folder); + if (fileSystem.exists(acidVersionFile) && fileSystem.getFileStatus(acidVersionFile).getLen() == 0) { + // For some reason, an empty _orc_acid_version file was created. Recreate it. + fileSystem.delete(acidVersionFile); + AcidUtils.OrcAcidVersion.writeVersionFile(folder, fileSystem); + } + } + } + + private Set collectMergedFileNames(List partitionUpdates) + { + return partitionUpdates.stream().flatMap(update -> update.getFileNames().stream()).collect(toSet()); + } + + private List updateSnapshotFileNames(List partitionUpdates, String queryId) + { + ImmutableList.Builder builder = ImmutableList.builder(); + for (PartitionUpdate partitionUpdate : partitionUpdates) { + builder.add(new PartitionUpdate( + partitionUpdate.getName(), + partitionUpdate.getUpdateMode(), + partitionUpdate.getWritePath(), + partitionUpdate.getTargetPath(), + updateSnapshotFileName(partitionUpdate.getFileNames(), queryId), + partitionUpdate.getRowCount(), + partitionUpdate.getInMemoryDataSizeInBytes(), + partitionUpdate.getOnDiskDataSizeInBytes(), + partitionUpdate.getMiscData())); + } + return builder.build(); + } + + private List updateSnapshotFileName(List fileNames, String queryId) + { + return fileNames.stream().map(name -> HiveWriterFactory.removeSnapshotFileName(name, queryId)).collect(Collectors.toList()); + } + + protected void finishInsertOverwrite(ConnectorSession session, HiveInsertTableHandle handle, Table table, PartitionUpdate partitionUpdate, PartitionStatistics partitionStatistics) + { + PrincipalPrivileges principalPrivileges = PrincipalPrivileges.fromHivePrivilegeInfos(metastore.listTablePrivileges(handle.getSchemaName(), handle.getTableName(), null)); + // first drop it + metastore.dropTable(session, handle.getSchemaName(), handle.getTableName()); + + // create the table with the new location + metastore.createTable(session, table, principalPrivileges, Optional.of(partitionUpdate.getWritePath()), false, partitionStatistics); + } + + protected void finishInsertInNewPartition(ConnectorSession session, HiveInsertTableHandle handle, Table table, Map columnTypes, PartitionUpdate partitionUpdate, Map, ComputedStatistics> partitionComputedStatistics, HiveACIDWriteType acidWriteType) + { + // insert into new partition or overwrite existing partition + Partition partition = buildPartitionObject(session, table, partitionUpdate); + if (!partition.getStorage().getStorageFormat().getInputFormat().equals(handle.getPartitionStorageFormat().getInputFormat()) && HiveSessionProperties.isRespectTableFormat(session)) { + throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Partition format changed during insert"); + } + if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.OVERWRITE) { + metastore.dropPartition(session, handle.getSchemaName(), handle.getTableName(), partition.getValues()); + } + PartitionStatistics partitionStatistics = createPartitionStatistics( + session, + partitionUpdate.getStatistics(), + columnTypes, + getColumnStatistics(partitionComputedStatistics, partition.getValues())); + metastore.addPartition(session, handle.getSchemaName(), handle.getTableName(), partition, partitionUpdate.getWritePath(), partitionStatistics, acidWriteType); + } + + public void setExternalTable(boolean externalTable) + { + this.externalTable = externalTable; + } + + protected 
void verifyStorageFormatForCatalog(StorageFormat storageFormat) + { + requireNonNull(storageFormat, "Storage format is null"); + + if (storageFormat.getInputFormat().contains("CarbonInputFormat")) { + String sf = storageFormat.getInputFormat(); + throw new PrestoException(NOT_SUPPORTED, + String.format("Tables with %s are not supported by Hive connector", sf.substring(sf.lastIndexOf(".") + 1))); + } + } + + @Override + public List getTablesForVacuum() + { + if (autoVacuumEnabled) { + return VacuumEligibleTableCollector.getVacuumTableList(metastore, hdfsEnvironment, + vacuumDeltaNumThreshold, vacuumDeltaPercentThreshold, vacuumExecutorService, vacuumCollectorInterval); + } + return null; + } + + @Override + public PartialAndFinalAggregationType validateAndGetSortAggregationType(ConnectorSession session, ConnectorTableHandle tableHandle, List groupKeyNames) + { + PartialAndFinalAggregationType partialAndFinalAggregationType = new PartialAndFinalAggregationType(); + ConnectorTableMetadata connectorTableMetadata = getTableMetadata(session, ((HiveTableHandle) tableHandle).getSchemaTableName()); + List sortingColumn = (List) connectorTableMetadata.getProperties().get(HiveTableProperties.SORTED_BY_PROPERTY); + boolean isSortingColumnsNotPresent = (sortingColumn == null) || (sortingColumn.size() == 0); + + List partitionedBy = new ArrayList<>(); + List partitionedByTemp = (List) connectorTableMetadata.getProperties().get(HiveTableProperties.PARTITIONED_BY_PROPERTY); + if ((partitionedByTemp != null) && (partitionedByTemp.size() != 0)) { + partitionedBy.addAll(partitionedByTemp); + if (isSortingColumnsNotPresent && (partitionedByTemp.size() != groupKeyNames.size())) { + return partialAndFinalAggregationType; + } + } + else if (isSortingColumnsNotPresent) { + return partialAndFinalAggregationType; + } + int bucketCount = 0; + List bucketedColumns = new ArrayList<>(); + if (!isSortingColumnsNotPresent) { + bucketedColumns.addAll((List) connectorTableMetadata.getProperties().get(HiveTableProperties.BUCKETED_BY_PROPERTY)); + if (null != bucketedColumns) { + bucketCount = (int) connectorTableMetadata.getProperties().get(HiveTableProperties.BUCKET_COUNT_PROPERTY); + } + } + + List sortedColumnNames = new ArrayList<>(); + if ((sortingColumn != null) && (sortingColumn.size() != 0)) { + sortedColumnNames.addAll(sortingColumn.stream().map(column -> column.getColumnName()).collect(Collectors.toList())); + } + + //grouping key should be sub set of sorted By and it should match all partition by columns + if ((partitionedBy.size() + sortedColumnNames.size() < groupKeyNames.size()) || + (partitionedBy.size() > groupKeyNames.size())) { + //sorted columns are less than join criteria columns + log.debug("number of sorted columns " + sortedColumnNames.size() + "are less join column size " + groupKeyNames.size()); + return partialAndFinalAggregationType; + } + + // bucketby columns and groupby Columns should be same. + // or when bucket count should be 1 and bucket column that matches with groupBy + // or when bucket count is 0 no need to compare buckets + int partitionedByCount = partitionedBy.size() == 0 ? 
0 : partitionedBy.size() - 1; + boolean singleOrZeroBucketedColumn = (((bucketCount == 1) && (bucketedColumns.size() == 1) && + (groupKeyNames.get(partitionedByCount).equals(bucketedColumns.get(0)))) || (bucketCount == 0)); + + if ((bucketCount == 1) && (bucketedColumns.size() > 1)) { + int minSize = Math.min(groupKeyNames.size() - partitionedBy.size(), bucketedColumns.size()); + int partSize = partitionedBy.size(); + for (int keyIdx = 0; keyIdx < minSize; keyIdx++) { + if (!groupKeyNames.get(keyIdx + partSize).equals(bucketedColumns.get(keyIdx))) { + return partialAndFinalAggregationType; + } + } + singleOrZeroBucketedColumn = true; + } + + for (int numOfComparedKeys = 0; numOfComparedKeys < partitionedBy.size(); numOfComparedKeys++) { + if ((!groupKeyNames.get(numOfComparedKeys).equals(partitionedBy.get(numOfComparedKeys)))) { + return partialAndFinalAggregationType; + } + } + + if (groupKeyNames.size() == partitionedBy.size()) { + partialAndFinalAggregationType.setPartialAsSortAndFinalAsHashAggregation(true); + return partialAndFinalAggregationType; + } + + if (singleOrZeroBucketedColumn || (groupKeyNames.size() == (bucketedColumns.size() + partitionedBy.size()))) { + int numOfCmpKeysAfterPartitionedBy = partitionedBy.size(); + for (int numOfComparedKeys = 0; numOfComparedKeys < groupKeyNames.size() - partitionedBy.size(); numOfComparedKeys++, numOfCmpKeysAfterPartitionedBy++) { + boolean bucketedColumnsResult = !singleOrZeroBucketedColumn && (!groupKeyNames.get(numOfComparedKeys).equals(bucketedColumns.get(numOfComparedKeys))); + if ((!groupKeyNames.get(numOfCmpKeysAfterPartitionedBy).equals(sortedColumnNames.get(numOfComparedKeys))) || + (!singleOrZeroBucketedColumn && bucketedColumnsResult)) { + if (log.isDebugEnabled()) { + final String[] dbgGroupKeyNames = {new String("")}; + groupKeyNames.stream().forEach(k -> dbgGroupKeyNames[0] = dbgGroupKeyNames[0].concat(k + " , ")); + final String[] dbgSortedColumnNames = {new String("")}; + sortedColumnNames.stream().forEach(k -> dbgSortedColumnNames[0] = dbgSortedColumnNames[0].concat(k + " , ")); + if ((null != bucketedColumns) && (bucketedColumns.size() > 0)) { + final String[] dbgbucketedColumns = {new String("")}; + bucketedColumns.stream().forEach(k -> dbgbucketedColumns[0] = dbgbucketedColumns[0].concat(k + " , ")); + log.debug("Not matching sortedColumnNames: " + dbgSortedColumnNames + " group columns name: " + dbgGroupKeyNames + " bucketedColumns :" + dbgbucketedColumns); + } + log.debug("Not matching sortedColumnNames: " + dbgSortedColumnNames + " group columns name: " + dbgGroupKeyNames); + } + return partialAndFinalAggregationType; + } + } + partialAndFinalAggregationType.setSortAggregation(true); + return partialAndFinalAggregationType; + } + return partialAndFinalAggregationType; + } + + @Override + public void refreshMetadataCache() + { + metastore.refreshMetastoreCache(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadataFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadataFactory.java new file mode 100644 index 00000000..72e5b5b6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetadataFactory.java @@ -0,0 +1,216 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.airlift.concurrent.BoundedExecutor; +import io.airlift.json.JsonCodec; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.metastore.CachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.security.AccessControlMetadataFactory; +import io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider; +import io.prestosql.plugin.hive.statistics.TableColumnStatistics; +import io.prestosql.spi.type.TypeManager; + +import javax.inject.Inject; + +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; +import java.util.function.Supplier; + +import static java.util.Objects.requireNonNull; + +public class HiveMetadataFactory + implements Supplier +{ + protected final Map statsCache = new ConcurrentHashMap(); + protected final Map samplePartitionCache = new ConcurrentHashMap(); + + private final boolean skipDeletionForAlter; + private final boolean skipTargetCleanupOnRollback; + private final boolean writesToNonManagedTablesEnabled; + private final boolean createsOfNonManagedTablesEnabled; + private final boolean tableCreatesWithLocationAllowed; + private final long perTransactionCacheMaximumSize; + private final HiveMetastore metastore; + private final HdfsEnvironment hdfsEnvironment; + private final HivePartitionManager partitionManager; + private final TypeManager typeManager; + private final LocationService locationService; + private final JsonCodec partitionUpdateCodec; + private final BoundedExecutor renameExecution; + private final ScheduledExecutorService hiveVacuumService; + private final TypeTranslator typeTranslator; + private final String prestoVersion; + private final AccessControlMetadataFactory accessControlMetadataFactory; + private final Optional hiveTransactionHeartbeatInterval; + private final ScheduledExecutorService heartbeatService; + private final ScheduledExecutorService hiveMetastoreClientService; + private final Duration vacuumCleanupRecheckInterval; + private final int vacuumDeltaNumThreshold; + private final double vacuumDeltaPercentThreshold; + private final boolean autoVacuumEnabled; + private Optional vacuumCollectorInterval; + protected final int hmsWriteBatchSize; + + @Inject + @SuppressWarnings("deprecation") + public HiveMetadataFactory( + HiveConfig hiveConfig, + HiveMetastore metastore, + HdfsEnvironment hdfsEnvironment, + HivePartitionManager partitionManager, + @ForHive ExecutorService executorService, + @ForHiveVacuum ScheduledExecutorService hiveVacuumService, + @ForHiveMetastore ScheduledExecutorService hiveMetastoreClientService, + @ForHiveTransactionHeartbeats ScheduledExecutorService heartbeatService, + TypeManager typeManager, + LocationService locationService, + JsonCodec partitionUpdateCodec, + TypeTranslator typeTranslator, + NodeVersion 
nodeVersion, + AccessControlMetadataFactory accessControlMetadataFactory) + { + this( + metastore, + hdfsEnvironment, + partitionManager, + hiveConfig.getMaxConcurrentFileRenames(), + hiveConfig.isSkipDeletionForAlter(), + hiveConfig.isSkipTargetCleanupOnRollback(), + hiveConfig.getWritesToNonManagedTablesEnabled(), + hiveConfig.getCreatesOfNonManagedTablesEnabled(), + hiveConfig.getTableCreatesWithLocationAllowed(), + hiveConfig.getPerTransactionMetastoreCacheMaximumSize(), + hiveConfig.getHiveTransactionHeartbeatInterval(), + hiveConfig.getVacuumCleanupRecheckInterval(), + typeManager, + locationService, + partitionUpdateCodec, + executorService, + hiveVacuumService, + heartbeatService, + hiveMetastoreClientService, + typeTranslator, + nodeVersion.toString(), + accessControlMetadataFactory, + hiveConfig.getVacuumDeltaNumThreshold(), + hiveConfig.getVacuumDeltaPercentThreshold(), + hiveConfig.getAutoVacuumEnabled(), + hiveConfig.getVacuumCollectorInterval(), + hiveConfig.getMetastoreWriteBatchSize()); + } + + public HiveMetadataFactory( + HiveMetastore metastore, + HdfsEnvironment hdfsEnvironment, + HivePartitionManager partitionManager, + int maxConcurrentFileRenames, + boolean skipDeletionForAlter, + boolean skipTargetCleanupOnRollback, + boolean writesToNonManagedTablesEnabled, + boolean createsOfNonManagedTablesEnabled, + boolean tableCreatesWithLocationAllowed, + long perTransactionCacheMaximumSize, + Optional hiveTransactionHeartbeatInterval, + Duration vacuumCleanupRecheckInterval, + TypeManager typeManager, + LocationService locationService, + JsonCodec partitionUpdateCodec, + ExecutorService executorService, + ScheduledExecutorService hiveVacuumService, + ScheduledExecutorService heartbeatService, + ScheduledExecutorService hiveMetastoreClientService, + TypeTranslator typeTranslator, + String prestoVersion, + AccessControlMetadataFactory accessControlMetadataFactory, + int vacuumDeltaNumThreshold, + double vacuumDeltaPercentThreshold, + boolean autoVacuumEnabled, + Optional vacuumCollectorInterval, + int hmsWriteBatchSize) + { + this.skipDeletionForAlter = skipDeletionForAlter; + this.skipTargetCleanupOnRollback = skipTargetCleanupOnRollback; + this.writesToNonManagedTablesEnabled = writesToNonManagedTablesEnabled; + this.createsOfNonManagedTablesEnabled = createsOfNonManagedTablesEnabled; + this.tableCreatesWithLocationAllowed = tableCreatesWithLocationAllowed; + this.perTransactionCacheMaximumSize = perTransactionCacheMaximumSize; + + this.metastore = requireNonNull(metastore, "metastore is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.partitionManager = requireNonNull(partitionManager, "partitionManager is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.locationService = requireNonNull(locationService, "locationService is null"); + this.partitionUpdateCodec = requireNonNull(partitionUpdateCodec, "partitionUpdateCodec is null"); + this.typeTranslator = requireNonNull(typeTranslator, "typeTranslator is null"); + this.prestoVersion = requireNonNull(prestoVersion, "prestoVersion is null"); + this.accessControlMetadataFactory = requireNonNull(accessControlMetadataFactory, "accessControlMetadataFactory is null"); + this.hiveTransactionHeartbeatInterval = requireNonNull(hiveTransactionHeartbeatInterval, "hiveTransactionHeartbeatInterval is null"); + this.vacuumCleanupRecheckInterval = requireNonNull(vacuumCleanupRecheckInterval, "vacuumCleanupInterval is null"); + + renameExecution = new 
BoundedExecutor(executorService, maxConcurrentFileRenames); + this.hiveVacuumService = requireNonNull(hiveVacuumService, "hiveVacuumService is null"); + this.heartbeatService = requireNonNull(heartbeatService, "heartbeatService is null"); + this.hiveMetastoreClientService = requireNonNull(hiveMetastoreClientService, "heartbeatService is null"); + this.vacuumDeltaNumThreshold = vacuumDeltaNumThreshold; + this.vacuumDeltaPercentThreshold = vacuumDeltaPercentThreshold; + this.autoVacuumEnabled = autoVacuumEnabled; + this.vacuumCollectorInterval = vacuumCollectorInterval; + this.hmsWriteBatchSize = hmsWriteBatchSize; + } + + @Override + public HiveMetadata get() + { + SemiTransactionalHiveMetastore metastore = new SemiTransactionalHiveMetastore( + hdfsEnvironment, + CachingHiveMetastore.memoizeMetastore(this.metastore, perTransactionCacheMaximumSize), // per-transaction cache + renameExecution, + hiveVacuumService, + vacuumCleanupRecheckInterval, + skipDeletionForAlter, + skipTargetCleanupOnRollback, + hiveTransactionHeartbeatInterval, + heartbeatService, + hiveMetastoreClientService, + hmsWriteBatchSize); + + return new HiveMetadata( + metastore, + hdfsEnvironment, + partitionManager, + writesToNonManagedTablesEnabled, + createsOfNonManagedTablesEnabled, + tableCreatesWithLocationAllowed, + typeManager, + locationService, + partitionUpdateCodec, + typeTranslator, + prestoVersion, + new MetastoreHiveStatisticsProvider(metastore, statsCache, samplePartitionCache), + accessControlMetadataFactory.create(metastore), + autoVacuumEnabled, + vacuumDeltaNumThreshold, + vacuumDeltaPercentThreshold, + hiveVacuumService, + vacuumCollectorInterval, + hiveMetastoreClientService); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetastoreClosure.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetastoreClosure.java new file mode 100644 index 00000000..a725f8d6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveMetastoreClosure.java @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Maps.immutableEntry; +import static io.prestosql.plugin.hive.HivePartitionManager.extractPartitionValues; +import static java.util.Objects.requireNonNull; + +public class HiveMetastoreClosure +{ + private final HiveMetastore delegate; + + public HiveMetastoreClosure(HiveMetastore delegate) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + public Table getExistingTable(HiveIdentity identity, String databaseName, String tableName) + { + return getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + } + + public Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + return delegate.getTable(identity, databaseName, tableName); + } + + public PartitionStatistics getTableStatistics(HiveIdentity identity, String databaseName, String tableName) + { + return delegate.getTableStatistics(identity, getExistingTable(identity, databaseName, tableName)); + } + + public Map getPartitionStatistics(HiveIdentity identity, String databaseName, String tableName, Set partitionNames) + { + Table table = getExistingTable(identity, databaseName, tableName); + List partitions = getExistingPartitionsByNames(identity, table, ImmutableList.copyOf(partitionNames)); + return delegate.getPartitionStatistics(identity, table, partitions); + } + + private List getExistingPartitionsByNames(HiveIdentity identity, Table table, List partitionNames) + { + Map partitions = delegate.getPartitionsByNames(identity, table.getDatabaseName(), table.getTableName(), partitionNames).entrySet().stream() + .map(entry -> immutableEntry(entry.getKey(), entry.getValue().orElseThrow(() -> + new PartitionNotFoundException(table.getSchemaTableName(), extractPartitionValues(entry.getKey()))))) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); + + return partitionNames.stream() + .map(partitions::get) + .collect(toImmutableList()); + } + + public void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + delegate.updateTableStatistics(identity, databaseName, tableName, update); + } + + public void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + delegate.updatePartitionStatistics(identity, databaseName, tableName, partitionName, update); + } + + public void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + delegate.addPartitions(identity, databaseName, tableName, partitions); + } + + public void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + delegate.dropPartition(identity, databaseName, tableName, parts, deleteData); + } + + public void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition) + { + delegate.alterPartition(identity, databaseName, tableName, partition); + } + + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + return delegate.getPartition(identity, databaseName, tableName, partitionValues); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveModule.java new file mode 100644 index 00000000..a2bf6fee --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveModule.java @@ -0,0 +1,230 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.cache.CacheLoader; +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Provides; +import com.google.inject.Scopes; +import com.google.inject.TypeLiteral; +import com.google.inject.multibindings.Multibinder; +import io.airlift.concurrent.BoundedExecutor; +import io.airlift.event.client.EventClient; +import io.prestosql.orc.BloomFilterCacheStatsLister; +import io.prestosql.orc.FileTailCacheStatsLister; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.orc.RowDataCacheStatsLister; +import io.prestosql.orc.RowIndexCacheStatsLister; +import io.prestosql.orc.StripeFooterCacheStatsLister; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.omnidata.OmniDataNodeManager; +import io.prestosql.plugin.hive.orc.OrcPageSourceFactory; +import io.prestosql.plugin.hive.orc.OrcSelectivePageSourceFactory; +import io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory; +import io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory; +import io.prestosql.plugin.hive.rule.HivePlanOptimizerProvider; +import io.prestosql.plugin.hive.s3.PrestoS3ClientFactory; +import io.prestosql.plugin.hive.util.IndexCache; +import io.prestosql.plugin.hive.util.IndexCacheLoader; +import io.prestosql.spi.connector.ConnectorNodePartitioningProvider; +import io.prestosql.spi.connector.ConnectorPageSinkProvider; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorPlanOptimizerProvider; +import io.prestosql.spi.connector.ConnectorSplitManager; + +import javax.inject.Singleton; + +import java.time.Duration; +import java.util.concurrent.Executor; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; +import java.util.function.Function; +import java.util.function.Supplier; + +import static com.google.inject.multibindings.Multibinder.newSetBinder; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.configuration.ConfigBinder.configBinder; +import static io.airlift.json.JsonCodecBinder.jsonCodecBinder; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static java.util.concurrent.Executors.newScheduledThreadPool; +import static org.weakref.jmx.guice.ExportBinder.newExporter; + +public class HiveModule + implements Module +{ + @Override + public void configure(Binder binder) + { + binder.bind(TypeTranslator.class).toInstance(new HiveTypeTranslator()); + binder.bind(CoercionPolicy.class).to(HiveCoercionPolicy.class).in(Scopes.SINGLETON); + + binder.bind(HdfsConfigurationInitializer.class).in(Scopes.SINGLETON); + newSetBinder(binder, DynamicConfigurationProvider.class); + binder.bind(HdfsConfiguration.class).to(HiveHdfsConfiguration.class).in(Scopes.SINGLETON); + binder.bind(HdfsEnvironment.class).in(Scopes.SINGLETON); + binder.bind(DirectoryLister.class).to(CachingDirectoryLister.class).in(Scopes.SINGLETON); + configBinder(binder).bindConfig(HiveConfig.class); + + binder.bind(HiveSessionProperties.class).in(Scopes.SINGLETON); + binder.bind(HiveTableProperties.class).in(Scopes.SINGLETON); + binder.bind(HiveAnalyzeProperties.class).in(Scopes.SINGLETON); + + binder.bind(NamenodeStats.class).in(Scopes.SINGLETON); + newExporter(binder).export(NamenodeStats.class).withGeneratedName(); + + 
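+ // The S3 client factory and the caching directory lister are bound as singletons; CachingDirectoryLister is additionally exported through JMX (via newExporter) for monitoring.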
binder.bind(PrestoS3ClientFactory.class).in(Scopes.SINGLETON); + + binder.bind(CachingDirectoryLister.class).in(Scopes.SINGLETON); + newExporter(binder).export(CachingDirectoryLister.class).withGeneratedName(); + + Multibinder recordCursorProviderBinder = newSetBinder(binder, HiveRecordCursorProvider.class); + recordCursorProviderBinder.addBinding().to(S3SelectRecordCursorProvider.class).in(Scopes.SINGLETON); + recordCursorProviderBinder.addBinding().to(GenericHiveRecordCursorProvider.class).in(Scopes.SINGLETON); + + binder.bind(HiveWriterStats.class).in(Scopes.SINGLETON); + newExporter(binder).export(HiveWriterStats.class).withGeneratedName(); + + binder.bind(OmniDataNodeManager.class).in(Scopes.SINGLETON); + + newSetBinder(binder, EventClient.class).addBinding().to(HiveEventClient.class).in(Scopes.SINGLETON); + binder.bind(HivePartitionManager.class).in(Scopes.SINGLETON); + binder.bind(LocationService.class).to(HiveLocationService.class).in(Scopes.SINGLETON); + binder.bind(HiveMetadataFactory.class).in(Scopes.SINGLETON); + binder.bind(new TypeLiteral>() {}).to(HiveMetadataFactory.class).in(Scopes.SINGLETON); + binder.bind(HiveTransactionManager.class).in(Scopes.SINGLETON); + binder.bind(ConnectorSplitManager.class).to(HiveSplitManager.class).in(Scopes.SINGLETON); + newExporter(binder).export(ConnectorSplitManager.class).as(generator -> generator.generatedNameOf(HiveSplitManager.class)); + binder.bind(ConnectorPageSourceProvider.class).to(HivePageSourceProvider.class).in(Scopes.SINGLETON); + binder.bind(ConnectorPageSinkProvider.class).to(HivePageSinkProvider.class).in(Scopes.SINGLETON); + binder.bind(ConnectorNodePartitioningProvider.class).to(HiveNodePartitioningProvider.class).in(Scopes.SINGLETON); + binder.bind(ConnectorPlanOptimizerProvider.class).to(HivePlanOptimizerProvider.class).in(Scopes.SINGLETON); + + jsonCodecBinder(binder).bindJsonCodec(PartitionUpdate.class); + + binder.bind(FileFormatDataSourceStats.class).in(Scopes.SINGLETON); + newExporter(binder).export(FileFormatDataSourceStats.class).withGeneratedName(); + + Multibinder pageSourceFactoryBinder = newSetBinder(binder, HivePageSourceFactory.class); + pageSourceFactoryBinder.addBinding().to(OrcPageSourceFactory.class).in(Scopes.SINGLETON); + pageSourceFactoryBinder.addBinding().to(ParquetPageSourceFactory.class).in(Scopes.SINGLETON); + pageSourceFactoryBinder.addBinding().to(RcFilePageSourceFactory.class).in(Scopes.SINGLETON); + + Multibinder selectivePageSourceFactoryBinder = newSetBinder(binder, HiveSelectivePageSourceFactory.class); + selectivePageSourceFactoryBinder.addBinding().to(OrcSelectivePageSourceFactory.class).in(Scopes.SINGLETON); + + Multibinder fileWriterFactoryBinder = newSetBinder(binder, HiveFileWriterFactory.class); + binder.bind(OrcFileWriterFactory.class).in(Scopes.SINGLETON); + newExporter(binder).export(OrcFileWriterFactory.class).withGeneratedName(); + configBinder(binder).bindConfig(OrcFileWriterConfig.class); + fileWriterFactoryBinder.addBinding().to(OrcFileWriterFactory.class).in(Scopes.SINGLETON); + fileWriterFactoryBinder.addBinding().to(RcFileFileWriterFactory.class).in(Scopes.SINGLETON); + + configBinder(binder).bindConfig(ParquetFileWriterConfig.class); + + binder.bind(CacheLoader.class).to(IndexCacheLoader.class).in(Scopes.SINGLETON); + binder.bind(IndexCache.class).in(Scopes.SINGLETON); + + binder.bind(FileTailCacheStatsLister.class).in(Scopes.SINGLETON); + newExporter(binder).export(FileTailCacheStatsLister.class).withGeneratedName(); + 
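+ // The remaining ORC cache statistics listers (stripe footer, row index, bloom filter, row data) are bound and JMX-exported in the same way as the file-tail lister above.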
binder.bind(StripeFooterCacheStatsLister.class).in(Scopes.SINGLETON); + newExporter(binder).export(StripeFooterCacheStatsLister.class).withGeneratedName(); + binder.bind(RowIndexCacheStatsLister.class).in(Scopes.SINGLETON); + newExporter(binder).export(RowIndexCacheStatsLister.class).withGeneratedName(); + binder.bind(BloomFilterCacheStatsLister.class).in(Scopes.SINGLETON); + newExporter(binder).export(BloomFilterCacheStatsLister.class).withGeneratedName(); + binder.bind(RowDataCacheStatsLister.class).in(Scopes.SINGLETON); + newExporter(binder).export(RowDataCacheStatsLister.class).withGeneratedName(); + } + + @ForHive + @Singleton + @Provides + public ExecutorService createHiveClientExecutor(HiveCatalogName catalogName) + { + return newCachedThreadPool(daemonThreadsNamed("hive-" + catalogName + "-%s")); + } + + @ForHiveVacuum + @Singleton + @Provides + public ScheduledExecutorService createHiveVacuumServiceExecutor(HiveCatalogName catalogName, HiveConfig hiveConfig) + { + return newScheduledThreadPool( + hiveConfig.getVacuumServiceThreads(), + daemonThreadsNamed("hive-vacuum-service-" + catalogName + "-%s")); + } + + @ForHiveMetastore + @Singleton + @Provides + public ScheduledExecutorService createHiveMetadataClientServiceExecutor(HiveCatalogName catalogName, HiveConfig hiveConfig) + { + return newScheduledThreadPool( + hiveConfig.getMetastoreClientServiceThreads(), + daemonThreadsNamed("hive-metastore-client-service-" + catalogName + "-%s")); + } + + @ForHiveTransactionHeartbeats + @Singleton + @Provides + public ScheduledExecutorService createHiveTransactionHeartbeatExecutor(HiveCatalogName catalogName, HiveConfig hiveConfig) + { + return newScheduledThreadPool( + hiveConfig.getHiveTransactionHeartbeatThreads(), + daemonThreadsNamed("hive-heartbeat-" + catalogName + "-%s")); + } + + @ForCachingHiveMetastore + @Singleton + @Provides + public Executor createCachingHiveMetastoreExecutor(HiveCatalogName catalogName, HiveConfig hiveConfig) + { + return new BoundedExecutor( + newCachedThreadPool(daemonThreadsNamed("hive-metastore-" + catalogName + "-%s")), + (int) Math.max(hiveConfig.getMaxMetastoreRefreshThreads() * 0.9, 9)); + } + + @ForCachingHiveMetastoreTableRefresh + @Singleton + @Provides + public Executor createCachingHiveMetastoreTableRefreshExecutor(HiveCatalogName catalogName, HiveConfig hiveConfig) + { + return new BoundedExecutor( + newCachedThreadPool(daemonThreadsNamed("hive-metastore-refresh-" + catalogName + "-%s")), + (int) Math.max(hiveConfig.getMaxMetastoreRefreshThreads() * 0.1, 1)); + } + + @Singleton + @Provides + public Function createMetastoreGetter(HiveTransactionManager transactionManager) + { + return transactionHandle -> ((HiveMetadata) transactionManager.get(transactionHandle)).getMetastore(); + } + + @Provides + @Singleton + public static OrcCacheStore getCacheStore(HiveConfig config) + { + return OrcCacheStore.builder().newCacheStore( + config.getOrcFileTailCacheLimit(), Duration.ofMillis(config.getOrcFileTailCacheTtl().toMillis()), + config.getOrcStripeFooterCacheLimit(), + Duration.ofMillis(config.getOrcStripeFooterCacheTtl().toMillis()), + config.getOrcRowIndexCacheLimit(), Duration.ofMillis(config.getOrcRowIndexCacheTtl().toMillis()), + config.getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(config.getOrcBloomFiltersCacheTtl().toMillis()), + config.getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(config.getOrcRowDataCacheTtl().toMillis()), + config.isOrcCacheStatsMetricCollectionEnabled()); + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNodePartitioningProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNodePartitioningProvider.java new file mode 100644 index 00000000..61d4ca0b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNodePartitioningProvider.java @@ -0,0 +1,73 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.BucketFunction; +import io.prestosql.spi.connector.ConnectorBucketNodeMap; +import io.prestosql.spi.connector.ConnectorNodePartitioningProvider; +import io.prestosql.spi.connector.ConnectorPartitionHandle; +import io.prestosql.spi.connector.ConnectorPartitioningHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.type.Type; + +import java.util.List; +import java.util.function.ToIntFunction; +import java.util.stream.IntStream; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.spi.connector.ConnectorBucketNodeMap.createBucketNodeMap; + +public class HiveNodePartitioningProvider + implements ConnectorNodePartitioningProvider +{ + @Override + public BucketFunction getBucketFunction( + ConnectorTransactionHandle transactionHandle, + ConnectorSession session, + ConnectorPartitioningHandle partitioningHandle, + List partitionChannelTypes, + int bucketCount) + { + HivePartitioningHandle handle = (HivePartitioningHandle) partitioningHandle; + List hiveTypes = handle.getHiveTypes(); + return new HiveBucketFunction(handle.getBucketingVersion(), bucketCount, hiveTypes, handle.isForUpdateOrDelete()); + } + + @Override + public ConnectorBucketNodeMap getBucketNodeMap(ConnectorTransactionHandle transactionHandle, ConnectorSession session, ConnectorPartitioningHandle partitioningHandle) + { + HivePartitioningHandle handle = (HivePartitioningHandle) partitioningHandle; + return createBucketNodeMap(handle.getBucketCount()); + } + + @Override + public ToIntFunction getSplitBucketFunction( + ConnectorTransactionHandle transactionHandle, + ConnectorSession session, + ConnectorPartitioningHandle partitioningHandle) + { + return value -> ((HiveSplitWrapper) value).getBucketNumber() + .orElseThrow(() -> new IllegalArgumentException("Bucket number not set in split")); + } + + @Override + public List listPartitionHandles(ConnectorTransactionHandle transactionHandle, ConnectorSession session, ConnectorPartitioningHandle partitioningHandle) + { + HivePartitioningHandle handle = (HivePartitioningHandle) partitioningHandle; + int bucketCount = handle.getBucketCount(); + return IntStream.range(0, bucketCount).mapToObj(HivePartitionHandle::new).collect(toImmutableList()); + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNotReadableException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNotReadableException.java new file mode 100644 index 00000000..6001e8b9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveNotReadableException.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; + +import java.util.Optional; + +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HiveNotReadableException + extends PrestoException +{ + private final SchemaTableName tableName; + private final Optional partition; + + public HiveNotReadableException(SchemaTableName tableName, Optional partition, String message) + { + super(partition.isPresent() ? HiveErrorCode.HIVE_PARTITION_NOT_READABLE : HiveErrorCode.HIVE_TABLE_READ_ONLY, composeMessage(tableName, partition, message)); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.partition = requireNonNull(partition, "partition is null"); + } + + private static String composeMessage(SchemaTableName tableName, Optional partition, String message) + { + return partition.isPresent() + ? format("Table '%s' partition '%s' is not readable: %s", tableName, partition.get(), message) + : format("Table '%s' is not readable: %s", tableName, message); + } + + public SchemaTableName getTableName() + { + return tableName; + } + + public Optional getPartition() + { + return partition; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOffloadExpression.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOffloadExpression.java new file mode 100644 index 00000000..662cc71e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOffloadExpression.java @@ -0,0 +1,175 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.huawei.boostkit.omnidata.model.AggregationInfo; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.RowExpression; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.expressions.LogicalRowExpressions.TRUE_CONSTANT; + +public class HiveOffloadExpression +{ + private final Set offloadColumns; + private final RowExpression filterExpression; // The default value is TRUE_CONSTANT, indicating that no operator is pushed down. + private final Optional aggregations; + private final OptionalLong limit; + private final Map projections; + + public HiveOffloadExpression() + { + this(Collections.emptySet(), TRUE_CONSTANT, Optional.empty(), OptionalLong.empty(), Collections.emptyMap()); + } + + @JsonCreator + public HiveOffloadExpression( + @JsonProperty("offloadColumns") Set offloadColumns, + @JsonProperty("filterExpression") RowExpression filterExpression, + @JsonProperty("aggregations") Optional aggregations, + @JsonProperty("limit") OptionalLong limit, + @JsonProperty("projections") Map projections) + { + this.offloadColumns = offloadColumns; + this.filterExpression = filterExpression; + this.aggregations = aggregations; + this.limit = limit; + this.projections = projections; + } + + @JsonProperty + public Set getOffloadColumns() + { + return offloadColumns; + } + + @JsonProperty + public RowExpression getFilterExpression() + { + return filterExpression; + } + + @JsonProperty + public Optional getAggregations() + { + return aggregations; + } + + @JsonProperty + public Map getProjections() + { + return projections; + } + + @JsonProperty + public OptionalLong getLimit() + { + return limit; + } + + public HiveOffloadExpression updateFilter(RowExpression filterExpression, Set offloadColumns) + { + /// todo : handle exception + checkArgument(!aggregations.isPresent() && !limit.isPresent() && projections.isEmpty(), + "Aggregations, limit or projection expression is not empty."); + Set newOffloadColumns = new HashSet<>(this.offloadColumns); + newOffloadColumns.addAll(offloadColumns); + return new HiveOffloadExpression(newOffloadColumns, filterExpression, aggregations, limit, projections); + } + + public HiveOffloadExpression updateAggregation(Optional aggregations, Set offloadColumns) + { + checkArgument(!limit.isPresent() && !this.aggregations.isPresent(), + "Limit or aggregations expression is not empty."); + Set newOffloadColumns = new HashSet<>(this.offloadColumns); + newOffloadColumns.addAll(offloadColumns); + return new HiveOffloadExpression(newOffloadColumns, filterExpression, aggregations, limit, projections); + } + + public HiveOffloadExpression updateLimit(OptionalLong limit) + { + return new HiveOffloadExpression(offloadColumns, filterExpression, aggregations, limit, projections); + } + + public HiveOffloadExpression updateProjections(Map projections, Set offloadColumns) + { + checkArgument(this.projections.isEmpty() && !aggregations.isPresent(), + "Projections or aggregations expression is not empty."); + Set newOffloadColumns = new HashSet<>(this.offloadColumns); + newOffloadColumns.addAll(offloadColumns); + return new 
HiveOffloadExpression(newOffloadColumns, filterExpression, aggregations, limit, projections); + } + + public boolean isPresent() + { + return !TRUE_CONSTANT.equals(filterExpression) || aggregations.isPresent() || limit.isPresent() || !projections.isEmpty(); + } + + public static String aggregationInfoToString(AggregationInfo aggregationInfo) + { + StringBuilder builder = new StringBuilder(); + for (Map.Entry entry : aggregationInfo.getAggregations().entrySet()) { + CallExpression call = entry.getValue().getCall(); + String argument = call.getArguments().isEmpty() ? "*" : call.getArguments().get(0).toString(); + builder.append(call.getDisplayName()).append("(").append(argument).append(") "); + } + + if (aggregationInfo.getGroupingKeys().isEmpty()) { + return builder.toString(); + } + + builder.append("group by:"); + for (RowExpression variable : aggregationInfo.getGroupingKeys()) { + builder.append(variable.toString()).append(" "); + } + return builder.toString(); + } + + @Override + public String toString() + { + if (!isPresent()) { + return ""; + } + + StringBuilder builder = new StringBuilder(); + builder.append(" offload={"); + if (!TRUE_CONSTANT.equals(filterExpression)) { + builder.append(" filter=[").append(filterExpression.toString()).append("]"); + } + if (!projections.isEmpty()) { + builder.append(" projections=["); + for (Map.Entry entry : projections.entrySet()) { + builder.append(entry.getKey().getName()).append(":").append(entry.getValue().toString()).append(" "); + } + builder.append("]"); + } + aggregations.ifPresent(expression -> + builder.append(" aggregation=[").append(aggregationInfoToString(expression)).append("]")); + limit.ifPresent(expression -> + builder.append(" limit=[").append(expression).append("]")); + builder.append("} "); + return builder.toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOutputTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOutputTableHandle.java new file mode 100644 index 00000000..df3f1e70 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveOutputTableHandle.java @@ -0,0 +1,84 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.spi.connector.ConnectorOutputTableHandle; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public class HiveOutputTableHandle + extends HiveWritableTableHandle + implements ConnectorOutputTableHandle +{ + private final List partitionedBy; + private final String tableOwner; + private final Map additionalTableParameters; + + @JsonCreator + public HiveOutputTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("inputColumns") List inputColumns, + @JsonProperty("pageSinkMetadata") HivePageSinkMetadata pageSinkMetadata, + @JsonProperty("locationHandle") LocationHandle locationHandle, + @JsonProperty("tableStorageFormat") HiveStorageFormat tableStorageFormat, + @JsonProperty("partitionStorageFormat") HiveStorageFormat partitionStorageFormat, + @JsonProperty("partitionedBy") List partitionedBy, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("tableOwner") String tableOwner, + @JsonProperty("additionalTableParameters") Map additionalTableParameters) + { + super( + schemaName, + tableName, + inputColumns, + pageSinkMetadata, + locationHandle, + bucketProperty, + tableStorageFormat, + partitionStorageFormat, + false); + + this.partitionedBy = ImmutableList.copyOf(requireNonNull(partitionedBy, "partitionedBy is null")); + this.tableOwner = requireNonNull(tableOwner, "tableOwner is null"); + this.additionalTableParameters = ImmutableMap.copyOf(requireNonNull(additionalTableParameters, "additionalTableParameters is null")); + } + + @JsonProperty + public List getPartitionedBy() + { + return partitionedBy; + } + + @JsonProperty + public String getTableOwner() + { + return tableOwner; + } + + @JsonProperty + public Map getAdditionalTableParameters() + { + return additionalTableParameters; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSink.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSink.java new file mode 100644 index 00000000..3a3f4f81 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSink.java @@ -0,0 +1,1076 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterators; +import com.google.common.primitives.Ints; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; +import io.airlift.concurrent.MoreFutures; +import io.airlift.json.JsonCodec; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.HiveVacuumTableHandle.Range; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.PageIndexer; +import io.prestosql.spi.PageIndexerFactory; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.IntArrayBlockBuilder; +import io.prestosql.spi.block.RowBlock; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorPageSink; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.snapshot.BlockEncodingSerdeProvider; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.airlift.slice.Slices.wrappedBuffer; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_TOO_MANY_OPEN_PARTITIONS; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class HivePageSink + implements ConnectorPageSink +{ + private static final 
Logger log = Logger.get(HivePageSink.class); + + private static final int MAX_PAGE_POSITIONS = 4096; + + private final HiveWriterFactory writerFactory; + + private final int[] dataColumnInputIndex; // ordinal of columns (not counting sample weight column) + private final int[] partitionColumnsInputIndex; // ordinal of columns (not counting sample weight column) + private final int rowIdColumnIndex; + private final HiveACIDWriteType acidWriteType; + + private final int[] bucketColumns; + private final HiveBucketFunction bucketFunction; + + private final HiveWriterPagePartitioner pagePartitioner; + private final HdfsEnvironment hdfsEnvironment; + + private final int maxOpenWriters; + private final ListeningExecutorService writeVerificationExecutor; + + private final JsonCodec partitionUpdateCodec; + + private final List writers = new ArrayList<>(); + + // Snapshot: parameters used to construct writer instances + private final List writerParams = new ArrayList<>(); + + protected final ConnectorSession session; + private final List nullBlocks; + + private long rows; + private long writtenBytes; + private long systemMemoryUsage; + private long validationCpuNanos; + protected final List inputColumns; + private final TypeManager typeManager; + protected final HiveWritableTableHandle writableTableHandle; + private final ThreadLocal> vacuumOptionsMap = ThreadLocal.withInitial(() -> null); + private VaccumOp vacuumOp; + + public HivePageSink( + HiveWriterFactory writerFactory, + List inputColumns, + Optional bucketProperty, + PageIndexerFactory pageIndexerFactory, + TypeManager typeManager, + HdfsEnvironment hdfsEnvironment, + int maxOpenWriters, + ListeningExecutorService writeVerificationExecutor, + JsonCodec partitionUpdateCodec, + ConnectorSession session, + HiveACIDWriteType acidWriteType, + HiveWritableTableHandle handle) + { + this.writerFactory = requireNonNull(writerFactory, "writerFactory is null"); + this.acidWriteType = acidWriteType; + this.writableTableHandle = requireNonNull(handle, "hive table handle is null"); + + this.inputColumns = requireNonNull(inputColumns, "inputColumns is null"); + this.typeManager = requireNonNull(typeManager, "typemMnager is null"); + + requireNonNull(pageIndexerFactory, "pageIndexerFactory is null"); + + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.maxOpenWriters = maxOpenWriters; + this.writeVerificationExecutor = requireNonNull(writeVerificationExecutor, "writeVerificationExecutor is null"); + this.partitionUpdateCodec = requireNonNull(partitionUpdateCodec, "partitionUpdateCodec is null"); + + requireNonNull(bucketProperty, "bucketProperty is null"); + this.pagePartitioner = new HiveWriterPagePartitioner( + inputColumns, + bucketProperty.isPresent() || + (handle.getTableStorageFormat() == HiveStorageFormat.ORC && + HiveACIDWriteType.isRowIdNeeded(acidWriteType) && + HiveACIDWriteType.VACUUM_UNIFY != acidWriteType && + !isInsertOnlyTable()), + isVacuumOperationValid() && !isInsertOnlyTable(), + pageIndexerFactory, + typeManager); + + // determine the input index of the partition columns and data columns + // and determine the input index and type of bucketing columns + ImmutableList.Builder partitionColumns = ImmutableList.builder(); + ImmutableList.Builder dataColumnsInputIndex = ImmutableList.builder(); + ImmutableList.Builder dataColumnTypes = ImmutableList.builder(); + Object2IntMap dataColumnNameToIdMap = new Object2IntOpenHashMap<>(); + Map dataColumnNameToTypeMap = new HashMap<>(); + // sample weight 
column is passed separately, so index must be calculated without this column + int inputIndex; + for (inputIndex = 0; inputIndex < inputColumns.size(); inputIndex++) { + HiveColumnHandle column = inputColumns.get(inputIndex); + if (column.isPartitionKey()) { + partitionColumns.add(inputIndex); + } + else { + dataColumnsInputIndex.add(inputIndex); + dataColumnNameToIdMap.put(column.getName(), inputIndex); + dataColumnNameToTypeMap.put(column.getName(), column.getHiveType()); + dataColumnTypes.add(typeManager.getType(column.getTypeSignature())); + } + } + rowIdColumnIndex = HiveACIDWriteType.isRowIdNeeded(acidWriteType) ? inputIndex : -1; + this.partitionColumnsInputIndex = Ints.toArray(partitionColumns.build()); + this.dataColumnInputIndex = Ints.toArray(dataColumnsInputIndex.build()); + + if (bucketProperty.isPresent()) { + BucketingVersion bucketingVersion = bucketProperty.get().getBucketingVersion(); + int bucketCount = bucketProperty.get().getBucketCount(); + bucketColumns = bucketProperty.get().getBucketedBy().stream() + .mapToInt(dataColumnNameToIdMap::get) + .toArray(); + List bucketColumnTypes = bucketProperty.get().getBucketedBy().stream() + .map(dataColumnNameToTypeMap::get) + .collect(toList()); + bucketFunction = new HiveBucketFunction(bucketingVersion, bucketCount, bucketColumnTypes); + } + else if (handle.getTableStorageFormat() == HiveStorageFormat.ORC && + HiveACIDWriteType.isRowIdNeeded(acidWriteType) && + !isInsertOnlyTable()) { + bucketColumns = new int[]{rowIdColumnIndex}; + bucketFunction = new HiveBucketFunction(BucketingVersion.BUCKETING_V2, + HiveBucketing.MAX_BUCKET_NUMBER, + ImmutableList.of(HiveColumnHandle.updateRowIdHandle().getHiveType()), + true); + } + else { + bucketColumns = null; + bucketFunction = null; + } + + if (acidWriteType == HiveACIDWriteType.DELETE) { + //Null blocks will be used in case of delete + ImmutableList.Builder nullBlocks = ImmutableList.builder(); + for (Type dataColumnType : dataColumnTypes.build()) { + BlockBuilder blockBuilder = dataColumnType.createBlockBuilder(null, 1, 0); + blockBuilder.appendNull(); + nullBlocks.add(blockBuilder.build()); + } + this.nullBlocks = nullBlocks.build(); + } + else { + this.nullBlocks = ImmutableList.of(); + } + + this.session = requireNonNull(session, "session is null"); + } + + @Override + public long getCompletedBytes() + { + return writtenBytes; + } + + @Override + public long getRowsWritten() + { + return rows; + } + + @Override + public long getSystemMemoryUsage() + { + return systemMemoryUsage; + } + + @Override + public long getValidationCpuNanos() + { + return validationCpuNanos; + } + + @Override + public CompletableFuture> finish() + { + // Must be wrapped in doAs entirely + // Implicit FileSystem initializations are possible in HiveRecordWriter#commit -> RecordWriter#close + ListenableFuture> result = hdfsEnvironment.doAs(session.getUser(), this::doFinish); + if (!session.isSnapshotEnabled()) { + return MoreFutures.toCompletableFuture(result); + } + + // Will merge all sub files and call doFinish again, so clear the total bytes + writtenBytes = 0; + ListenableFuture> mergedResult = hdfsEnvironment.doAs(session.getUser(), this::mergeFiles); + // Use mergedResult as return value (indexed at 1) + return MoreFutures.toCompletableFuture(Futures.transform(Futures.allAsList(result, mergedResult), results -> results.get(1), directExecutor())); + } + + private ListenableFuture> doFinish() + { + ImmutableList.Builder partitionUpdates = ImmutableList.builder(); + List> verificationTasks = new 
ArrayList<>(); + for (HiveWriter writer : writers) { + if (writer == null) { + continue; + } + writer.commit(); + PartitionUpdate partitionUpdate = writer.getPartitionUpdate(); + partitionUpdates.add(wrappedBuffer(partitionUpdateCodec.toJsonBytes(partitionUpdate))); + writer.getVerificationTask() + .map(Executors::callable) + .ifPresent(verificationTasks::add); + } + List result = partitionUpdates.build(); + + writtenBytes += writers.stream() + .filter(Objects::nonNull) + .mapToLong(HiveWriter::getWrittenBytes) + .sum(); + validationCpuNanos += writers.stream() + .filter(Objects::nonNull) + .mapToLong(HiveWriter::getValidationCpuNanos) + .sum(); + writers.clear(); + + if (vacuumOp != null) { + vacuumOp.close(); + } + + if (verificationTasks.isEmpty()) { + return Futures.immediateFuture(result); + } + + try { + List> futures = writeVerificationExecutor.invokeAll(verificationTasks).stream() + .map(future -> (ListenableFuture) future) + .collect(toList()); + return Futures.transform(Futures.allAsList(futures), input -> result, directExecutor()); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException(e); + } + } + + private ListenableFuture> mergeFiles() + { + // When snapshot is enabled, each snapshot produces a sub file, named file.n, for each "writer", + // where "file" is what the file would be named without snapshot, and n is a number starting from 0. + // Need to merge these sub files to a single one at the end. + checkState(writers.isEmpty()); + try { + for (WriterParam param : writerParams) { + // Construct writers for the merged files + HiveWriter hiveWriter = writerFactory.createWriterForSnapshotMerge(param.partitionValues, param.bucket, Optional.empty()); + writers.add(hiveWriter); + } + writerFactory.mergeSubFiles(writers); + // Finish up writers for merged files, to get final results and stats + return doFinish(); + } + catch (IOException e) { + log.debug("exception '%s' while merging subfile", e); + throw new RuntimeException(e); + } + } + + @Override + public void abort() + { + // Must be wrapped in doAs entirely + // Implicit FileSystem initializations are possible in HiveRecordWriter#rollback -> RecordWriter#close + hdfsEnvironment.doAs(session.getUser(), this::doAbort); + } + + @Override + public void cancelToResume() + { + // Must be wrapped in doAs entirely + // Implicit FileSystem initializations are possible in HiveRecordWriter#rollback -> RecordWriter#close + hdfsEnvironment.doAs(session.getUser(), this::doAbort); + } + + private void doAbort() + { + Optional rollbackException = Optional.empty(); + for (HiveWriter writer : writers) { + // writers can contain nulls if an exception is thrown when doAppend expends the writer list + if (writer != null) { + try { + writer.rollback(); + } + catch (Exception e) { + log.warn("exception '%s' while rollback on %s", e, writer); + rollbackException = Optional.of(e); + } + } + } + writers.clear(); + + if (rollbackException.isPresent()) { + throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error rolling back write to Hive", rollbackException.get()); + } + } + + @Override + public VacuumResult vacuum(ConnectorPageSourceProvider pageSourceProvider, + ConnectorTransactionHandle transactionHandle, + ConnectorTableHandle connectorTableHandle, + List splits) + { + if (vacuumOp == null) { + vacuumOp = new VaccumOp(pageSourceProvider, transactionHandle, connectorTableHandle, splits); + } + Page page = vacuumOp.processNext(); + return new VacuumResult(page, vacuumOp.isFinished()); + } + + 
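+    // VaccumOp drives the compaction started by vacuum() above: it opens one page source per
+    // input split, merge-sorts the resulting pages on the trailing rowId column (insert-only
+    // tables are read in insert order via compareInsertOnlySplits and concatenated, as they
+    // carry no rowId column), pushes every page back through appendPage(), and writes empty
+    // bucket files via createEmptyFiles() when all rows of the partition/table were deleted.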
private class VaccumOp + { + List pageSources; + Iterator sortedPagesForVacuum; + Set allBucketList = new HashSet<>(); + Map> partitionToBuckets = new HashMap<>(); + Map> partitionToKeys = new HashMap<>(); + AtomicInteger rowsWritten = new AtomicInteger(); + AtomicBoolean emptyfileWriten = new AtomicBoolean(); + + private VaccumOp(ConnectorPageSourceProvider pageSourceProvider, + ConnectorTransactionHandle transactionHandle, + ConnectorTableHandle connectorTableHandle, + List splits) + { + List hiveSplits = new ArrayList<>(); + splits.forEach(split -> { + hiveSplits.addAll(((HiveSplitWrapper) split).getSplits()); + }); + vacuumOptionsMap.set(initVacuumOptions(hiveSplits)); + //In case of no + hiveSplits.stream().forEach(split -> { + String partitionName = split.getPartitionName(); + if (partitionName != null && !partitionName.isEmpty()) { + Set partitionBuckets = partitionToBuckets.computeIfAbsent(partitionName, + (partition) -> new HashSet<>()); + List partitionKeys = split.getPartitionKeys(); + partitionToKeys.put(partitionName, partitionKeys); + partitionBuckets.add(split.getBucketNumber().orElse(0)); + } + allBucketList.add(split.getBucketNumber().orElse(0)); + }); + + List inputColumns = new ArrayList<>(HivePageSink.this.inputColumns); + if (isInsertOnlyTable() || acidWriteType == HiveACIDWriteType.VACUUM_UNIFY) { + //Insert only tables Just need to merge contents together. No processing required. + //During vacuum unify, all buckets will be merged to one. + //There is no need to sort again. sort_by is valid only on bucketed table, + //for which VACUUM_UNIFY is not valid. + List multiSplits = hiveSplits.stream() + .map(HiveSplitWrapper::wrap) + .collect(toList()); + if (isInsertOnlyTable()) { + Collections.sort(multiSplits, this::compareInsertOnlySplits); + } + else if (acidWriteType == HiveACIDWriteType.VACUUM_UNIFY) { + HiveColumnHandle rowIdHandle = HiveColumnHandle.updateRowIdHandle(); + inputColumns.add(rowIdHandle); + } + + pageSources = multiSplits.stream() + .map(split -> pageSourceProvider.createPageSource( + transactionHandle, session, split, connectorTableHandle, inputColumns)) + .collect(toList()); + List> pageSourceIterators = HiveUtil.getPageSourceIterators(pageSources); + sortedPagesForVacuum = Iterators.concat(pageSourceIterators.iterator()); + } + else { + HiveColumnHandle rowIdHandle = HiveColumnHandle.updateRowIdHandle(); + inputColumns.add(rowIdHandle); + + List multiSplits = hiveSplits.stream() + .map(HiveSplitWrapper::wrap) + .collect(toList()); + + pageSources = multiSplits.stream() + .map(split -> pageSourceProvider.createPageSource( + transactionHandle, session, split, connectorTableHandle, inputColumns)) + .collect(toList()); + List columnTypes = inputColumns.stream() + .map(c -> ((HiveColumnHandle) c).getHiveType().getType(typeManager)) + .collect(toList()); + //Last index for rowIdHandle + List sortFields = ImmutableList.of(inputColumns.size() - 1); + sortedPagesForVacuum = HiveUtil.getMergeSortedPages(pageSources, columnTypes, sortFields, + ImmutableList.of(SortOrder.ASC_NULLS_FIRST)); + } + } + + Page processNext() + { + if (sortedPagesForVacuum.hasNext()) { + Page page = sortedPagesForVacuum.next(); + appendPage(page); + rowsWritten.addAndGet(page.getPositionCount()); + return page; + } + if (rowsWritten.get() == 0) { + //In case this partition/table have 0 rows, then create empty file. 
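+                //(for example, a major compaction run after a DELETE removed every row);
+                //createEmptyFiles writes one empty file per bucket observed in the input
+                //splits so the base directory still indicates that all rows were deleted.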
+ if (partitionToBuckets.isEmpty()) { + createEmptyFiles(ImmutableList.of(), allBucketList); + } + else { + partitionToBuckets.entrySet().stream().forEach(entry -> { + String partitionName = entry.getKey(); + List partitionKeys = partitionToKeys.get(partitionName); + Set buckets = entry.getValue(); + createEmptyFiles(partitionKeys, buckets); + }); + } + } + return null; + } + + /* + * When all rows of table are deleted, then vacuum will not generate any of the compacted file. + * So at the end, need to generate empty bucket files in base directory to indicate all rows are deleted. + */ + private synchronized void createEmptyFiles(List partitionKeys, Set bucketNumbers) + { + if (emptyfileWriten.get()) { + return; + } + PageBuilder builder; + if (partitionKeys != null && !partitionKeys.isEmpty()) { + List partitionTypes = inputColumns.stream() + .filter(HiveColumnHandle::isPartitionKey) + .map(HiveColumnHandle::getHiveType) + .map((t) -> t.getType(typeManager)) + .collect(toList()); + builder = new PageBuilder(partitionTypes); + for (int i = 0; i < partitionKeys.size(); i++) { + HivePartitionKey partitionKey = partitionKeys.get(i); + Type type = partitionTypes.get(i); + Object partitionColumnValue = HiveUtil.typedPartitionKey(partitionKey.getValue(), type, partitionKey.getName()); + RunLengthEncodedBlock block = RunLengthEncodedBlock.create(type, partitionColumnValue, 1); + type.appendTo(block, 0, builder.getBlockBuilder(i)); + } + builder.declarePosition(); + } + else { + builder = new PageBuilder(ImmutableList.of()); + } + Page partitionColumns = builder.build(); + String partitionName = writerFactory.getPartitionName(partitionColumns, 0).orElse(HivePartition.UNPARTITIONED_ID); + bucketNumbers.forEach((bucket) -> { + List partitionValues = writerFactory.getPartitionValues(partitionColumns, 0); + writers.add(writerFactory.createWriter(partitionValues, OptionalInt.of(bucket), getVacuumOptions(partitionName))); + // TODO-cp-I2BZ0A: vacuum is not supported with snapshot + writerParams.add(null); + }); + emptyfileWriten.compareAndSet(false, true); + } + + boolean isFinished() + { + return !sortedPagesForVacuum.hasNext(); + } + + private Map initVacuumOptions(List hiveSplits) + { + return hdfsEnvironment.doAs(session.getUser(), () -> { + Map vacuumOptionsMap = new HashMap<>(); + //Findout the minWriteId and maxWriteId for current compaction. + HiveVacuumTableHandle vacuumTableHandle = (HiveVacuumTableHandle) writableTableHandle; + for (HiveSplit split : hiveSplits) { + String partition = split.getPartitionName(); + Options options = vacuumOptionsMap.get(partition); + if (options == null) { + options = new Options(writerFactory.getConf()).maximumWriteId(-1).minimumWriteId(Long.MAX_VALUE); + vacuumOptionsMap.put(partition, options); + } + if (vacuumTableHandle.isFullVacuum()) { + //Major compaction, need to write the base + options.writingBase(true); + Range range = getOnlyElement(vacuumTableHandle.getRanges().get(partition)); + options.minimumWriteId(range.getMin()); + if (vacuumTableHandle.isUnifyVacuum()) { + options.maximumWriteId(vacuumTableHandle.getLocationHandle().getJsonSerializablewriteIdInfo().get().getMaxWriteId()); + } + else { + options.maximumWriteId(range.getMax()); + } + Path bucketFile = new Path(split.getPath()); + OptionalInt bucketNumber = vacuumTableHandle.isUnifyVacuum() ? 
OptionalInt.of(0) : HiveUtil.getBucketNumber(bucketFile.getName()); + if (bucketNumber.isPresent()) { + options.bucket(bucketNumber.getAsInt()); + } + else { + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, "Error while parsing split info for vacuum"); + } + } + else { + Path bucketFile = new Path(split.getPath()); + try { + Options currentOptions = new Options(writerFactory.getConf()); + if (isInsertOnlyTable()) { + Path parent = bucketFile.getParent(); + if (parent.getName().startsWith(AcidUtils.BASE_PREFIX)) { + long baseWriteId = AcidUtils.parseBase(parent); + currentOptions.writingBase(true); + currentOptions.minimumWriteId(0); + currentOptions.maximumWriteId(baseWriteId); + } + else if (parent.getName().startsWith(AcidUtils.DELTA_PREFIX)) { + ParsedDelta parsedDelta = AcidUtils.parsedDelta(parent, parent.getFileSystem(writerFactory.getConf())); + currentOptions.maximumWriteId(parsedDelta.getMaxWriteId()); + currentOptions.minimumWriteId(parsedDelta.getMinWriteId()); + } + } + else { + currentOptions = AcidUtils.parseBaseOrDeltaBucketFilename(bucketFile, writerFactory.getConf()); + } + if (currentOptions.isWritingBase() || options.isWritingBase()) { + options.writingBase(true); + } + else if (options.isWritingDeleteDelta() || AcidUtils.isDeleteDelta(bucketFile.getParent())) { + options.writingDeleteDelta(true); + } + + if (currentOptions.getMinimumWriteId() < options.getMinimumWriteId()) { + options.minimumWriteId(currentOptions.getMinimumWriteId()); + } + if (currentOptions.getMaximumWriteId() > options.getMaximumWriteId()) { + options.maximumWriteId(currentOptions.getMaximumWriteId()); + } + options.bucket(currentOptions.getBucketId()); + } + catch (IOException e) { + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, "Error while parsing split info for vacuum", e); + } + Range suitableRange = getOnlyElement(vacuumTableHandle.getRanges().get(partition)); + options.minimumWriteId(suitableRange.getMin()); + options.maximumWriteId(suitableRange.getMax()); + } + } + return vacuumOptionsMap; + }); + } + + void close() + { + pageSources.forEach(c -> { + try { + c.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + }); + } + + /** + * Compares the INSERT_ONLY transactional table's splits to read in the same sequence of insert. + */ + private int compareInsertOnlySplits(HiveSplitWrapper o1, HiveSplitWrapper o2) + { + if (o1.getFilePath().equals(o2.getFilePath())) { + //same file different splits. 
+ return Long.compare(o1.getStartIndex(), o2.getStartIndex()); + } + Path p1 = new Path(o1.getFilePath()); + Path p2 = new Path(o2.getFilePath()); + String p1Parent = p1.getParent().getName(); + String p2Parent = p2.getParent().getName(); + if (p1Parent.equals(p2Parent)) { + //Same parent + return p1.getName().compareTo(p2.getName()); + } + boolean isP1AcidDir = p1Parent.startsWith(AcidUtils.BASE_PREFIX) || p1Parent.startsWith(AcidUtils.DELTA_PREFIX); + boolean isP2AcidDir = p2Parent.startsWith(AcidUtils.BASE_PREFIX) || p2Parent.startsWith(AcidUtils.DELTA_PREFIX); + if (isP1AcidDir && isP2AcidDir) { + //Both are ACID inserts + if (p1Parent.startsWith(AcidUtils.BASE_PREFIX)) { + //base will have higher priority + return -1; + } + else if (p2Parent.startsWith(AcidUtils.BASE_PREFIX)) { + return 1; + } + return p1Parent.compareTo(p2Parent); + } + //Both are not acid + if (!isP1AcidDir && !isP2AcidDir) { + return p1.getName().compareTo(p2.getName()); + } + //o1 is Original + if (!isP1AcidDir) { + return -1; + } + return 1; + } + } + + private Optional getVacuumOptions(String partition) + { + Map partitionToOptions = vacuumOptionsMap.get(); + if (partitionToOptions == null) { + return Optional.empty(); + } + return Optional.ofNullable(partitionToOptions.get(partition)); + } + + @Override + public CompletableFuture appendPage(Page page) + { + if (page.getPositionCount() > 0) { + // Must be wrapped in doAs entirely + // Implicit FileSystem initializations are possible in HiveRecordWriter#addRow or #createWriter + hdfsEnvironment.doAs(session.getUser(), () -> doAppend(page)); + } + + return NOT_BLOCKED; + } + + private void doAppend(Page page) + { + while (page.getPositionCount() > MAX_PAGE_POSITIONS) { + Page chunk = page.getRegion(0, MAX_PAGE_POSITIONS); + page = page.getRegion(MAX_PAGE_POSITIONS, page.getPositionCount() - MAX_PAGE_POSITIONS); + writePage(chunk); + } + + writePage(page); + } + + private void writePage(Page page) + { + int[] writerIndexes = getWriterIndexes(page); + + // position count for each writer + int[] sizes = new int[writers.size()]; + for (int index : writerIndexes) { + sizes[index]++; + } + + // record which positions are used by which writer + int[][] writerPositions = new int[writers.size()][]; + int[] counts = new int[writers.size()]; + + for (int position = 0; position < page.getPositionCount(); position++) { + int index = writerIndexes[position]; + + int count = counts[index]; + if (count == 0) { + writerPositions[index] = new int[sizes[index]]; + } + writerPositions[index][count] = position; + counts[index] = count + 1; + } + + // invoke the writers + Page dataPage = getDataPage(page); + for (int index = 0; index < writerPositions.length; index++) { + int[] positions = writerPositions[index]; + if (positions == null) { + continue; + } + + // If write is partitioned across multiple writers, filter page using dictionary blocks + Page pageForWriter = dataPage; + if (positions.length != dataPage.getPositionCount()) { + verify(positions.length == counts[index]); + pageForWriter = pageForWriter.getPositions(positions, 0, positions.length); + } + + HiveWriter writer = writers.get(index); + + long currentWritten = writer.getWrittenBytes(); + long currentMemory = writer.getSystemMemoryUsage(); + + writer.append(pageForWriter); + + writtenBytes += (writer.getWrittenBytes() - currentWritten); + systemMemoryUsage += (writer.getSystemMemoryUsage() - currentMemory); + } + rows += page.getPositionCount(); + } + + private int[] getWriterIndexes(Page page) + { + Page 
partitionColumns = extractColumns(page, partitionColumnsInputIndex); + Block bucketBlock = buildBucketBlock(page); + Block operationIdBlock = buildAcidOperationBlock(page); + int[] writerIndexes = pagePartitioner.partitionPage(partitionColumns, bucketBlock, operationIdBlock); + if (pagePartitioner.getMaxIndex() >= maxOpenWriters) { + throw new PrestoException(HIVE_TOO_MANY_OPEN_PARTITIONS, format("Exceeded limit of %s open writers for partitions/buckets", maxOpenWriters)); + } + + // expand writers list to new size + while (writers.size() <= pagePartitioner.getMaxIndex()) { + writers.add(null); + } + // The 2 lists may start with different sizes (e.g. writer is 0 after resume; but writerParams has entries), + // They will end up with same size after the loops + while (writerParams.size() <= pagePartitioner.getMaxIndex()) { + writerParams.add(null); + } + + // create missing writers + for (int position = 0; position < page.getPositionCount(); position++) { + int writerIndex = writerIndexes[position]; + if (writers.get(writerIndex) != null) { + continue; + } + + Optional partitionName = writerFactory.getPartitionName(partitionColumns, position); + String partition = partitionName.orElse(HivePartition.UNPARTITIONED_ID); + OptionalInt bucketNumber = OptionalInt.empty(); + Optional partitionOptions = getVacuumOptions(partition); + if (bucketBlock != null) { + bucketNumber = OptionalInt.of(bucketBlock.getInt(position, 0)); + } + else if (acidWriteType == HiveACIDWriteType.VACUUM_UNIFY) { + bucketNumber = OptionalInt.of(0); + } + else if (isVacuumOperationValid() && isInsertOnlyTable()) { + bucketNumber = OptionalInt.of(partitionOptions.get().getBucketId()); + } + else if (session.getTaskId().isPresent() && writerFactory.isTxnTable()) { + //Use the taskId and driverId to get bucketId for ACID table + bucketNumber = generateBucketNumber(partitionColumns.getChannelCount() != 0); + } + List partitionValues = writerFactory.getPartitionValues(partitionColumns, position); + HiveWriter writer = writerFactory.createWriter(partitionValues, bucketNumber, partitionOptions); + // Snapshot: record what parameters were used to construct the writer. Vacuum is not supported currently. + writerParams.set(writerIndex, new WriterParam(partitionValues, bucketNumber, writer.getFilePath())); + writers.set(writerIndex, writer); + } + verify(writers.size() == pagePartitioner.getMaxIndex() + 1); + // After snapshots are taken, the writer list is cleared. New pages may skip over certain writer indexes. + verify(session.isSnapshotEnabled() || !writers.contains(null)); + + return writerIndexes; + } + + private OptionalInt generateBucketNumber(boolean isPartitionedTable) + { + if (session.getTaskId().isPresent() && session.getDriverId().isPresent() && writerFactory.isTxnTable() && + (!(isPartitionedTable && HiveSessionProperties.isWritePartitionDistributionEnabled(session)))) { + int taskId = session.getTaskId().getAsInt(); + int driverId = session.getDriverId().getAsInt(); + int taskWriterCount = session.getTaskWriterCount(); + //taskId starts from 0. + //driverId starts from 0 and will be < taskWriterCount. 
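+            //(hypothetical values for illustration: taskWriterCount = 4, taskId = 2, driverId = 3 -> bucket 2 * 4 + 3 = 11, inside [8, 12))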
+ //taskWriterCount starts from 1 + // for taskId n, buckets will be between n*taskWriterCount (inclusive) and (n+1)*taskWriterCount (exclusive) + int bucketNumber = taskId * taskWriterCount + driverId; + return OptionalInt.of(bucketNumber); + } + return OptionalInt.empty(); + } + + private Page getDataPage(Page page) + { + Block[] blocks = null; + if (HiveACIDWriteType.isRowIdNeeded(acidWriteType) && !isInsertOnlyTable()) { + //For UPDATE/DELETE source page will have extra block for row_id column. + blocks = new Block[dataColumnInputIndex.length + 1]; + blocks[dataColumnInputIndex.length] = page.getBlock(rowIdColumnIndex); + } + else { + blocks = new Block[dataColumnInputIndex.length]; + } + for (int i = 0; i < dataColumnInputIndex.length; i++) { + if (acidWriteType == HiveACIDWriteType.DELETE) { + //For delete remaining data not required as these will be ignored during write. + //But this will reduce the data size of sort buffer + blocks[i] = new RunLengthEncodedBlock(nullBlocks.get(i), page.getPositionCount()); + } + else { + int dataColumn = dataColumnInputIndex[i]; + blocks[i] = page.getBlock(dataColumn); + } + } + return new Page(page.getPositionCount(), blocks); + } + + private Block buildBucketBlock(Page page) + { + if (acidWriteType == HiveACIDWriteType.VACUUM_UNIFY) { + //There is no pre bucket block in case of unify + return null; + } + if (bucketFunction == null) { + return null; + } + + IntArrayBlockBuilder bucketColumnBuilder = new IntArrayBlockBuilder(null, page.getPositionCount()); + Page bucketColumnsPage = extractColumns(page, bucketColumns); + for (int position = 0; position < page.getPositionCount(); position++) { + int bucket = bucketFunction.getBucket(bucketColumnsPage, position); + bucketColumnBuilder.writeInt(bucket); + } + return bucketColumnBuilder.build(); + } + + /** + * In case of minor VACUUM there is a chance of delete_delta files to be written along with delta_files. + * partition and bucket value can be same for both delete_delta and delete, so to differentiate these writers + * include another column 'operationId' for hashing. + */ + private Block buildAcidOperationBlock(Page page) + { + if (!isVacuumOperationValid() || isInsertOnlyTable()) { + //Supported only for ORC format + return null; + } + + IntArrayBlockBuilder operationIdBuilder = new IntArrayBlockBuilder(null, page.getPositionCount()); + Block rowIdBlock = page.getBlock(rowIdColumnIndex); + for (int position = 0; position < page.getPositionCount(); position++) { + RowBlock rowBlock = (RowBlock) rowIdBlock.getSingleValueBlock(position); + int operationId = rowBlock.getRawFieldBlocks()[4].getInt(0, 0); + if (operationId == HiveACIDWriteType.DELETE.getOperationId()) { + operationIdBuilder.writeInt(operationId); + } + else { + operationIdBuilder.writeInt(0); + } + } + return operationIdBuilder.build(); + } + + private boolean isVacuumOperationValid() + { + return HiveACIDWriteType.isVacuum(acidWriteType) && + writableTableHandle != null && + writableTableHandle.getTableStorageFormat() == HiveStorageFormat.ORC; + } + + private boolean isInsertOnlyTable() + { + Optional
table = writableTableHandle.getPageSinkMetadata().getTable(); + return table.isPresent() && AcidUtils.isInsertOnlyTable(table.get().getParameters()); + } + + private static Page extractColumns(Page page, int[] columns) + { + Block[] blocks = new Block[columns.length]; + for (int i = 0; i < columns.length; i++) { + int dataColumn = columns[i]; + blocks[i] = page.getBlock(dataColumn); + } + return new Page(page.getPositionCount(), blocks); + } + + private static class HiveWriterPagePartitioner + { + private final PageIndexer pageIndexer; + + public HiveWriterPagePartitioner( + List inputColumns, + boolean bucketed, + boolean isVacuum, + PageIndexerFactory pageIndexerFactory, + TypeManager typeManager) + { + requireNonNull(inputColumns, "inputColumns is null"); + requireNonNull(pageIndexerFactory, "pageIndexerFactory is null"); + + List partitionColumnTypes = inputColumns.stream() + .filter(HiveColumnHandle::isPartitionKey) + .map(column -> typeManager.getType(column.getTypeSignature())) + .collect(toList()); + + if (bucketed) { + partitionColumnTypes.add(INTEGER); + } + + if (isVacuum) { + partitionColumnTypes.add(INTEGER); + } + + this.pageIndexer = pageIndexerFactory.createPageIndexer(partitionColumnTypes); + } + + public int[] partitionPage(Page partitionColumns, Block bucketBlock, Block operationIdBlock) + { + if (bucketBlock != null) { + Block[] blocks = new Block[partitionColumns.getChannelCount() + 1]; + for (int i = 0; i < partitionColumns.getChannelCount(); i++) { + blocks[i] = partitionColumns.getBlock(i); + } + blocks[blocks.length - 1] = bucketBlock; + partitionColumns = new Page(partitionColumns.getPositionCount(), blocks); + } + if (operationIdBlock != null) { + Block[] blocks = new Block[partitionColumns.getChannelCount() + 1]; + for (int i = 0; i < partitionColumns.getChannelCount(); i++) { + blocks[i] = partitionColumns.getBlock(i); + } + blocks[blocks.length - 1] = operationIdBlock; + partitionColumns = new Page(partitionColumns.getPositionCount(), blocks); + } + return pageIndexer.indexPage(partitionColumns); + } + + public int getMaxIndex() + { + return pageIndexer.getMaxIndex(); + } + } + + @Override + public Object capture(BlockEncodingSerdeProvider serdeProvider) + { + // TODO-cp-I2BZ0A: What to do about this? How to capture its state? + checkState(vacuumOp == null); + + try { + // Commit current set of sub files. New writers will be created for new sub files. + hdfsEnvironment.doAs(session.getUser(), this::doFinish).get(); + } + catch (Exception e) { + throw new RuntimeException(e); + } + + // TODO-cp-I2BZ0A: ClassNotFoundException when using "State" class, because Hive classes are from a different classloader + Map state = new HashMap<>(); + state.put("pagePartitioner", pagePartitioner.pageIndexer.capture(serdeProvider)); + state.put("writerFactory", writerFactory.capture()); + state.put("writerParams", new ArrayList<>(writerParams.stream().map(p -> { + Map map = new HashMap<>(); + map.put("partitionValues", p.partitionValues); + map.put("bucket", p.bucket.isPresent() ? 
p.bucket.getAsInt() : null); + map.put("filePath", p.filePath); + return map; + }).collect(toList()))); + state.put("rows", rows); + state.put("writtenBytes", writtenBytes); + state.put("systemMemoryUsage", systemMemoryUsage); + state.put("validationCpuNanos", validationCpuNanos); + return state; + } + + @Override + public void restore(Object obj, BlockEncodingSerdeProvider serdeProvider, long resumeCount) + { + checkState(writers.isEmpty()); + // TODO-cp-I2BZ0A: ClassNotFoundException when using "State" class, because Hive classes are from a different classloader + Map state = (Map) obj; + pagePartitioner.pageIndexer.restore(state.get("pagePartitioner"), serdeProvider); + writerFactory.restore(state.get("writerFactory"), resumeCount); + writerParams.clear(); + writerParams.addAll(((List>) state.get("writerParams")).stream().map(p -> { + return new WriterParam( + (List) p.get("partitionValues"), + p.get("bucket") == null ? OptionalInt.empty() : OptionalInt.of((Integer) p.get("bucket")), + (String) p.get("filePath")); + }).collect(toList())); + rows = (Long) state.get("rows"); + writtenBytes = (Long) state.get("writtenBytes"); + systemMemoryUsage = (Long) state.get("systemMemoryUsage"); + validationCpuNanos = (Long) state.get("validationCpuNanos"); + } + + private static class State + implements Serializable + { + private Object pagePartitioner; + private Object writerFactory; + private List writerParams; + private long rows; + private long writtenBytes; + private long systemMemoryUsage; + private long validationCpuNanos; + } + + private static class WriterParam + implements Serializable + { + private final List partitionValues; + private final OptionalInt bucket; + private final String filePath; + + private WriterParam(List partitionValues, OptionalInt bucket, String filePath) + { + this.partitionValues = partitionValues; + this.bucket = bucket; + this.filePath = filePath; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSinkProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSinkProvider.java new file mode 100644 index 00000000..f38a2176 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSinkProvider.java @@ -0,0 +1,214 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.util.concurrent.ListeningExecutorService; +import io.airlift.event.client.EventClient; +import io.airlift.json.JsonCodec; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.CachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadataProvider; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.spi.NodeManager; +import io.prestosql.spi.PageIndexerFactory; +import io.prestosql.spi.PageSorter; +import io.prestosql.spi.connector.ConnectorDeleteAsInsertTableHandle; +import io.prestosql.spi.connector.ConnectorInsertTableHandle; +import io.prestosql.spi.connector.ConnectorOutputTableHandle; +import io.prestosql.spi.connector.ConnectorPageSink; +import io.prestosql.spi.connector.ConnectorPageSinkProvider; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.ConnectorUpdateTableHandle; +import io.prestosql.spi.connector.ConnectorVacuumTableHandle; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.util.List; +import java.util.Map; +import java.util.OptionalInt; +import java.util.Set; + +import static com.google.common.util.concurrent.MoreExecutors.listeningDecorator; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.Executors.newFixedThreadPool; + +public class HivePageSinkProvider + implements ConnectorPageSinkProvider +{ + private final Set fileWriterFactories; + private final HdfsEnvironment hdfsEnvironment; + private final PageSorter pageSorter; + private final HiveMetastore metastore; + private final PageIndexerFactory pageIndexerFactory; + private final TypeManager typeManager; + private final int maxOpenPartitions; + private final int maxOpenSortFiles; + private final DataSize writerSortBufferSize; + private final boolean immutablePartitions; + private final LocationService locationService; + private final ListeningExecutorService writeVerificationExecutor; + private final JsonCodec partitionUpdateCodec; + private final NodeManager nodeManager; + private final EventClient eventClient; + private final HiveSessionProperties hiveSessionProperties; + private final HiveWriterStats hiveWriterStats; + private final OrcFileWriterFactory orcFileWriterFactory; + private final long perTransactionMetastoreCacheMaximumSize; + private final DateTimeZone parquetTimeZone; + + @Inject + public HivePageSinkProvider( + Set fileWriterFactories, + HdfsEnvironment hdfsEnvironment, + PageSorter pageSorter, + HiveMetastore metastore, + PageIndexerFactory pageIndexerFactory, + TypeManager typeManager, + HiveConfig config, + LocationService locationService, + JsonCodec partitionUpdateCodec, + NodeManager nodeManager, + EventClient eventClient, + HiveSessionProperties hiveSessionProperties, + HiveWriterStats hiveWriterStats, + OrcFileWriterFactory orcFileWriterFactory) + { + this.fileWriterFactories = ImmutableSet.copyOf(requireNonNull(fileWriterFactories, "fileWriterFactories is null")); + 
this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.pageSorter = requireNonNull(pageSorter, "pageSorter is null"); + this.metastore = requireNonNull(metastore, "metastore is null"); + this.pageIndexerFactory = requireNonNull(pageIndexerFactory, "pageIndexerFactory is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.maxOpenPartitions = config.getMaxPartitionsPerWriter(); + this.maxOpenSortFiles = config.getMaxOpenSortFiles(); + this.writerSortBufferSize = requireNonNull(config.getWriterSortBufferSize(), "writerSortBufferSize is null"); + this.immutablePartitions = config.isImmutablePartitions(); + this.locationService = requireNonNull(locationService, "locationService is null"); + this.writeVerificationExecutor = listeningDecorator(newFixedThreadPool(config.getWriteValidationThreads(), daemonThreadsNamed("hive-write-validation-%s"))); + this.partitionUpdateCodec = requireNonNull(partitionUpdateCodec, "partitionUpdateCodec is null"); + this.nodeManager = requireNonNull(nodeManager, "nodeManager is null"); + this.eventClient = requireNonNull(eventClient, "eventClient is null"); + this.hiveSessionProperties = requireNonNull(hiveSessionProperties, "hiveSessionProperties is null"); + this.hiveWriterStats = requireNonNull(hiveWriterStats, "stats is null"); + this.orcFileWriterFactory = requireNonNull(orcFileWriterFactory, "orcFileWriterFactory is null"); + this.perTransactionMetastoreCacheMaximumSize = config.getPerTransactionMetastoreCacheMaximumSize(); + this.parquetTimeZone = config.getParquetDateTimeZone(); + } + + @Override + public ConnectorPageSink createPageSink(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorOutputTableHandle tableHandle) + { + HiveOutputTableHandle handle = (HiveOutputTableHandle) tableHandle; + //Support CTAS for transactional tables. + boolean isAcidInsert = handle.getAdditionalTableParameters() != null && AcidUtils.isTransactionalTable(handle.getAdditionalTableParameters()); + HiveACIDWriteType acidWriteType = isAcidInsert ? HiveACIDWriteType.INSERT : HiveACIDWriteType.NONE; + return createPageSink(handle, true, acidWriteType, session, handle.getAdditionalTableParameters()); + } + + @Override + public ConnectorPageSink createPageSink(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorUpdateTableHandle tableHandle) + { + HiveUpdateTableHandle handle = (HiveUpdateTableHandle) tableHandle; + return createPageSink(handle, false, HiveACIDWriteType.UPDATE, session, ImmutableMap.of()); + } + + public ConnectorPageSink createPageSink(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorDeleteAsInsertTableHandle tableHandle) + { + HiveDeleteAsInsertTableHandle handle = (HiveDeleteAsInsertTableHandle) tableHandle; + return createPageSink(handle, false, HiveACIDWriteType.DELETE, session, ImmutableMap.of()); + } + + @Override + public ConnectorPageSink createPageSink(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorVacuumTableHandle tableHandle) + { + HiveVacuumTableHandle handle = (HiveVacuumTableHandle) tableHandle; + return createPageSink(handle, false, handle.isUnifyVacuum() ? 
HiveACIDWriteType.VACUUM_UNIFY : HiveACIDWriteType.VACUUM, session, ImmutableMap.of()); + } + + @Override + public ConnectorPageSink createPageSink(ConnectorTransactionHandle transaction, ConnectorSession session, ConnectorInsertTableHandle tableHandle) + { + HiveInsertTableHandle handle = (HiveInsertTableHandle) tableHandle; + if (handle.getIsOverwrite()) { + return createPageSink(handle, false, HiveACIDWriteType.INSERT_OVERWRITE, session, ImmutableMap.of()); + } + return createPageSink(handle, false, HiveACIDWriteType.INSERT, session, ImmutableMap.of() /* for insert properties are taken from metastore */); + } + + private ConnectorPageSink createPageSink(HiveWritableTableHandle handle, boolean isCreateTable, HiveACIDWriteType acidWriteType, ConnectorSession session, + Map additionalTableParameters) + { + OptionalInt bucketCount = OptionalInt.empty(); + List sortedBy = ImmutableList.of(); + + if (handle.getBucketProperty().isPresent()) { + bucketCount = OptionalInt.of(handle.getBucketProperty().get().getBucketCount()); + sortedBy = handle.getBucketProperty().get().getSortedBy(); + } + + HiveWriterFactory writerFactory = new HiveWriterFactory( + fileWriterFactories, + handle.getSchemaName(), + handle.getTableName(), + isCreateTable, + acidWriteType, + handle.getInputColumns(), + handle.getTableStorageFormat(), + handle.getPartitionStorageFormat(), + additionalTableParameters, + bucketCount, + sortedBy, + handle.getLocationHandle(), + locationService, + session.getQueryId(), + new HivePageSinkMetadataProvider(handle.getPageSinkMetadata(), CachingHiveMetastore.memoizeMetastore(metastore, perTransactionMetastoreCacheMaximumSize), new HiveIdentity(session)), + typeManager, + hdfsEnvironment, + pageSorter, + writerSortBufferSize, + maxOpenSortFiles, + immutablePartitions, + parquetTimeZone, + session, + nodeManager, + eventClient, + hiveSessionProperties, + hiveWriterStats, + orcFileWriterFactory); + + return new HivePageSink( + writerFactory, + handle.getInputColumns(), + handle.getBucketProperty(), + pageIndexerFactory, + typeManager, + hdfsEnvironment, + maxOpenPartitions, + writeVerificationExecutor, + partitionUpdateCodec, + session, + acidWriteType, + handle); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSource.java new file mode 100644 index 00000000..bea8695d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSource.java @@ -0,0 +1,522 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.primitives.Booleans; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.HivePageSourceProvider.BucketAdaptation; +import io.prestosql.plugin.hive.HivePageSourceProvider.ColumnMapping; +import io.prestosql.plugin.hive.coercions.HiveCoercer; +import io.prestosql.plugin.hive.orc.OrcSelectivePageSource; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.LazyBlock; +import io.prestosql.spi.block.LazyBlockLoader; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.dynamicfilter.BloomFilterDynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.TypeUtils; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveBucketing.getHiveBucket; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_BUCKET_FILES; +import static io.prestosql.plugin.hive.HivePageSourceProvider.ColumnMappingKind.PREFILLED; +import static io.prestosql.plugin.hive.HiveSessionProperties.getDynamicFilteringRowFilteringThreshold; +import static io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered; +import static io.prestosql.plugin.hive.HiveUtil.typedPartitionKey; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HivePageSource + implements ConnectorPageSource +{ + private static final Logger log = Logger.get(HivePageSource.class); + + private final List columnMappings; + private final Optional bucketAdapter; + private final Object[] prefilledValues; + private final Type[] types; + private final TypeManager typeManager; + private final List>> coercers; + private final int rowFilteringThreshold; + protected boolean eligibleForRowFiltering; + + private final ConnectorPageSource delegate; + + private final List partitionKeys; + private final Optional dynamicFilterSupplier; + private boolean isSelectiveRead; + + public HivePageSource( + List columnMappings, + Optional bucketAdaptation, + TypeManager typeManager, + ConnectorPageSource delegate, + Optional dynamicFilterSupplier, + ConnectorSession session, + List partitionKeys) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + + this.delegate = requireNonNull(delegate, "delegate is null"); + this.columnMappings = columnMappings; + this.bucketAdapter = bucketAdaptation.map(BucketAdapter::new); + + 
this.dynamicFilterSupplier = dynamicFilterSupplier; + + this.partitionKeys = partitionKeys; + this.rowFilteringThreshold = getDynamicFilteringRowFilteringThreshold(session); + + int size = columnMappings.size(); + + prefilledValues = new Object[size]; + types = new Type[size]; + ImmutableList.Builder>> coercers = ImmutableList.builder(); + + for (int columnIndex = 0; columnIndex < size; columnIndex++) { + ColumnMapping columnMapping = columnMappings.get(columnIndex); + HiveColumnHandle column = columnMapping.getHiveColumnHandle(); + + String name = column.getName(); + Type type = typeManager.getType(column.getTypeSignature()); + types[columnIndex] = type; + + if (columnMapping.getCoercionFrom().isPresent()) { + coercers.add(Optional.of(HiveCoercer.createCoercer(typeManager, columnMapping.getCoercionFrom().get(), columnMapping.getHiveColumnHandle().getHiveType()))); + } + else { + coercers.add(Optional.empty()); + } + + if (columnMapping.getKind() == PREFILLED) { + if (columnMapping.getPrefilledValue() == null) { + prefilledValues[columnIndex] = null; + } + else { + prefilledValues[columnIndex] = typedPartitionKey(columnMapping.getPrefilledValue(), type, name); + } + } + } + this.coercers = coercers.build(); + this.isSelectiveRead = delegate instanceof OrcSelectivePageSource; + } + + private static Page extractColumns(Page page, int[] columns) + { + Block[] blocks = new Block[columns.length]; + for (int i = 0; i < columns.length; i++) { + int dataColumn = columns[i]; + blocks[i] = page.getBlock(dataColumn); + } + return new Page(page.getPositionCount(), blocks); + } + + @Override + public long getCompletedBytes() + { + return delegate.getCompletedBytes(); + } + + @Override + public long getReadTimeNanos() + { + return delegate.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return delegate.isFinished(); + } + + @Override + public Page getNextPage() + { + try { + final List> dynamicFilters; + if (dynamicFilterSupplier.isPresent()) { + dynamicFilters = dynamicFilterSupplier.get().getDynamicFilters(); + // Wait for any dynamic filter + if (dynamicFilters.isEmpty() && dynamicFilterSupplier.get().isBlocked()) { + return null; + } + + // Close the current PageSource if the partition should be filtered + List> dynamicFilterList = new ArrayList(); + for (Map df : dynamicFilters) { + Set values = df.values().stream().collect(Collectors.toSet()); + dynamicFilterList.add(values); + } + if (isPartitionFiltered(partitionKeys, dynamicFilterList, typeManager)) { + close(); + return null; + } + } + else { + dynamicFilters = ImmutableList.of(); + } + + Page dataPage = delegate.getNextPage(); + if (dataPage == null) { + return null; + } + + // This part is for filtering using the bloom filter + // we filter out rows that are not in the bloom filter + // using the filter rows function + if (!dynamicFilters.isEmpty()) { + final List> eligibleColumns = getEligibleColumnsForRowFiltering(dataPage.getChannelCount(), dynamicFilters); + if (!eligibleColumns.isEmpty()) { + dataPage = filter(dynamicFilters, dataPage, eligibleColumns, types); + } + } + + if (bucketAdapter.isPresent()) { + IntArrayList rowsToKeep = bucketAdapter.get().computeEligibleRowIds(dataPage); + Block[] adaptedBlocks = new Block[dataPage.getChannelCount()]; + for (int i = 0; i < adaptedBlocks.length; i++) { + Block block = dataPage.getBlock(i); + if (block instanceof LazyBlock && !((LazyBlock) block).isLoaded()) { + adaptedBlocks[i] = new LazyBlock(rowsToKeep.size(), new RowFilterLazyBlockLoader(dataPage.getBlock(i), 
rowsToKeep.elements())); + } + else { + adaptedBlocks[i] = block.getPositions(rowsToKeep.elements(), 0, rowsToKeep.size()); + } + } + dataPage = new Page(rowsToKeep.size(), adaptedBlocks); + } + + if (isSelectiveRead) { //FixMe(Rajeev) : Check way to optimize for prefilled fields. + return dataPage; + } + + int batchSize = dataPage.getPositionCount(); + List blocks = new ArrayList<>(); + for (int fieldId = 0; fieldId < columnMappings.size(); fieldId++) { + ColumnMapping columnMapping = columnMappings.get(fieldId); + switch (columnMapping.getKind()) { + case PREFILLED: + blocks.add(RunLengthEncodedBlock.create(types[fieldId], prefilledValues[fieldId], batchSize)); + break; + case REGULAR: + case TRANSACTIONID: + Block block = dataPage.getBlock(columnMapping.getIndex()); + Optional> coercer = coercers.get(fieldId); + if (coercer.isPresent()) { + block = new LazyBlock(batchSize, new CoercionLazyBlockLoader(block, coercer.get())); + } + blocks.add(block); + break; + case INTERIM: + // interim columns don't show up in output + break; + default: + throw new UnsupportedOperationException(); + } + } + return new Page(batchSize, dataPage.getPageMetadata(), blocks.toArray(new Block[0])); + } + catch (PrestoException e) { + closeWithSuppression(e); + throw e; + } + catch (RuntimeException e) { + closeWithSuppression(e); + throw new PrestoException(HIVE_CURSOR_ERROR, e); + } + } + + @Override + public void close() + { + try { + delegate.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public String toString() + { + return delegate.toString(); + } + + @Override + public long getSystemMemoryUsage() + { + return delegate.getSystemMemoryUsage(); + } + + protected void closeWithSuppression(Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + close(); + } + catch (RuntimeException e) { + // Self-suppression not permitted + if (throwable != e) { + throwable.addSuppressed(e); + } + } + } + + public ConnectorPageSource getPageSource() + { + return delegate; + } + + private List> getEligibleColumnsForRowFiltering(int channelCount, List> dynamicFilters) + { + List> eligibleColumnsList = new ArrayList<>(); + for (Map dynamicFilter : dynamicFilters) { + Map eligibleColumns = new HashMap<>(); + for (int channel = 0; channel < channelCount; channel++) { + HiveColumnHandle columnHandle = columnMappings.get(channel).getHiveColumnHandle(); + if (!columnHandle.isPartitionKey() && dynamicFilter.containsKey(columnHandle)) { + if (dynamicFilter.get(columnHandle).getSize() <= rowFilteringThreshold) { + eligibleColumns.put(channel, columnHandle); + } + } + } + if (!eligibleColumns.isEmpty()) { + eligibleColumnsList.add(eligibleColumns); + } + } + return eligibleColumnsList; + } + + private static boolean[] filterRows(List> dynamicFilters, Page page, List> eligibleColumns, Type[] types) + { + boolean[] result = new boolean[page.getPositionCount()]; + Arrays.fill(result, Boolean.FALSE); + // loop to handle union of filters if any + for (int j = 0; j < eligibleColumns.size(); j++) { + boolean[] filterResult = new boolean[page.getPositionCount()]; + Arrays.fill(filterResult, Boolean.TRUE); + for (Map.Entry column : eligibleColumns.get(j).entrySet()) { + final int columnIndex = column.getKey(); + final ColumnHandle columnHandle = column.getValue(); + final DynamicFilter dynamicFilter = dynamicFilters.get(j).get(columnHandle); + final Block block = page.getBlock(columnIndex).getLoadedBlock(); + if (dynamicFilter instanceof BloomFilterDynamicFilter) 
{ + block.filter(((BloomFilterDynamicFilter) dynamicFilters.get(j).get(columnHandle)).getBloomFilterDeserialized(), filterResult); + } + else { + for (int i = 0; i < block.getPositionCount(); i++) { + filterResult[i] = filterResult[i] && dynamicFilter.contains(TypeUtils.readNativeValue(types[columnIndex], block, i)); + } + } + } + // apply union of last filter + for (Map.Entry column : eligibleColumns.get(j).entrySet()) { + final int columnIndex = column.getKey(); + final Block block = page.getBlock(columnIndex).getLoadedBlock(); + for (int i = 0; i < block.getPositionCount(); i++) { + result[i] = result[i] || filterResult[i]; + } + } + } + return result; + } + + @VisibleForTesting + public static Page filter(List> dynamicFilters, Page page, List> eligibleColumns, Type[] types) + { + boolean[] result = filterRows(dynamicFilters, page, eligibleColumns, types); + int[] rowsToKeep = toPositions(result); + // If no row is filtered, no need to create a new page + if (rowsToKeep.length == page.getPositionCount()) { + return page; + } + + Block[] adaptedBlocks = new Block[page.getChannelCount()]; + for (int i = 0; i < adaptedBlocks.length; i++) { + Block block = page.getBlock(i); + if (block instanceof LazyBlock && !((LazyBlock) block).isLoaded()) { + adaptedBlocks[i] = new LazyBlock(rowsToKeep.length, new RowFilterLazyBlockLoader(page.getBlock(i), rowsToKeep)); + } + else { + adaptedBlocks[i] = block.getPositions(rowsToKeep, 0, rowsToKeep.length); + } + } + return new Page(rowsToKeep.length, adaptedBlocks); + } + + /** + * Is position 1-based? extract the "true" value positions + * + * @param keep Boolean array including which row to keep + * @return Int array of positions need to be kept + */ + private static int[] toPositions(boolean[] keep) + { + int size = Booleans.countTrue(keep); + int[] result = new int[size]; + int idx = 0; + for (int i = 0; i < keep.length; i++) { + if (keep[i]) { + result[idx] = i; //position is 1-based + idx++; + } + } + return result; + } + + private static final class CoercionLazyBlockLoader + implements LazyBlockLoader + { + private final Function coercer; + private Block block; + + public CoercionLazyBlockLoader(Block block, Function coercer) + { + this.block = requireNonNull(block, "block is null"); + this.coercer = requireNonNull(coercer, "coercer is null"); + } + + @Override + public void load(LazyBlock lazyBlock) + { + if (block == null) { + return; + } + + lazyBlock.setBlock(coercer.apply(block.getLoadedBlock())); + + // clear reference to loader to free resources, since load was successful + block = null; + } + } + + private static final class RowFilterLazyBlockLoader + implements LazyBlockLoader + { + private final int[] rowsToKeep; + private Block block; + + public RowFilterLazyBlockLoader(Block block, int[] rowsToKeep) + { + this.block = requireNonNull(block, "block is null"); + this.rowsToKeep = requireNonNull(rowsToKeep, "rowsToKeep is null"); + } + + @Override + public void load(LazyBlock lazyBlock) + { + if (block == null) { + return; + } + + lazyBlock.setBlock(block.getPositions(rowsToKeep, 0, rowsToKeep.length)); + + // clear reference to loader to free resources, since load was successful + block = null; + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + RowFilterLazyBlockLoader other = (RowFilterLazyBlockLoader) obj; + return Arrays.equals(this.rowsToKeep, other.rowsToKeep) && + Objects.equals(this.block, other.block); + } + + 
@Override + public int hashCode() + { + return Objects.hash(Arrays.hashCode(rowsToKeep), block); + } + } + + public static class BucketAdapter + { + private final int[] bucketColumns; + private final BucketingVersion bucketingVersion; + private final int bucketToKeep; + private final int tableBucketCount; + private final int partitionBucketCount; // for sanity check only + private final List typeInfoList; + + public BucketAdapter(BucketAdaptation bucketAdaptation) + { + this.bucketColumns = bucketAdaptation.getBucketColumnIndices(); + this.bucketingVersion = bucketAdaptation.getBucketingVersion(); + this.bucketToKeep = bucketAdaptation.getBucketToKeep(); + this.typeInfoList = bucketAdaptation.getBucketColumnHiveTypes().stream() + .map(HiveType::getTypeInfo) + .collect(toImmutableList()); + this.tableBucketCount = bucketAdaptation.getTableBucketCount(); + this.partitionBucketCount = bucketAdaptation.getPartitionBucketCount(); + } + + public IntArrayList computeEligibleRowIds(Page page) + { + IntArrayList ids = new IntArrayList(page.getPositionCount()); + Page bucketColumnsPage = extractColumns(page, bucketColumns); + for (int position = 0; position < page.getPositionCount(); position++) { + int bucket = getHiveBucket(bucketingVersion, tableBucketCount, typeInfoList, bucketColumnsPage, position); + if ((bucket - bucketToKeep) % partitionBucketCount != 0) { + throw new PrestoException(HIVE_INVALID_BUCKET_FILES, format( + "A row that is supposed to be in bucket %s is encountered. Only rows in bucket %s (modulo %s) are expected", + bucket, bucketToKeep % partitionBucketCount, partitionBucketCount)); + } + if (bucket == bucketToKeep) { + ids.add(position); + } + } + return ids; + } + } + + @Override + public boolean needMergingForPages() + { + return isSelectiveRead; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceFactory.java new file mode 100644 index 00000000..f1a9a58c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceFactory.java @@ -0,0 +1,90 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.heuristicindex.SplitMetadata; +import io.prestosql.spi.predicate.TupleDomain; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; + +public interface HivePageSourceFactory +{ + Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + Optional dynamicFilterSupplier, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime); + + default Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + Optional dynamicFilterSupplier, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime, + List partitionKeys, + OptionalInt bucketNumber, + Optional omniDataAddress, + HiveOffloadExpression offloadExpression) + { + return createPageSource( + configuration, + session, + path, + start, + length, + fileSize, + schema, + columns, + effectivePredicate, + dynamicFilterSupplier, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + splitMetadata, + splitCacheable, + dataSourceLastModifiedTime); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceProvider.java new file mode 100644 index 00000000..356926e1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePageSourceProvider.java @@ -0,0 +1,1028 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.huawei.boostkit.omnidata.block.BlockDeserializer; +import com.huawei.boostkit.omnidata.model.Predicate; +import com.huawei.boostkit.omnidata.model.TaskSource; +import com.huawei.boostkit.omnidata.model.datasource.DataSource; +import com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsRecordDataSource; +import com.huawei.boostkit.omnidata.reader.DataReader; +import com.huawei.boostkit.omnidata.reader.DataReaderFactory; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.coercions.HiveCoercer; +import io.prestosql.plugin.hive.omnidata.OmniDataNodeManager; +import io.prestosql.plugin.hive.orc.OrcConcatPageSource; +import io.prestosql.plugin.hive.util.IndexCache; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.FixedPageSource; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.connector.RecordPageSource; +import io.prestosql.spi.dynamicfilter.CombinedDynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.dynamicfilter.FilteredDynamicFilter; +import io.prestosql.spi.function.BuiltInFunctionHandle; +import io.prestosql.spi.function.Signature; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.heuristicindex.SplitMetadata; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.Range; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.predicate.ValueSet; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.eclipse.jetty.util.URIUtil; + +import javax.inject.Inject; + +import java.net.InetAddress; +import java.net.URI; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Set; +import java.util.StringJoiner; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Maps.uniqueIndex; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_CLIENT_TARGET_LIST; +import static 
com.huawei.boostkit.omnidata.OmniDataProperty.HOSTADDRESS_DELIMITER; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.DUMMY_OFFLOADED; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveColumnHandle.MAX_PARTITION_KEY_COLUMN_INDEX; +import static io.prestosql.plugin.hive.HivePageSourceProvider.ColumnMapping.toColumnHandles; +import static io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered; +import static io.prestosql.plugin.hive.coercions.HiveCoercer.createCoercer; +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.META_PARTITION_COLUMNS; +import static io.prestosql.plugin.hive.util.PageSourceUtil.buildPushdownContext; +import static io.prestosql.plugin.hive.util.PageSourceUtil.getSslConfiguredProperties; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS; + +public class HivePageSourceProvider + implements ConnectorPageSourceProvider +{ + private static final Logger log = Logger.get(HivePageSourceProvider.class); + private final HdfsEnvironment hdfsEnvironment; + private final Set cursorProviders; + private final TypeManager typeManager; + + private final Set pageSourceFactories; + + private static final String HIVE_DEFAULT_PARTITION_VALUE = "\\N"; + private final IndexCache indexCache; + private final Set selectivePageSourceFactories; + private final OmniDataNodeManager omniDataNodeManager; + private final ImmutableMap sslPropertyMap; + + @Inject + public HivePageSourceProvider( + OmniDataNodeManager omniDataNodeManager, + HiveConfig hiveConfig, + HdfsEnvironment hdfsEnvironment, + Set cursorProviders, + Set pageSourceFactories, + TypeManager typeManager, + IndexCache indexCache, + Set selectivePageSourceFactories) + { + requireNonNull(hiveConfig, "hiveConfig is null"); + this.omniDataNodeManager = omniDataNodeManager; + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.cursorProviders = ImmutableSet.copyOf(requireNonNull(cursorProviders, "cursorProviders is null")); + this.pageSourceFactories = ImmutableSet.copyOf( + requireNonNull(pageSourceFactories, "pageSourceFactories is null")); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.indexCache = indexCache; + this.selectivePageSourceFactories = selectivePageSourceFactories; + this.sslPropertyMap = getSslConfiguredProperties(hiveConfig); + } + + public HivePageSourceProvider( + HiveConfig hiveConfig, + HdfsEnvironment hdfsEnvironment, + Set cursorProviders, + Set pageSourceFactories, + TypeManager typeManager, + IndexCache indexCache, + Set selectivePageSourceFactories) + { + requireNonNull(hiveConfig, "hiveConfig is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.cursorProviders = ImmutableSet.copyOf(requireNonNull(cursorProviders, "cursorProviders is null")); + this.pageSourceFactories = ImmutableSet.copyOf( + requireNonNull(pageSourceFactories, "pageSourceFactories is null")); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.indexCache = indexCache; + this.selectivePageSourceFactories = selectivePageSourceFactories; + this.omniDataNodeManager = null; + this.sslPropertyMap = getSslConfiguredProperties(hiveConfig); + } + + @Override + public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, 
ConnectorSplit split, ConnectorTableHandle table, + List columns) + { + return createPageSource(transaction, session, split, table, columns, Optional.empty()); + } + + @Override + public ConnectorPageSource createPageSource(ConnectorTransactionHandle transaction, ConnectorSession session, + ConnectorSplit split, ConnectorTableHandle table, List columns, + Optional dynamicFilterSupplier) + { + List> dynamicFilters = null; + if (dynamicFilterSupplier.isPresent()) { + dynamicFilters = dynamicFilterSupplier.get().getDynamicFilters(); + } + + HiveTableHandle hiveTable = (HiveTableHandle) table; + + List hiveColumns = columns.stream() + .map(HiveColumnHandle.class::cast) + .collect(toList()); + + List hiveSplits = (((HiveSplitWrapper) split).getSplits()); + if (hiveSplits.size() == 1) { + HiveSplit hiveSplit = hiveSplits.get(0); + return createPageSourceInternal(session, dynamicFilterSupplier, dynamicFilters, hiveTable, hiveColumns, hiveSplit); + } + List> finalDynamicFilters = dynamicFilters; + List pageSources = hiveSplits.stream() + .map(hiveSplit -> createPageSourceInternal(session, dynamicFilterSupplier, finalDynamicFilters, hiveTable, hiveColumns, hiveSplit)) + .collect(toList()); + return new OrcConcatPageSource(pageSources); + } + + private Optional getSplitOmniDataAddr(HiveOffloadExpression expression, HiveSplit hiveSplit) + { + if (!expression.isPresent()) { + return Optional.empty(); + } + + // empty split + if (hiveSplit.getAddresses().size() == 0) { + return omniDataNodeManager.getAllNodes().isEmpty() ? Optional.empty() : + Optional.of(omniDataNodeManager.getAllNodes().values().stream().findAny().get().getHostAddress()); + } + + StringJoiner hostAddressJoiner = new StringJoiner(HOSTADDRESS_DELIMITER); + int copyNumber = Math.max(1, hiveSplit.getAddresses().size()); + int seed = (int) ((hiveSplit.getStart() / Math.max(1, hiveSplit.getLength()) + hiveSplit.getFileSize()) % copyNumber); + int counter = 0; + for (int i = 0; i < hiveSplit.getAddresses().size(); i++) { + int copyIndex = (i + seed) % copyNumber; + try { + String hostIp = InetAddress.getByName(hiveSplit.getAddresses().get(copyIndex).getHostText()).getHostAddress(); + if (omniDataNodeManager.getAllNodes().containsKey(hostIp)) { + hostAddressJoiner.add(omniDataNodeManager.getAllNodes().get(hostIp).getHostAddress()); + counter++; + } + } + catch (UnknownHostException e) { + log.warn("Get host ip by host name %s fail.", hiveSplit.getAddresses().get(copyIndex).getHostText()); + } + } + + if (counter == 0) { + StringJoiner splitJoiner = new StringJoiner(", "); + hiveSplit.getAddresses().stream().map(entry -> entry.toString()).forEach(splitJoiner::add); + log.warn("Get omniData ip for split[%s] fail, omniDataNodeManager size %d.", splitJoiner.toString(), omniDataNodeManager.getAllNodes().size()); + return Optional.empty(); + } + return Optional.of(hostAddressJoiner.toString()); + } + + private ConnectorPageSource createPageSourceInternal(ConnectorSession session, + Optional dynamicFilterSupplier, + List> dynamicFilters, + HiveTableHandle hiveTable, + List hiveColumns, + HiveSplit hiveSplit) + { + Path path = new Path(hiveSplit.getPath()); + + List> dynamicFilterList = new ArrayList(); + if (dynamicFilters != null) { + for (Map df : dynamicFilters) { + Set values = df.values().stream().collect(Collectors.toSet()); + dynamicFilterList.add(values); + } + } + // Filter out splits using partition values and dynamic filters + if (dynamicFilters != null && !dynamicFilters.isEmpty() && isPartitionFiltered(hiveSplit.getPartitionKeys(), 
dynamicFilterList, typeManager)) { + return new FixedPageSource(ImmutableList.of()); + } + + Configuration configuration = hdfsEnvironment.getConfiguration( + new HdfsEnvironment.HdfsContext(session, hiveSplit.getDatabase(), hiveSplit.getTable()), path); + + Properties schema = hiveSplit.getSchema(); + String columnNameDelimiter = schema.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? schema + .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA); + List partitionColumnNames; + if (schema.containsKey(META_PARTITION_COLUMNS)) { + partitionColumnNames = Arrays.asList(schema.getProperty(META_PARTITION_COLUMNS).split(columnNameDelimiter)); + } + else if (schema.containsKey(META_TABLE_COLUMNS)) { + partitionColumnNames = Arrays.asList(schema.getProperty(META_TABLE_COLUMNS).split(columnNameDelimiter)); + } + else { + partitionColumnNames = new ArrayList<>(); + } + + List tableColumns = hiveColumns.stream().map(cols -> cols.getName()).collect(toList()); + + List missingColumns = tableColumns.stream().filter(cols -> !partitionColumnNames.contains(cols)).collect(toList()); + + List indexes = new ArrayList<>(); + if (indexCache != null && session.isHeuristicIndexFilterEnabled()) { + indexes.addAll(this.indexCache.getIndices(session + .getCatalog().orElse(null), hiveTable + .getSchemaTableName().toString(), hiveSplit, hiveTable.getCompactEffectivePredicate(), + hiveTable.getPartitionColumns())); + + /* Bloom/Bitmap indices are checked for given table and added to the possible matchers for pushdown. */ + if (hiveTable.getDisjunctCompactEffectivePredicate().isPresent() && hiveTable.getDisjunctCompactEffectivePredicate().get().size() > 0) { + hiveTable.getDisjunctCompactEffectivePredicate().get().forEach(orPredicate -> + indexes.addAll(this.indexCache.getIndices(session + .getCatalog().orElse(null), hiveTable + .getSchemaTableName().toString(), hiveSplit, orPredicate, hiveTable + .getPartitionColumns()))); + } + } + Optional> indexOptional = + indexes == null || indexes.isEmpty() ? Optional.empty() : Optional.of(indexes); + URI splitUri = URI.create(URIUtil.encodePath(hiveSplit.getPath())); + SplitMetadata splitMetadata = new SplitMetadata(splitUri.getRawPath(), hiveSplit.getLastModifiedTime()); + + TupleDomain predicate = TupleDomain.all(); + if (dynamicFilterSupplier.isPresent() && dynamicFilters != null && !dynamicFilters.isEmpty()) { + if (dynamicFilters.size() == 1) { + List filteredHiveColumnHandles = hiveColumns.stream().filter(column -> dynamicFilters.get(0).containsKey(column)).collect(toList()); + HiveColumnHandle hiveColumnHandle = filteredHiveColumnHandles.get(0); + Type type = hiveColumnHandle.getColumnMetadata(typeManager).getType(); + predicate = getPredicate(dynamicFilters.get(0).get(hiveColumnHandle), type, hiveColumnHandle); + if (predicate.isNone()) { + predicate = TupleDomain.all(); + } + } + } + + /** + * This is main logical division point to process filter pushdown enabled case (aka as selective read flow). + * If user configuration orc_predicate_pushdown_enabled is true and if all clause of query can be handled by hive + * selective read flow, then hiveTable.isSuitableToPush() will be enabled. + * (Refer HiveMetadata.checkIfSuitableToPush). 
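+         * Otherwise the split falls through to the regular createHivePageSource(...) path below,
+         * which can additionally offload the scan to an OmniData node when offloading is enabled
+         * for the session, the table carries an offload expression, and getSplitOmniDataAddr(...)
+         * resolves a node address for the split.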
+ */ + if (hiveTable.isSuitableToPush()) { + return createSelectivePageSource(selectivePageSourceFactories, configuration, + session, hiveSplit, assignUniqueIndicesToPartitionColumns(hiveColumns), typeManager, + dynamicFilterSupplier, hiveSplit.getDeleteDeltaLocations(), + hiveSplit.getStartRowOffsetOfFile(), + indexOptional, hiveSplit.isCacheable(), + hiveTable.getCompactEffectivePredicate(), + hiveTable.getPredicateColumns(), + hiveTable.getDisjunctCompactEffectivePredicate(), + hiveSplit.getBucketConversion(), + hiveSplit.getBucketNumber(), + hiveSplit.getLastModifiedTime(), + missingColumns); + } + + Optional omniDataAddress = getSplitOmniDataAddr(hiveTable.getOffloadExpression(), hiveSplit); + Optional pageSource = createHivePageSource( + cursorProviders, + pageSourceFactories, + configuration, + session, + path, + hiveSplit.getBucketNumber(), + hiveSplit.getStart(), + hiveSplit.getLength(), + hiveSplit.getFileSize(), + hiveSplit.getSchema(), + hiveTable.getCompactEffectivePredicate().intersect(predicate), + hiveColumns, + hiveSplit.getPartitionKeys(), + typeManager, + hiveSplit.getColumnCoercions(), + hiveSplit.getBucketConversion(), + hiveSplit.isS3SelectPushdownEnabled(), + dynamicFilterSupplier, + hiveSplit.getDeleteDeltaLocations(), + hiveSplit.getStartRowOffsetOfFile(), + indexOptional, + splitMetadata, + hiveSplit.isCacheable(), + hiveSplit.getLastModifiedTime(), + hiveSplit.getCustomSplitInfo(), + missingColumns, + omniDataAddress, + hiveTable.getOffloadExpression(), + sslPropertyMap); + if (pageSource.isPresent()) { + return pageSource.get(); + } + throw new RuntimeException("Could not find a file reader for split " + hiveSplit); + } + + /** + * All partition columns have index as -1, since we are making map of this, we need to assign an unique index. + * @param columns List of partition columns + * @return Modified list of columns with unique index. + */ + private static List assignUniqueIndicesToPartitionColumns(List columns) + { + // Gives a distinct hiveColumnIndex to partitioning columns. Columns are identified by these indices in the rest of the + // selective read path. + ImmutableList.Builder newColumns = ImmutableList.builder(); + int nextIndex = MAX_PARTITION_KEY_COLUMN_INDEX; + for (HiveColumnHandle column : columns) { + if (column.isPartitionKey()) { + newColumns.add(new HiveColumnHandle(column.getName(), column.getHiveType(), column.getTypeSignature(), nextIndex--, column.getColumnType(), column.getComment())); + } + else { + newColumns.add(column); + } + } + return newColumns.build(); + } + + /** + * Create selective page source, which will be used for selective reader flow. + * Unlike normal page source, selective page source required to pass below additional details to reader + * a. Pre-filled values of all constant. + * b. Coercion information of all columns. + * c. Columns which required to be projected. + * d. Total list of columns which will be read (projection + filter). + * All these info gets used by reader. + * @param columns List of all columns being part of scan. + * @param effectivePredicate Predicates related to AND clause + * @param predicateColumns Map of all columns handles being part of predicate + * @param additionPredicates Predicates related to OR clause. + * Remaining columns are same as for createHivePageSource. 
+ * @param missingColumns + * @return + */ + private static ConnectorPageSource createSelectivePageSource( + Set selectivePageSourceFactories, + Configuration configuration, + ConnectorSession session, + HiveSplit split, + List columns, + TypeManager typeManager, + Optional dynamicFilterSupplier, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + boolean splitCacheable, + TupleDomain effectivePredicate, + Map predicateColumns, + Optional>> additionPredicates, + Optional bucketConversion, + OptionalInt bucketNumber, + long dataSourceLastModifiedTime, + List missingColumns) + { + Set interimColumns = ImmutableSet.builder() + .addAll(predicateColumns.values()) + .addAll(bucketConversion.map(HiveSplit.BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of())) + .build(); + + Path path = new Path(split.getPath()); + List columnMappings = ColumnMapping.buildColumnMappings( + split.getPartitionKeys(), + columns, + ImmutableList.copyOf(interimColumns), + split.getColumnCoercions(), + path, + bucketNumber, + true, + missingColumns); + + List regularAndInterimColumnMappings = ColumnMapping.extractRegularAndInterimColumnMappings( + columnMappings); + Optional bucketAdaptation = toBucketAdaptation(bucketConversion, regularAndInterimColumnMappings, bucketNumber); + checkArgument(!bucketAdaptation.isPresent(), "Bucket conversion is not yet supported"); + + // Make a list of all PREFILLED columns, which can be passed to reader. Unlike normal flow, selective read + // flow require to pass this below at reader level as we need to make block of all column values. + Map prefilledValues = columnMappings.stream() + .filter(mapping -> mapping.getKind() == ColumnMappingKind.PREFILLED) + .collect(toImmutableMap(mapping -> mapping.getHiveColumnHandle().getHiveColumnIndex(), ColumnMapping::getPrefilledValue)); + + // Make a map of column required to be coerced. This also needs to be sent to reader level as coercion + // should be applied before adding values in block. 
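+        // For example (illustrative only): a partition whose files still store a column as INT
+        // while the table schema has since been altered to BIGINT gets a coercer here, so the
+        // narrower values are widened before being placed into the output block. The concrete
+        // conversions available are whatever HiveCoercer.createCoercer(...) supports.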
+ Map coercers = columnMappings.stream() + .filter(mapping -> mapping.getCoercionFrom().isPresent()) + .collect(toImmutableMap( + mapping -> mapping.getHiveColumnHandle().getHiveColumnIndex(), + mapping -> createCoercer(typeManager, mapping.getCoercionFrom().get(), mapping.getHiveColumnHandle().getHiveType()))); + + List outputColumns = columns.stream() + .map(HiveColumnHandle::getHiveColumnIndex) + .collect(toImmutableList()); + + for (HiveSelectivePageSourceFactory pageSourceFactory : selectivePageSourceFactories) { + Optional pageSource = pageSourceFactory.createPageSource( + configuration, + session, + path, + split.getStart(), + split.getLength(), + split.getFileSize(), + split.getSchema(), + toColumnHandles(columnMappings, true), + prefilledValues, + outputColumns, + effectivePredicate, + additionPredicates, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + splitCacheable, + columnMappings, + coercers, + dataSourceLastModifiedTime); + if (pageSource.isPresent()) { + return new HivePageSource( + columnMappings, + Optional.empty(), + typeManager, + pageSource.get(), + dynamicFilterSupplier, + session, + split.getPartitionKeys()); + } + } + + throw new IllegalStateException("Could not find a file reader for split " + split); + } + + public static Optional createHivePageSource( + Set cursorProviders, + Set pageSourceFactories, + Configuration configuration, + ConnectorSession session, + Path path, + OptionalInt bucketNumber, + long start, + long length, + long fileSize, + Properties schema, + TupleDomain effectivePredicate, + List hiveColumns, + List partitionKeys, + TypeManager typeManager, + Map columnCoercions, + Optional bucketConversion, + boolean s3SelectPushdownEnabled, + Optional dynamicFilterSupplier, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime, + Map customSplitInfo, + List missingColumns, + Optional omniDataAddress, + HiveOffloadExpression expression, + ImmutableMap sslPropertyMap) + { + List columnMappings = ColumnMapping.buildColumnMappings( + partitionKeys, + hiveColumns, + bucketConversion.map(HiveSplit.BucketConversion::getBucketColumnHandles).orElse(ImmutableList.of()), + columnCoercions, + path, + bucketNumber, + true, + missingColumns); + List regularAndInterimColumnMappings = ColumnMapping.extractRegularAndInterimColumnMappings( + columnMappings); + + Optional bucketAdaptation = toBucketAdaptation(bucketConversion, regularAndInterimColumnMappings, bucketNumber); + + for (HivePageSourceFactory pageSourceFactory : pageSourceFactories) { + Optional pageSource = pageSourceFactory.createPageSource( + configuration, + session, + path, + start, + length, + fileSize, + schema, + toColumnHandles(regularAndInterimColumnMappings, true), + effectivePredicate, + dynamicFilterSupplier, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + splitMetadata, + splitCacheable, + dataSourceLastModifiedTime, + partitionKeys, + bucketNumber, + omniDataAddress, + expression); + if (pageSource.isPresent()) { + return Optional.of( + new HivePageSource( + columnMappings, + bucketAdaptation, + typeManager, + pageSource.get(), + dynamicFilterSupplier, + session, + partitionKeys)); + } + } + + // PushDown txt/orc file to omni-data + if (HiveSessionProperties.isOmniDataEnabled(session) && expression.isPresent()) { + checkArgument(omniDataAddress.isPresent(), "omniDataAddress is empty"); + + Predicate predicate = buildPushdownContext(hiveColumns, 
expression, typeManager, + effectivePredicate, partitionKeys, bucketNumber, path); + ConnectorPageSource pageSource = createPushDownPageSource(path, + start, + length, + fileSize, + predicate, + omniDataAddress.get(), + schema, + sslPropertyMap); + return Optional.of( + new HivePageSource( + columnMappings, + bucketAdaptation, + typeManager, + pageSource, + dynamicFilterSupplier, + session, + partitionKeys)); + } + + for (HiveRecordCursorProvider provider : cursorProviders) { + // GenericHiveRecordCursor will automatically do the coercion without HiveCoercionRecordCursor + boolean doCoercion = !(provider instanceof GenericHiveRecordCursorProvider); + + Optional cursor = provider.createRecordCursor( + configuration, + session, + path, + start, + length, + fileSize, + schema, + toColumnHandles(regularAndInterimColumnMappings, doCoercion), + effectivePredicate, + typeManager, + s3SelectPushdownEnabled, + customSplitInfo); + + if (cursor.isPresent()) { + RecordCursor delegate = cursor.get(); + + checkArgument(!deleteDeltaLocations.isPresent(), "Delete delta is not supported"); + + if (bucketAdaptation.isPresent()) { + delegate = new HiveBucketAdapterRecordCursor( + bucketAdaptation.get().getBucketColumnIndices(), + bucketAdaptation.get().getBucketColumnHiveTypes(), + bucketAdaptation.get().getBucketingVersion(), + bucketAdaptation.get().getTableBucketCount(), + bucketAdaptation.get().getPartitionBucketCount(), + bucketAdaptation.get().getBucketToKeep(), + typeManager, + delegate); + } + + // Need to wrap RcText and RcBinary into a wrapper, which will do the coercion for mismatch columns + if (doCoercion) { + delegate = new HiveCoercionRecordCursor(regularAndInterimColumnMappings, typeManager, delegate); + } + + HiveRecordCursor hiveRecordCursor = new HiveRecordCursor( + columnMappings, + typeManager, + delegate); + List columnTypes = hiveColumns.stream() + .map(input -> typeManager.getType(input.getTypeSignature())) + .collect(toList()); + + return Optional.of(new RecordPageSource(columnTypes, hiveRecordCursor)); + } + } + + return Optional.empty(); + } + + private static ConnectorPageSource createPushDownPageSource( + Path path, + long start, + long length, + long fileSize, + Predicate predicate, + String omniDataServerTarget, + Properties schema, + ImmutableMap sslPropertyMap) + { + AggregatedMemoryContext systemMemoryUsage = AggregatedMemoryContext.newSimpleAggregatedMemoryContext(); + Properties transProperties = new Properties(); + transProperties.put(GRPC_CLIENT_TARGET_LIST, omniDataServerTarget); + transProperties.putAll(sslPropertyMap); + + DataSource pushDownDataSource = new HdfsRecordDataSource(path.toString(), start, length, fileSize, schema); + + TaskSource readTaskInfo = new TaskSource( + pushDownDataSource, + predicate, + TaskSource.ONE_MEGABYTES); + DataReader dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new BlockDeserializer()); + + return new HivePushDownRecordPageSource(dataReader, systemMemoryUsage); + } + + public static Optional toBucketAdaptation(Optional bucketConversion, + List regularAndInterimColumnMappings, + OptionalInt bucketNumber) + { + return bucketConversion.map(conversion -> { + Map hiveIndexToBlockIndex = uniqueIndex(regularAndInterimColumnMappings, + columnMapping -> columnMapping.getHiveColumnHandle().getHiveColumnIndex()); + int[] bucketColumnIndices = conversion.getBucketColumnHandles().stream() + .mapToInt(columnHandle -> hiveIndexToBlockIndex.get(columnHandle.getHiveColumnIndex()).getIndex()) + .toArray(); + List 
bucketColumnHiveTypes = conversion.getBucketColumnHandles().stream() + .map(columnHandle -> hiveIndexToBlockIndex.get( + columnHandle.getHiveColumnIndex()).getHiveColumnHandle().getHiveType()) + .collect(toImmutableList()); + return new BucketAdaptation( + bucketColumnIndices, + bucketColumnHiveTypes, + conversion.getBucketingVersion(), + conversion.getTableBucketCount(), + conversion.getPartitionBucketCount(), + bucketNumber.getAsInt()); + }); + } + + public static class ColumnMapping + { + private final ColumnMappingKind kind; + private final HiveColumnHandle hiveColumnHandle; + private final Optional prefilledValue; + /** + * ordinal of this column in the underlying page source or record cursor + */ + private final OptionalInt index; + private final Optional coercionFrom; + + public static ColumnMapping regular(HiveColumnHandle hiveColumnHandle, int index, Optional coerceFrom) + { + checkArgument(hiveColumnHandle.getColumnType() == HiveColumnHandle.ColumnType.REGULAR); + return new ColumnMapping(ColumnMappingKind.REGULAR, hiveColumnHandle, Optional.empty(), + OptionalInt.of(index), coerceFrom); + } + + public static ColumnMapping synthesized(HiveColumnHandle hiveColumnHandle, int index, + Optional coerceFrom) + { + checkArgument(hiveColumnHandle.getColumnType() == HiveColumnHandle.ColumnType.SYNTHESIZED); + return new ColumnMapping(ColumnMappingKind.REGULAR, hiveColumnHandle, Optional.empty(), + OptionalInt.of(index), coerceFrom); + } + + public static ColumnMapping prefilled(HiveColumnHandle hiveColumnHandle, String prefilledValue, + Optional coerceFrom) + { + checkArgument( + hiveColumnHandle.getColumnType() == HiveColumnHandle.ColumnType.PARTITION_KEY || hiveColumnHandle.getColumnType() == HiveColumnHandle.ColumnType.SYNTHESIZED); + return new ColumnMapping(ColumnMappingKind.PREFILLED, hiveColumnHandle, Optional.of(prefilledValue), + OptionalInt.empty(), coerceFrom); + } + + public static ColumnMapping transaction(HiveColumnHandle hiveColumnHandle, int index, + Optional coerceFrom) + { + checkArgument(hiveColumnHandle.getColumnType() == HiveColumnHandle.ColumnType.SYNTHESIZED); + return new ColumnMapping(ColumnMappingKind.TRANSACTIONID, hiveColumnHandle, Optional.empty(), + OptionalInt.of(index), coerceFrom); + } + + public static ColumnMapping interim(HiveColumnHandle hiveColumnHandle, int index) + { + return new ColumnMapping(ColumnMappingKind.INTERIM, hiveColumnHandle, Optional.empty(), + OptionalInt.of(index), Optional.empty()); + } + + private ColumnMapping(ColumnMappingKind kind, HiveColumnHandle hiveColumnHandle, + Optional prefilledValue, OptionalInt index, Optional coerceFrom) + { + this.kind = requireNonNull(kind, "kind is null"); + this.hiveColumnHandle = requireNonNull(hiveColumnHandle, "hiveColumnHandle is null"); + this.prefilledValue = requireNonNull(prefilledValue, "prefilledValue is null"); + this.index = requireNonNull(index, "index is null"); + this.coercionFrom = requireNonNull(coerceFrom, "coerceFrom is null"); + } + + public ColumnMappingKind getKind() + { + return kind; + } + + public String getPrefilledValue() + { + checkState(kind == ColumnMappingKind.PREFILLED); + return prefilledValue.isPresent() ? 
prefilledValue.get() : HIVE_DEFAULT_PARTITION_VALUE; + } + + public HiveColumnHandle getHiveColumnHandle() + { + return hiveColumnHandle; + } + + public int getIndex() + { + checkState( + kind == ColumnMappingKind.REGULAR || kind == ColumnMappingKind.INTERIM || kind == ColumnMappingKind.TRANSACTIONID); + return index.getAsInt(); + } + + public Optional getCoercionFrom() + { + return coercionFrom; + } + + public static ColumnMapping aggregated(HiveColumnHandle hiveColumnHandle, int index) + { + checkArgument(hiveColumnHandle.getColumnType() == DUMMY_OFFLOADED); + // Pretend that it is a regular column so that the split manager can process it as normal + return new ColumnMapping(ColumnMappingKind.REGULAR, hiveColumnHandle, Optional.empty(), OptionalInt.of(index), Optional.empty()); + } + + /** + * @param columns columns that need to be returned to engine + * @param requiredInterimColumns columns that are needed for processing, but shouldn't be returned to engine (may overlaps with columns) + * @param columnCoercions map from hive column index to hive type + * @param bucketNumber empty if table is not bucketed, a number within [0, # bucket in table) otherwise + * @param missingColumns + */ + public static List buildColumnMappings( + List partitionKeys, + List columns, + List requiredInterimColumns, + Map columnCoercions, + Path path, + OptionalInt bucketNumber, + boolean filterPushDown, + List missingColumns) + { + Map partitionKeysByName = uniqueIndex(partitionKeys, HivePartitionKey::getName); + int regularIndex = 0; + Set regularColumnIndices = new HashSet<>(); + ImmutableList.Builder columnMappings = ImmutableList.builder(); + for (HiveColumnHandle column : columns) { + Optional coercionFrom = Optional.ofNullable(columnCoercions.get(column.getHiveColumnIndex())); + if (column.getColumnType() == REGULAR) { + if (missingColumns.contains(column.getColumnName())) { + columnMappings.add(new ColumnMapping(ColumnMappingKind.PREFILLED, column, Optional.empty(), + OptionalInt.empty(), coercionFrom)); + continue; + } + checkArgument(regularColumnIndices.add(column.getHiveColumnIndex()), "duplicate hiveColumnIndex in columns list"); + + columnMappings.add(regular(column, regularIndex, coercionFrom)); + regularIndex++; + } + else if (column.getColumnType() == DUMMY_OFFLOADED) { + columnMappings.add(aggregated(column, regularIndex)); + regularIndex++; + } + else if (HiveColumnHandle.isUpdateColumnHandle(column)) { + columnMappings.add(transaction(column, regularIndex, coercionFrom)); + regularIndex++; + } + else { + columnMappings.add(prefilled( + column, + HiveUtil.getPrefilledColumnValue(column, partitionKeysByName.get(column.getName()), path, + bucketNumber), + coercionFrom)); + } + } + for (HiveColumnHandle column : requiredInterimColumns) { + checkArgument(column.getColumnType() == REGULAR || filterPushDown); + if (regularColumnIndices.contains(column.getHiveColumnIndex())) { + continue; // This column exists in columns. Do not add it again. + } + // If coercion does not affect bucket number calculation, coercion doesn't need to be applied here. + // Otherwise, read of this partition should not be allowed. + // (Alternatively, the partition could be read as an unbucketed partition. This is not implemented.) 
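+            // Illustrative sketch (hypothetical column names): for a table partitioned by ds with
+            // projected columns (a, b, ds) and a predicate on c only, the resulting mappings are
+            // roughly [REGULAR a -> 0, REGULAR b -> 1, PREFILLED ds, INTERIM c -> 2]. REGULAR and
+            // INTERIM indices address positions in the underlying page source, while PREFILLED
+            // columns are filled from the partition key value instead of being read from the file.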
+ columnMappings.add(interim(column, regularIndex)); + regularIndex++; + } + return columnMappings.build(); + } + + public static List extractRegularAndInterimColumnMappings(List columnMappings) + { + return columnMappings.stream() + .filter(columnMapping -> columnMapping.getKind() == ColumnMappingKind.REGULAR || columnMapping.getKind() == ColumnMappingKind.INTERIM || columnMapping.getKind() == ColumnMappingKind.TRANSACTIONID) + .collect(toImmutableList()); + } + + public static List toColumnHandles(List regularColumnMappings, + boolean doCoercion) + { + return regularColumnMappings.stream() + .map(columnMapping -> { + HiveColumnHandle columnHandle = columnMapping.getHiveColumnHandle(); + if (!doCoercion || !columnMapping.getCoercionFrom().isPresent()) { + return columnHandle; + } + return new HiveColumnHandle( + columnHandle.getName(), + columnMapping.getCoercionFrom().get(), + columnMapping.getCoercionFrom().get().getTypeSignature(), + columnHandle.getHiveColumnIndex(), + columnHandle.getColumnType(), + Optional.empty()); + }) + .collect(toList()); + } + } + + public static Object getValue(Type type, String partitionValue) + { + Class javaType = type.getJavaType(); + + if (javaType == long.class) { + return Long.valueOf(partitionValue); + } + if (javaType == double.class) { + return Double.valueOf(partitionValue); + } + if (javaType == boolean.class) { + return Boolean.valueOf(partitionValue); + } + if (javaType == Slice.class) { + return Slices.utf8Slice(partitionValue); + } + return partitionValue; + } + + protected static Domain modifyDomain(Domain domain, Optional filter) + { + Range range = domain.getValues().getRanges().getSpan(); + if (filter.isPresent() && filter.get() instanceof CallExpression) { + CallExpression call = (CallExpression) filter.get(); + BuiltInFunctionHandle builtInFunctionHandle = (BuiltInFunctionHandle) call.getFunctionHandle(); + String name = builtInFunctionHandle.getSignature().getNameSuffix(); + if (name.contains("$operator$") && Signature.unmangleOperator(name).isComparisonOperator()) { + switch (Signature.unmangleOperator(name)) { + case LESS_THAN: + range = Range.lessThan(domain.getType(), range.getHigh().getValue()); + break; + case GREATER_THAN: + range = Range.greaterThan(domain.getType(), range.getLow().getValue()); + break; + case LESS_THAN_OR_EQUAL: + range = Range.lessThanOrEqual(domain.getType(), range.getHigh().getValue()); + break; + case GREATER_THAN_OR_EQUAL: + range = Range.greaterThanOrEqual(domain.getType(), range.getLow().getValue()); + break; + default: + return domain; + } + domain = Domain.create(ValueSet.ofRanges(range), false); + } + } + return domain; + } + + private static TupleDomain getPredicate(DynamicFilter dynamicFilter, Type type, HiveColumnHandle hiveColumnHandle) + { + if (dynamicFilter instanceof CombinedDynamicFilter) { + List filters = ((CombinedDynamicFilter) dynamicFilter).getFilters(); + List> predicates = filters.stream().map(filter -> getPredicate(filter, type, hiveColumnHandle)).collect(toList()); + return predicates.stream().reduce(TupleDomain.all(), TupleDomain::intersect); + } + if (dynamicFilter instanceof FilteredDynamicFilter && !((FilteredDynamicFilter) dynamicFilter).getSetValues().isEmpty()) { + Domain domain = Domain.create(ValueSet.copyOf(type, ((FilteredDynamicFilter) dynamicFilter).getSetValues()), false); + domain = modifyDomain(domain, ((FilteredDynamicFilter) dynamicFilter).getFilterExpression()); + return TupleDomain.withColumnDomains(ImmutableMap.of(hiveColumnHandle, domain)); + } + return 
TupleDomain.all(); + } + + public enum ColumnMappingKind + { + REGULAR, + PREFILLED, + INTERIM, + TRANSACTIONID + } + + public static class BucketAdaptation + { + private final int[] bucketColumnIndices; + private final List bucketColumnHiveTypes; + private final BucketingVersion bucketingVersion; + private final int tableBucketCount; + private final int partitionBucketCount; + private final int bucketToKeep; + + public BucketAdaptation( + int[] bucketColumnIndices, + List bucketColumnHiveTypes, + BucketingVersion bucketingVersion, + int tableBucketCount, + int partitionBucketCount, + int bucketToKeep) + { + this.bucketColumnIndices = bucketColumnIndices; + this.bucketColumnHiveTypes = bucketColumnHiveTypes; + this.bucketingVersion = bucketingVersion; + this.tableBucketCount = tableBucketCount; + this.partitionBucketCount = partitionBucketCount; + this.bucketToKeep = bucketToKeep; + } + + public int[] getBucketColumnIndices() + { + return bucketColumnIndices; + } + + public List getBucketColumnHiveTypes() + { + return bucketColumnHiveTypes; + } + + public BucketingVersion getBucketingVersion() + { + return bucketingVersion; + } + + public int getTableBucketCount() + { + return tableBucketCount; + } + + public int getPartitionBucketCount() + { + return partitionBucketCount; + } + + public int getBucketToKeep() + { + return bucketToKeep; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartition.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartition.java new file mode 100644 index 00000000..ecfed49f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartition.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.predicate.NullableValue; + +import java.util.Map; +import java.util.Objects; + +import static java.util.Objects.requireNonNull; + +public class HivePartition +{ + public static final String UNPARTITIONED_ID = ""; + + private final SchemaTableName tableName; + private final String partitionId; + private final Map keys; + + public HivePartition(SchemaTableName tableName) + { + this(tableName, UNPARTITIONED_ID, ImmutableMap.of()); + } + + public HivePartition( + SchemaTableName tableName, + String partitionId, + Map keys) + { + this.tableName = requireNonNull(tableName, "tableName is null"); + this.partitionId = requireNonNull(partitionId, "partitionId is null"); + this.keys = ImmutableMap.copyOf(requireNonNull(keys, "keys is null")); + } + + public SchemaTableName getTableName() + { + return tableName; + } + + public String getPartitionId() + { + return partitionId; + } + + public Map getKeys() + { + return keys; + } + + @Override + public int hashCode() + { + return Objects.hash(partitionId); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + HivePartition other = (HivePartition) obj; + return Objects.equals(this.partitionId, other.partitionId); + } + + @Override + public String toString() + { + return tableName + ":" + partitionId; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionHandle.java new file mode 100644 index 00000000..83257b9a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionHandle.java @@ -0,0 +1,62 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.ConnectorPartitionHandle; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; + +public class HivePartitionHandle + extends ConnectorPartitionHandle +{ + private final int bucket; + + public HivePartitionHandle(int bucket) + { + this.bucket = bucket; + } + + public int getBucket() + { + return bucket; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HivePartitionHandle that = (HivePartitionHandle) o; + return bucket == that.bucket; + } + + @Override + public int hashCode() + { + return Objects.hash(bucket); + } + + @Override + public String toString() + { + return toStringHelper(this) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionKey.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionKey.java new file mode 100644 index 00000000..de400ddf --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionKey.java @@ -0,0 +1,91 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.openjdk.jol.info.ClassLayout; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public final class HivePartitionKey +{ + private static final int INSTANCE_SIZE = ClassLayout.parseClass(HivePartitionKey.class).instanceSize() + + ClassLayout.parseClass(String.class).instanceSize() * 2; + + public static final String HIVE_DEFAULT_DYNAMIC_PARTITION = "__HIVE_DEFAULT_PARTITION__"; + private final String name; + private final String value; + + @JsonCreator + public HivePartitionKey( + @JsonProperty("name") String name, + @JsonProperty("value") String value) + { + requireNonNull(name, "name is null"); + requireNonNull(value, "value is null"); + + this.name = name; + this.value = value.equals(HIVE_DEFAULT_DYNAMIC_PARTITION) ? 
"\\N" : value; + } + + @JsonProperty + public String getName() + { + return name; + } + + @JsonProperty + public String getValue() + { + return value; + } + + public int getEstimatedSizeInBytes() + { + return INSTANCE_SIZE + name.length() * Character.BYTES + value.length() * Character.BYTES; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("name", name) + .add("value", value) + .toString(); + } + + @Override + public int hashCode() + { + return Objects.hash(name, value); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + HivePartitionKey other = (HivePartitionKey) obj; + return Objects.equals(this.name, other.name) && + Objects.equals(this.value, other.value); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionManager.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionManager.java new file mode 100644 index 00000000..47b2b53e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionManager.java @@ -0,0 +1,399 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Predicates; +import com.google.common.base.VerifyException; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import io.airlift.slice.Slice; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.Constraint; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.predicate.ValueSet; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.BooleanType; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DateType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Decimals; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.IntegerType; +import io.prestosql.spi.type.RealType; +import io.prestosql.spi.type.SmallintType; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.TinyintType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.hive.common.FileUtils; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.ISODateTimeFormat; + +import javax.inject.Inject; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Predicates.not; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveUtil.parsePartitionValue; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.predicate.TupleDomain.all; +import static io.prestosql.spi.predicate.TupleDomain.none; +import static io.prestosql.spi.type.Chars.padSpaces; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class HivePartitionManager +{ + private static final String PARTITION_VALUE_WILDCARD = ""; + + private final int maxPartitions; + private final boolean assumeCanonicalPartitionKeys; + private final int domainCompactionThreshold; + private final TypeManager typeManager; + + @Inject + public HivePartitionManager( + TypeManager typeManager, + HiveConfig hiveConfig) + { + this( + typeManager, + hiveConfig.getMaxPartitionsPerScan(), + hiveConfig.isAssumeCanonicalPartitionKeys(), + hiveConfig.getDomainCompactionThreshold()); + } + + public HivePartitionManager( + TypeManager typeManager, + int maxPartitions, + boolean assumeCanonicalPartitionKeys, + int domainCompactionThreshold) + { + checkArgument(maxPartitions >= 1, "maxPartitions must be at least 1"); + this.maxPartitions = 
maxPartitions; + this.assumeCanonicalPartitionKeys = assumeCanonicalPartitionKeys; + checkArgument(domainCompactionThreshold >= 1, "domainCompactionThreshold must be at least 1"); + this.domainCompactionThreshold = domainCompactionThreshold; + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, HiveIdentity identity, ConnectorTableHandle tableHandle, Constraint constraint, Table table) + { + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + TupleDomain effectivePredicate = constraint.getSummary() + .intersect(hiveTableHandle.getEnforcedConstraint()); + + SchemaTableName tableName = hiveTableHandle.getSchemaTableName(); + Optional hiveBucketHandle = hiveTableHandle.getBucketHandle(); + List partitionColumns = hiveTableHandle.getPartitionColumns(); + + if (effectivePredicate.isNone()) { + return new HivePartitionResult(partitionColumns, ImmutableList.of(), none(), none(), none(), hiveBucketHandle, Optional.empty()); + } + + Optional bucketFilter = HiveBucketing.getHiveBucketFilter(table, effectivePredicate); + TupleDomain compactEffectivePredicate = toCompactTupleDomain(effectivePredicate, domainCompactionThreshold); + + if (partitionColumns.isEmpty()) { + return new HivePartitionResult( + partitionColumns, + ImmutableList.of(new HivePartition(tableName)), + compactEffectivePredicate, + effectivePredicate, + all(), + hiveBucketHandle, + bucketFilter); + } + + List partitionTypes = partitionColumns.stream() + .map(column -> typeManager.getType(column.getTypeSignature())) + .collect(toList()); + + Iterable partitionsIterable; + Predicate> predicate = constraint.predicate().orElse(value -> true); + if (hiveTableHandle.getPartitions().isPresent()) { + partitionsIterable = hiveTableHandle.getPartitions().get().stream() + .filter(partition -> partitionMatches(partitionColumns, effectivePredicate, predicate, partition)) + .collect(toImmutableList()); + } + else { + List partitionNames = getFilteredPartitionNames(metastore, identity, tableName, partitionColumns, effectivePredicate, table); + partitionsIterable = () -> partitionNames.stream() + // Apply extra filters which could not be done by getFilteredPartitionNames + .map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns, partitionTypes, effectivePredicate, predicate)) + .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + } + + // All partition key domains will be fully evaluated, so we don't need to include those + TupleDomain remainingTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), not(Predicates.in(partitionColumns)))); + TupleDomain enforcedTupleDomain = TupleDomain.withColumnDomains(Maps.filterKeys(effectivePredicate.getDomains().get(), Predicates.in(partitionColumns))); + return new HivePartitionResult(partitionColumns, partitionsIterable, compactEffectivePredicate, remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle, bucketFilter); + } + + public HivePartitionResult getPartitions(ConnectorTableHandle tableHandle, List> partitionValuesList) + { + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + SchemaTableName tableName = hiveTableHandle.getSchemaTableName(); + List partitionColumns = hiveTableHandle.getPartitionColumns(); + Optional bucketHandle = hiveTableHandle.getBucketHandle(); + + List partitionColumnNames = partitionColumns.stream() + .map(HiveColumnHandle::getName) + 
.collect(toImmutableList()); + + List partitionColumnTypes = partitionColumns.stream() + .map(column -> typeManager.getType(column.getTypeSignature())) + .collect(toImmutableList()); + + List partitionList = partitionValuesList.stream() + .map(partitionValues -> MetastoreUtil.toPartitionName(partitionColumnNames, partitionValues)) + .map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns, partitionColumnTypes, TupleDomain.all(), value -> true)) + .map(partition -> partition.orElseThrow(() -> new VerifyException("partition must exist"))) + .collect(toImmutableList()); + + return new HivePartitionResult(partitionColumns, partitionList, all(), all(), all(), bucketHandle, Optional.empty()); + } + + public List getPartitionsAsList(HivePartitionResult partitionResult) + { + ImmutableList.Builder partitionList = ImmutableList.builder(); + int count = 0; + Iterator iterator = partitionResult.getPartitions(); + while (iterator.hasNext()) { + HivePartition partition = iterator.next(); + if (count == maxPartitions) { + throw new PrestoException(HiveErrorCode.HIVE_EXCEEDED_PARTITION_LIMIT, format( + "Query over table '%s' can potentially read more than %s partitions", + partition.getTableName(), + maxPartitions)); + } + partitionList.add(partition); + count++; + } + return partitionList.build(); + } + + public HiveTableHandle applyPartitionResult(HiveTableHandle handle, HivePartitionResult partitions) + { + return new HiveTableHandle( + handle.getSchemaName(), + handle.getTableName(), + handle.getTableParameters(), + ImmutableList.copyOf(partitions.getPartitionColumns()), + Optional.of(getPartitionsAsList(partitions)), + partitions.getCompactEffectivePredicate(), + partitions.getEnforcedConstraint(), + partitions.getBucketHandle(), + partitions.getBucketFilter(), + handle.getAnalyzePartitionValues(), + handle.getPredicateColumns(), + handle.getDisjunctCompactEffectivePredicate(), + handle.isSuitableToPush(), + handle.getOffloadExpression()); + } + + public List getOrLoadPartitions(ConnectorSession session, SemiTransactionalHiveMetastore metastore, HiveIdentity identity, HiveTableHandle tableHandle) + { + SchemaTableName tableName = tableHandle.getSchemaTableName(); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + return tableHandle.getPartitions().orElseGet(() -> + getPartitionsAsList(getPartitions(metastore, identity, tableHandle, new Constraint(tableHandle.getEnforcedConstraint()), table))); + } + + private static TupleDomain toCompactTupleDomain(TupleDomain effectivePredicate, int threshold) + { + ImmutableMap.Builder builder = ImmutableMap.builder(); + effectivePredicate.getDomains().ifPresent(domains -> { + for (Map.Entry entry : domains.entrySet()) { + HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) entry.getKey(); + + ValueSet values = entry.getValue().getValues(); + ValueSet compactValueSet = values.getValuesProcessor().>transform( + ranges -> ranges.getRangeCount() > threshold ? Optional.of(ValueSet.ofRanges(ranges.getSpan())) : Optional.empty(), + discreteValues -> discreteValues.getValues().size() > threshold ? 
Optional.of(ValueSet.all(values.getType())) : Optional.empty(), + allOrNone -> Optional.empty()) + .orElse(values); + builder.put(hiveColumnHandle, Domain.create(compactValueSet, entry.getValue().isNullAllowed())); + } + }); + return TupleDomain.withColumnDomains(builder.build()); + } + + private Optional parseValuesAndFilterPartition( + SchemaTableName tableName, + String partitionId, + List partitionColumns, + List partitionColumnTypes, + TupleDomain constraintSummary, + Predicate> constraint) + { + HivePartition partition = parsePartition(tableName, partitionId, partitionColumns, partitionColumnTypes); + + if (partitionMatches(partitionColumns, constraintSummary, constraint, partition)) { + return Optional.of(partition); + } + return Optional.empty(); + } + + private boolean partitionMatches(List partitionColumns, TupleDomain constraintSummary, Predicate> constraint, HivePartition partition) + { + Map domains = constraintSummary.getDomains().get(); + for (HiveColumnHandle column : partitionColumns) { + NullableValue value = partition.getKeys().get(column); + Domain allowedDomain = domains.get(column); + if (allowedDomain != null && !allowedDomain.includesNullableValue(value.getValue())) { + return false; + } + } + + return constraint.test(partition.getKeys()); + } + + private List getFilteredPartitionNames(SemiTransactionalHiveMetastore metastore, HiveIdentity identity, SchemaTableName tableName, List partitionKeys, TupleDomain effectivePredicate, Table table) + { + checkArgument(effectivePredicate.getDomains().isPresent()); + + List filter = new ArrayList<>(); + for (HiveColumnHandle partitionKey : partitionKeys) { + Domain domain = effectivePredicate.getDomains().get().get(partitionKey); + if (domain != null && domain.isNullableSingleValue()) { + Object value = domain.getNullableSingleValue(); + Type type = domain.getType(); + if (value == null) { + filter.add(HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION); + } + else if (type instanceof CharType) { + Slice slice = (Slice) value; + filter.add(padSpaces(slice, (CharType) type).toStringUtf8()); + } + else if (type instanceof VarcharType) { + Slice slice = (Slice) value; + filter.add(slice.toStringUtf8()); + } + // Types above this have only a single possible representation for each value. + // Types below this may have multiple representations for a single value. For + // example, a boolean column may represent the false value as "0", "false" or "False". + // The metastore distinguishes between these representations, so we cannot prune partitions + // unless we know that all partition values use the canonical Java representation. 
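+ // When canonical representations cannot be assumed, a wildcard is sent for this key instead of a
+ // literal value, so the metastore returns all partitions for it and the exact comparison is
+ // re-applied later in parseValuesAndFilterPartition.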
+ else if (!assumeCanonicalPartitionKeys) { + filter.add(PARTITION_VALUE_WILDCARD); + } + else if (type instanceof DecimalType && !((DecimalType) type).isShort()) { + Slice slice = (Slice) value; + filter.add(Decimals.toString(slice, ((DecimalType) type).getScale())); + } + else if (type instanceof DecimalType && ((DecimalType) type).isShort()) { + filter.add(Decimals.toString((long) value, ((DecimalType) type).getScale())); + } + else if (type instanceof DateType) { + DateTimeFormatter dateTimeFormatter = ISODateTimeFormat.date().withZoneUTC(); + filter.add(dateTimeFormatter.print(TimeUnit.DAYS.toMillis((long) value))); + } + else if (type instanceof TimestampType) { + // we don't have time zone info, so just add a wildcard + filter.add(PARTITION_VALUE_WILDCARD); + } + else if (type instanceof TinyintType + || type instanceof SmallintType + || type instanceof IntegerType + || type instanceof BigintType + || type instanceof DoubleType + || type instanceof RealType + || type instanceof BooleanType) { + filter.add(value.toString()); + } + else { + throw new PrestoException(NOT_SUPPORTED, format("Unsupported partition key type: %s", type.getDisplayName())); + } + } + else { + filter.add(PARTITION_VALUE_WILDCARD); + } + } + + // fetch the partition names + return metastore.getPartitionNamesByParts(identity, tableName.getSchemaName(), tableName.getTableName(), filter, table) + .orElseThrow(() -> new TableNotFoundException(tableName)); + } + + public static HivePartition parsePartition( + SchemaTableName tableName, + String partitionName, + List partitionColumns, + List partitionColumnTypes) + { + List partitionValues = extractPartitionValues(partitionName); + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (int i = 0; i < partitionColumns.size(); i++) { + HiveColumnHandle column = partitionColumns.get(i); + NullableValue parsedValue = parsePartitionValue(partitionName, partitionValues.get(i), partitionColumnTypes.get(i)); + builder.put(column, parsedValue); + } + Map values = builder.build(); + return new HivePartition(tableName, partitionName, values); + } + + public static List extractPartitionValues(String partitionName) + { + ImmutableList.Builder values = ImmutableList.builder(); + + boolean inKey = true; + int valueStart = -1; + for (int i = 0; i < partitionName.length(); i++) { + char current = partitionName.charAt(i); + if (inKey) { + checkArgument(current != '/', "Invalid partition spec: %s", partitionName); + if (current == '=') { + inKey = false; + valueStart = i + 1; + } + } + else if (current == '/') { + checkArgument(valueStart != -1, "Invalid partition spec: %s", partitionName); + values.add(FileUtils.unescapePathName(partitionName.substring(valueStart, i))); + inKey = true; + valueStart = -1; + } + } + checkArgument(!inKey, "Invalid partition spec: %s", partitionName); + values.add(FileUtils.unescapePathName(partitionName.substring(valueStart, partitionName.length()))); + + return values.build(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionMetadata.java new file mode 100644 index 00000000..2bdb2226 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionMetadata.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with 
the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.metastore.Partition; + +import java.util.Map; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public class HivePartitionMetadata +{ + private final Optional partition; + private final HivePartition hivePartition; + private final Map columnCoercions; + + HivePartitionMetadata( + HivePartition hivePartition, + Optional partition, + Map columnCoercions) + { + this.partition = requireNonNull(partition, "partition is null"); + this.hivePartition = requireNonNull(hivePartition, "hivePartition is null"); + this.columnCoercions = requireNonNull(columnCoercions, "columnCoercions is null"); + } + + public HivePartition getHivePartition() + { + return hivePartition; + } + + /** + * @return empty if this HivePartitionMetadata represents an unpartitioned table + */ + public Optional getPartition() + { + return partition; + } + + public Map getColumnCoercions() + { + return columnCoercions; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionResult.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionResult.java new file mode 100644 index 00000000..b8a49089 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitionResult.java @@ -0,0 +1,95 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.predicate.TupleDomain; + +import java.util.Iterator; +import java.util.List; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +/** + * Result of fetching Partitions in the HivePartitionManager interface. + *
+ * <p>
+ * Results are comprised of two parts: + * 1) The actual partitions + * 2) The TupleDomain that represents the values that the connector was not able to pre-evaluate + * when generating the partitions and will need to be double-checked by the final execution plan. + */ +public class HivePartitionResult +{ + private final List partitionColumns; + private final Iterable partitions; + private final TupleDomain compactEffectivePredicate; + private final TupleDomain unenforcedConstraint; + private final TupleDomain enforcedConstraint; + private final Optional bucketHandle; + private final Optional bucketFilter; + + public HivePartitionResult( + List partitionColumns, + Iterable partitions, + TupleDomain compactEffectivePredicate, + TupleDomain unenforcedConstraint, + TupleDomain enforcedConstraint, + Optional bucketHandle, + Optional bucketFilter) + { + this.partitionColumns = requireNonNull(partitionColumns, "partitionColumns is null"); + this.partitions = requireNonNull(partitions, "partitions is null"); + this.compactEffectivePredicate = requireNonNull(compactEffectivePredicate, "compactEffectivePredicate is null"); + this.unenforcedConstraint = requireNonNull(unenforcedConstraint, "unenforcedConstraint is null"); + this.enforcedConstraint = requireNonNull(enforcedConstraint, "enforcedConstraint is null"); + this.bucketHandle = requireNonNull(bucketHandle, "bucketHandle is null"); + this.bucketFilter = requireNonNull(bucketFilter, "bucketFilter is null"); + } + + public List getPartitionColumns() + { + return partitionColumns; + } + + public Iterator getPartitions() + { + return partitions.iterator(); + } + + public TupleDomain getCompactEffectivePredicate() + { + return compactEffectivePredicate; + } + + public TupleDomain getUnenforcedConstraint() + { + return unenforcedConstraint; + } + + public TupleDomain getEnforcedConstraint() + { + return enforcedConstraint; + } + + public Optional getBucketHandle() + { + return bucketHandle; + } + + public Optional getBucketFilter() + { + return bucketFilter; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitioningHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitioningHandle.java new file mode 100644 index 00000000..97d975fe --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePartitioningHandle.java @@ -0,0 +1,116 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.spi.connector.ConnectorPartitioningHandle; + +import java.util.List; +import java.util.Objects; +import java.util.OptionalInt; + +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HivePartitioningHandle + implements ConnectorPartitioningHandle +{ + private final BucketingVersion bucketingVersion; + private final int bucketCount; + private final List hiveTypes; + private final OptionalInt maxCompatibleBucketCount; + private final boolean forUpdateOrDelete; + + public HivePartitioningHandle( + BucketingVersion bucketingVersion, + int bucketCount, + List hiveTypes, + OptionalInt maxCompatibleBucketCount) + { + this(bucketingVersion, bucketCount, hiveTypes, maxCompatibleBucketCount, false); + } + + @JsonCreator + public HivePartitioningHandle( + @JsonProperty("bucketingVersion") BucketingVersion bucketingVersion, + @JsonProperty("bucketCount") int bucketCount, + @JsonProperty("hiveTypes") List hiveTypes, + @JsonProperty("maxCompatibleBucketCount") OptionalInt maxCompatibleBucketCount, + @JsonProperty("forUpdate") boolean forUpdateOrDelete) + { + this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); + this.bucketCount = bucketCount; + this.hiveTypes = requireNonNull(hiveTypes, "hiveTypes is null"); + this.maxCompatibleBucketCount = maxCompatibleBucketCount; + this.forUpdateOrDelete = forUpdateOrDelete; + } + + @JsonProperty + public BucketingVersion getBucketingVersion() + { + return bucketingVersion; + } + + @JsonProperty + public int getBucketCount() + { + return bucketCount; + } + + @JsonProperty + public List getHiveTypes() + { + return hiveTypes; + } + + @JsonProperty + public OptionalInt getMaxCompatibleBucketCount() + { + return maxCompatibleBucketCount; + } + + @JsonProperty + public boolean isForUpdateOrDelete() + { + return forUpdateOrDelete; + } + + @Override + public String toString() + { + return format("buckets=%s, hiveTypes=%s", bucketCount, hiveTypes); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HivePartitioningHandle that = (HivePartitioningHandle) o; + return bucketCount == that.bucketCount && + Objects.equals(hiveTypes, that.hiveTypes); + } + + @Override + public int hashCode() + { + return Objects.hash(bucketCount, hiveTypes); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePlugin.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePlugin.java new file mode 100644 index 00000000..c449a52a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePlugin.java @@ -0,0 +1,73 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.StaticMetastoreConfig; +import io.prestosql.spi.Plugin; +import io.prestosql.spi.connector.ConnectorFactory; +import io.prestosql.spi.function.ConnectorConfig; +import io.prestosql.spi.queryeditorui.ConnectorUtil; +import io.prestosql.spi.queryeditorui.ConnectorWithProperties; + +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.isNullOrEmpty; +import static java.util.Objects.requireNonNull; + +@ConnectorConfig(connectorLabel = "Hive: Query data stored in a Hive data warehouse", + propertiesEnabled = true, + catalogConfigFilesEnabled = true, + globalConfigFilesEnabled = true, + docLink = "https://openlookeng.io/docs/docs/connector/hive.html", + configLink = "https://openlookeng.io/docs/docs/connector/hive.html#configuration") +public class HivePlugin + implements Plugin +{ + private final String name; + private final Optional metastore; + + public HivePlugin() + { + this("omnidata-openlookeng", Optional.empty()); + } + + public HivePlugin(String name, Optional metastore) + { + checkArgument(!isNullOrEmpty(name), "name is null or empty"); + this.name = name; + this.metastore = requireNonNull(metastore, "metastore is null"); + } + + @Override + public Iterable getConnectorFactories() + { + return ImmutableList.of(new HiveConnectorFactory(name, HivePlugin.class.getClassLoader(), metastore)); + } + + @Override + public Optional getConnectorWithProperties() + { + ConnectorConfig connectorConfig = HivePlugin.class.getAnnotation(ConnectorConfig.class); + ArrayList methods = new ArrayList<>(); + methods.addAll(Arrays.asList(StaticMetastoreConfig.class.getDeclaredMethods())); + methods.addAll(Arrays.asList(HiveConfig.class.getDeclaredMethods())); + return ConnectorUtil.assembleConnectorProperties(connectorConfig, methods); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveProcedureModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveProcedureModule.java new file mode 100644 index 00000000..cb9bf9b7 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveProcedureModule.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Scopes; +import com.google.inject.multibindings.Multibinder; +import io.prestosql.spi.procedure.Procedure; + +import static com.google.inject.multibindings.Multibinder.newSetBinder; + +public class HiveProcedureModule + implements Module +{ + @Override + public void configure(Binder binder) + { + Multibinder procedures = newSetBinder(binder, Procedure.class); + procedures.addBinding().toProvider(CreateEmptyPartitionProcedure.class).in(Scopes.SINGLETON); + procedures.addBinding().toProvider(SyncPartitionMetadataProcedure.class).in(Scopes.SINGLETON); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePushDownRecordPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePushDownRecordPageSource.java new file mode 100644 index 00000000..c26f3856 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HivePushDownRecordPageSource.java @@ -0,0 +1,109 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive; + +import com.huawei.boostkit.omnidata.reader.DataReader; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.plugin.hive.util.PageSourceUtil; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; + +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_OPERATOR_OFFLOAD_FAIL; +import static java.util.Objects.requireNonNull; + +public class HivePushDownRecordPageSource + implements ConnectorPageSource +{ + private final DataReader dataReader; + private boolean closed; + private final AggregatedMemoryContext systemMemoryContext; + private long readTimeNanos; + private long readBytes; + + public HivePushDownRecordPageSource( + DataReader dataReader, + AggregatedMemoryContext systemMemoryContext) + { + this.dataReader = requireNonNull(dataReader, "dataReader is null"); + this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null"); + } + + @Override + public long getCompletedBytes() + { + return readBytes; + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + long start = System.nanoTime(); + + if (dataReader.isFinished()) { + close(); + return null; + } + + Page page = null; + try { + page = (Page) dataReader.getNextPageBlocking(); + } + catch (Exception exception) { + PageSourceUtil.closeWithSuppression(this, exception); + throw new PrestoException(HIVE_OPERATOR_OFFLOAD_FAIL, exception.getMessage()); + } + + readTimeNanos += System.nanoTime() - start; + if (page != null) { + readBytes += page.getSizeInBytes(); + } + + return page; + } + + @Override + public long 
getSystemMemoryUsage() + { + return systemMemoryContext.getBytes(); + } + + @Override + public void close() + { + if (closed) { + return; + } + closed = true; + try { + dataReader.close(); + } + catch (Exception e) { + e.printStackTrace(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveReadOnlyException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveReadOnlyException.java new file mode 100644 index 00000000..5d1c6ae3 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveReadOnlyException.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; + +import java.util.Optional; + +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HiveReadOnlyException + extends PrestoException +{ + private final SchemaTableName tableName; + private final Optional partition; + + public HiveReadOnlyException(SchemaTableName tableName, Optional partition) + { + super(partition.isPresent() ? HiveErrorCode.HIVE_PARTITION_READ_ONLY : HiveErrorCode.HIVE_TABLE_READ_ONLY, composeMessage(tableName, partition)); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.partition = requireNonNull(partition, "partition is null"); + } + + private static String composeMessage(SchemaTableName tableName, Optional partition) + { + return partition.isPresent() + ? format("Table '%s' partition '%s' is read-only", tableName, partition.get()) + : format("Table '%s' is read-only", tableName); + } + + public SchemaTableName getTableName() + { + return tableName; + } + + public Optional getPartition() + { + return partition; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursor.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursor.java new file mode 100644 index 00000000..55d65a2c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursor.java @@ -0,0 +1,255 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import io.airlift.slice.Slice; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; + +import java.util.List; + +import static io.prestosql.plugin.hive.HiveUtil.bigintPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.booleanPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.charPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.datePartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.doublePartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.floatPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.integerPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.longDecimalPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.shortDecimalPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.smallintPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.timestampPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.tinyintPartitionKey; +import static io.prestosql.plugin.hive.HiveUtil.varcharPartitionKey; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.Chars.isCharType; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.Decimals.isLongDecimal; +import static io.prestosql.spi.type.Decimals.isShortDecimal; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; + +public class HiveRecordCursor + implements RecordCursor +{ + private final RecordCursor delegate; + + private final List columnMappings; + private final Type[] types; + + private final boolean[] booleans; + private final long[] longs; + private final double[] doubles; + private final Slice[] slices; + private final Object[] objects; + private final boolean[] nulls; + + public HiveRecordCursor( + List columnMappings, + TypeManager typeManager, + RecordCursor delegate) + { + requireNonNull(columnMappings, "columns is null"); + requireNonNull(typeManager, "typeManager is null"); + + this.delegate = requireNonNull(delegate, "delegate is null"); + this.columnMappings = columnMappings; + + int size = columnMappings.size(); + + this.types = new Type[size]; + + this.booleans = new boolean[size]; + this.longs = new long[size]; + this.doubles = new double[size]; + this.slices = new Slice[size]; + this.objects = new Object[size]; + this.nulls = new boolean[size]; + + for (int columnIndex = 0; columnIndex < size; columnIndex++) { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(columnIndex); + + if (columnMapping.getKind() == HivePageSourceProvider.ColumnMappingKind.PREFILLED) { + String columnValue = columnMapping.getPrefilledValue(); + byte[] bytes = 
columnValue.getBytes(UTF_8); + + String name = columnMapping.getHiveColumnHandle().getName(); + Type type = typeManager.getType(columnMapping.getHiveColumnHandle().getTypeSignature()); + types[columnIndex] = type; + + if (HiveUtil.isHiveNull(bytes)) { + nulls[columnIndex] = true; + } + else if (BOOLEAN.equals(type)) { + booleans[columnIndex] = booleanPartitionKey(columnValue, name); + } + else if (TINYINT.equals(type)) { + longs[columnIndex] = tinyintPartitionKey(columnValue, name); + } + else if (SMALLINT.equals(type)) { + longs[columnIndex] = smallintPartitionKey(columnValue, name); + } + else if (INTEGER.equals(type)) { + longs[columnIndex] = integerPartitionKey(columnValue, name); + } + else if (BIGINT.equals(type)) { + longs[columnIndex] = bigintPartitionKey(columnValue, name); + } + else if (REAL.equals(type)) { + longs[columnIndex] = floatPartitionKey(columnValue, name); + } + else if (DOUBLE.equals(type)) { + doubles[columnIndex] = doublePartitionKey(columnValue, name); + } + else if (isVarcharType(type)) { + slices[columnIndex] = varcharPartitionKey(columnValue, name, type); + } + else if (isCharType(type)) { + slices[columnIndex] = charPartitionKey(columnValue, name, type); + } + else if (DATE.equals(type)) { + longs[columnIndex] = datePartitionKey(columnValue, name); + } + else if (TIMESTAMP.equals(type)) { + longs[columnIndex] = timestampPartitionKey(columnValue, name); + } + else if (isShortDecimal(type)) { + longs[columnIndex] = shortDecimalPartitionKey(columnValue, (DecimalType) type, name); + } + else if (isLongDecimal(type)) { + slices[columnIndex] = longDecimalPartitionKey(columnValue, (DecimalType) type, name); + } + else { + throw new PrestoException(NOT_SUPPORTED, format("Unsupported column type %s for prefilled column: %s", type.getDisplayName(), name)); + } + } + } + } + + @Override + public long getCompletedBytes() + { + return delegate.getCompletedBytes(); + } + + @Override + public Type getType(int field) + { + return types[field]; + } + + @Override + public boolean advanceNextPosition() + { + return delegate.advanceNextPosition(); + } + + @Override + public boolean getBoolean(int field) + { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(field); + if (columnMapping.getKind() == HivePageSourceProvider.ColumnMappingKind.REGULAR) { + return delegate.getBoolean(columnMapping.getIndex()); + } + return booleans[field]; + } + + @Override + public long getLong(int field) + { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(field); + if (columnMapping.getKind() == HivePageSourceProvider.ColumnMappingKind.REGULAR) { + return delegate.getLong(columnMapping.getIndex()); + } + return longs[field]; + } + + @Override + public double getDouble(int field) + { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(field); + if (columnMapping.getKind() == HivePageSourceProvider.ColumnMappingKind.REGULAR) { + return delegate.getDouble(columnMapping.getIndex()); + } + return doubles[field]; + } + + @Override + public Slice getSlice(int field) + { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(field); + if (columnMapping.getKind() == HivePageSourceProvider.ColumnMappingKind.REGULAR) { + return delegate.getSlice(columnMapping.getIndex()); + } + return slices[field]; + } + + @Override + public Object getObject(int field) + { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(field); + if (columnMapping.getKind() == HivePageSourceProvider.ColumnMappingKind.REGULAR) { + 
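+ // REGULAR columns delegate to the underlying record cursor; any other mapping kind falls
+ // through to the value stored in the local arrays populated by the constructor.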
return delegate.getObject(columnMapping.getIndex()); + } + return objects[field]; + } + + @Override + public boolean isNull(int field) + { + HivePageSourceProvider.ColumnMapping columnMapping = columnMappings.get(field); + if (columnMapping.getKind() == HivePageSourceProvider.ColumnMappingKind.REGULAR) { + return delegate.isNull(columnMapping.getIndex()); + } + return nulls[field]; + } + + @Override + public void close() + { + delegate.close(); + } + + @Override + public long getReadTimeNanos() + { + return delegate.getReadTimeNanos(); + } + + @Override + public long getSystemMemoryUsage() + { + return delegate.getSystemMemoryUsage(); + } + + @VisibleForTesting + RecordCursor getRegularColumnRecordCursor() + { + return delegate; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursorProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursorProvider.java new file mode 100644 index 00000000..dbf08a48 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveRecordCursorProvider.java @@ -0,0 +1,43 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; + +public interface HiveRecordCursorProvider +{ + Optional createRecordCursor( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + TypeManager typeManager, + boolean s3SelectPushdownEnabled, + Map customSplitInfo); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSchemaProperties.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSchemaProperties.java new file mode 100644 index 00000000..a43f76fb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSchemaProperties.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.session.PropertyMetadata; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.prestosql.spi.session.PropertyMetadata.stringProperty; + +public final class HiveSchemaProperties +{ + public static final String LOCATION_PROPERTY = "location"; + + public static final List> SCHEMA_PROPERTIES = ImmutableList.of( + stringProperty( + LOCATION_PROPERTY, + "Base file system location URI", + null, + false)); + + private HiveSchemaProperties() {} + + public static Optional getLocation(Map schemaProperties) + { + return Optional.ofNullable((String) schemaProperties.get(LOCATION_PROPERTY)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSelectivePageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSelectivePageSourceFactory.java new file mode 100644 index 00000000..e4c54daa --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSelectivePageSourceFactory.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.coercions.HiveCoercer; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.predicate.TupleDomain; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; + +public interface HiveSelectivePageSourceFactory +{ + public Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + Map prefilledValues, + List outputColumns, + TupleDomain domainPredicate, + Optional>> additionPredicates, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + boolean splitCacheable, + List columnMappings, + Map coercers, + long dataSourceLastModifiedTime); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSessionProperties.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSessionProperties.java new file mode 100644 index 00000000..81808fb7 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSessionProperties.java @@ -0,0 +1,731 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.session.PropertyMetadata; + +import javax.inject.Inject; + +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.plugin.hive.HiveSessionProperties.InsertExistingPartitionsBehavior.APPEND; +import static io.prestosql.plugin.hive.HiveSessionProperties.InsertExistingPartitionsBehavior.ERROR; +import static io.prestosql.spi.StandardErrorCode.INVALID_SESSION_PROPERTY; +import static io.prestosql.spi.session.PropertyMetadata.booleanProperty; +import static io.prestosql.spi.session.PropertyMetadata.dataSizeProperty; +import static io.prestosql.spi.session.PropertyMetadata.doubleProperty; +import static io.prestosql.spi.session.PropertyMetadata.integerProperty; +import static io.prestosql.spi.session.PropertyMetadata.longProperty; +import static io.prestosql.spi.session.PropertyMetadata.stringProperty; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static java.lang.String.format; +import static java.util.Locale.ENGLISH; + +public final class HiveSessionProperties +{ + private static final String BUCKET_EXECUTION_ENABLED = "bucket_execution_enabled"; + private static final String FORCE_LOCAL_SCHEDULING = "force_local_scheduling"; + private static final String INSERT_EXISTING_PARTITIONS_BEHAVIOR = "insert_existing_partitions_behavior"; + private static final String ORC_BLOOM_FILTERS_ENABLED = "orc_bloom_filters_enabled"; + private static final String ORC_MAX_MERGE_DISTANCE = "orc_max_merge_distance"; + private static final String ORC_MAX_BUFFER_SIZE = "orc_max_buffer_size"; + private static final String ORC_STREAM_BUFFER_SIZE = "orc_stream_buffer_size"; + private static final String ORC_TINY_STRIPE_THRESHOLD = "orc_tiny_stripe_threshold"; + private static final String ORC_MAX_READ_BLOCK_SIZE = "orc_max_read_block_size"; + private static final String ORC_LAZY_READ_SMALL_RANGES = "orc_lazy_read_small_ranges"; + private static final String ORC_NESTED_LAZY_ENABLED = "orc_nested_lazy_enabled"; + private static final String ORC_STRING_STATISTICS_LIMIT = "orc_string_statistics_limit"; + private static final String ORC_OPTIMIZED_WRITER_VALIDATE = "orc_optimized_writer_validate"; + private static final String ORC_OPTIMIZED_WRITER_VALIDATE_PERCENTAGE = "orc_optimized_writer_validate_percentage"; + private static final String ORC_OPTIMIZED_WRITER_VALIDATE_MODE = "orc_optimized_writer_validate_mode"; + private static final String ORC_OPTIMIZED_WRITER_MIN_STRIPE_SIZE = "orc_optimized_writer_min_stripe_size"; + private static final String ORC_OPTIMIZED_WRITER_MAX_STRIPE_SIZE = "orc_optimized_writer_max_stripe_size"; + private static final String ORC_OPTIMIZED_WRITER_MAX_STRIPE_ROWS = 
"orc_optimized_writer_max_stripe_rows"; + private static final String ORC_OPTIMIZED_WRITER_MAX_DICTIONARY_MEMORY = "orc_optimized_writer_max_dictionary_memory"; + private static final String ORC_FILE_TAIL_CACHE_ENABLED = "orc_file_tail_cache_enabled"; + private static final String ORC_STRIPE_FOOTER_CACHE_ENABLED = "orc_stripe_footer_cache_enabled"; + private static final String ORC_ROW_INDEX_CACHE_ENABLED = "orc_row_index_cache_enabled"; + private static final String ORC_BLOOM_FILTERS_CACHE_ENABLED = "orc_bloom_filters_cache_enabled"; + private static final String ORC_ROW_DATA_CACHE_ENABLED = "orc_row_data_cache_enabled"; + private static final String HIVE_STORAGE_FORMAT = "hive_storage_format"; + private static final String RESPECT_TABLE_FORMAT = "respect_table_format"; + private static final String CREATE_EMPTY_BUCKET_FILES = "create_empty_bucket_files"; + private static final String PARQUET_USE_COLUMN_NAME = "parquet_use_column_names"; + private static final String PARQUET_FAIL_WITH_CORRUPTED_STATISTICS = "parquet_fail_with_corrupted_statistics"; + private static final String PARQUET_MAX_READ_BLOCK_SIZE = "parquet_max_read_block_size"; + private static final String PARQUET_WRITER_BLOCK_SIZE = "parquet_writer_block_size"; + private static final String PARQUET_WRITER_PAGE_SIZE = "parquet_writer_page_size"; + private static final String MAX_SPLIT_SIZE = "max_split_size"; + private static final String MAX_INITIAL_SPLIT_SIZE = "max_initial_split_size"; + private static final String RCFILE_OPTIMIZED_WRITER_VALIDATE = "rcfile_optimized_writer_validate"; + private static final String SORTED_WRITING_ENABLED = "sorted_writing_enabled"; + private static final String STATISTICS_ENABLED = "statistics_enabled"; + private static final String PARTITION_STATISTICS_SAMPLE_SIZE = "partition_statistics_sample_size"; + private static final String IGNORE_CORRUPTED_STATISTICS = "ignore_corrupted_statistics"; + private static final String COLLECT_COLUMN_STATISTICS_ON_WRITE = "collect_column_statistics_on_write"; + private static final String OPTIMIZE_MISMATCHED_BUCKET_COUNT = "optimize_mismatched_bucket_count"; + private static final String S3_SELECT_PUSHDOWN_ENABLED = "s3_select_pushdown_enabled"; + private static final String TEMPORARY_STAGING_DIRECTORY_ENABLED = "temporary_staging_directory_enabled"; + private static final String TEMPORARY_STAGING_DIRECTORY_PATH = "temporary_staging_directory_path"; + private static final String DYNAMIC_FILTERING_SPLIT_FILTERING = "dynamic_filtering_partition_filtering"; + private static final String DYNAMIC_FILTERING_ROW_FILTERING_THRESHOLD = "dynamic_filtering_filter_rows_threshold"; + private static final String ORC_PREDICATE_PUSHDOWN = "orc_predicate_pushdown_enabled"; + private static final String ORC_DISJUCT_PREDICATE_PUSHDOWN = "orc_disjunct_predicate_pushdown_enabled"; + private static final String ORC_PUSHDOWN_DATACACHE = "orc_pushdown_data_cache_enabled"; + private static final String WRITE_PARTITION_DISTRIBUTION = "write_partition_distribution"; + private static final String FILTER_OFFLOAD = "filter_offload_enabled"; + private static final String AGGREGATOR_OFFLOAD = "aggregator_offload_enabled"; + private static final String MIN_FILTER_OFFLOAD_FACTOR = "min_filter_offload_factor"; + private static final String MIN_AGGREGATOR_OFFLOAD_FACTOR = "min_aggregator_offload_factor"; + private static final String MIN_OFFLOAD_ROW_NUMBER = "min_offload_row_number"; + private static final String OMNIDATA_ENABLE = "omnidata_enabled"; + private static final String 
METASTORE_WRITE_BATCH_SIZE = "metastore_write_batch_size"; + + private final List> sessionProperties; + + public enum InsertExistingPartitionsBehavior + { + ERROR, + APPEND, + OVERWRITE, + /**/; + + public static InsertExistingPartitionsBehavior valueOf(String value, boolean immutablePartition) + { + InsertExistingPartitionsBehavior enumValue = valueOf(value.toUpperCase(ENGLISH)); + if (immutablePartition) { + checkArgument(enumValue != APPEND, + "Presto is configured to treat Hive partitions as immutable. %s is not allowed to be set to %s", + INSERT_EXISTING_PARTITIONS_BEHAVIOR, APPEND); + } + + return enumValue; + } + } + + @Inject + public HiveSessionProperties(HiveConfig hiveConfig, OrcFileWriterConfig orcFileWriterConfig, + ParquetFileWriterConfig parquetFileWriterConfig) + { + sessionProperties = ImmutableList.of( + booleanProperty( + BUCKET_EXECUTION_ENABLED, + "Enable bucket-aware execution: only use a single worker per bucket", + hiveConfig.isBucketExecutionEnabled(), + false), + booleanProperty( + WRITE_PARTITION_DISTRIBUTION, + "Distribute writes based on partition columns", + false, + false), + booleanProperty( + FORCE_LOCAL_SCHEDULING, + "Only schedule splits on workers colocated with data node", + hiveConfig.isForceLocalScheduling(), + false), + new PropertyMetadata<>( + INSERT_EXISTING_PARTITIONS_BEHAVIOR, + "Behavior on insert existing partitions; this session property doesn't control behavior on insert existing unpartitioned table", + VARCHAR, + InsertExistingPartitionsBehavior.class, + hiveConfig.isImmutablePartitions() ? ERROR : APPEND, + false, + value -> InsertExistingPartitionsBehavior.valueOf((String) value, + hiveConfig.isImmutablePartitions()), + InsertExistingPartitionsBehavior::toString), + booleanProperty( + ORC_BLOOM_FILTERS_ENABLED, + "ORC: Enable bloom filters for predicate pushdown", + hiveConfig.isOrcBloomFiltersEnabled(), + false), + dataSizeProperty( + ORC_MAX_MERGE_DISTANCE, + "ORC: Maximum size of gap between two reads to merge into a single read", + hiveConfig.getOrcMaxMergeDistance(), + false), + dataSizeProperty( + ORC_MAX_BUFFER_SIZE, + "ORC: Maximum size of a single read", + hiveConfig.getOrcMaxBufferSize(), + false), + dataSizeProperty( + ORC_STREAM_BUFFER_SIZE, + "ORC: Size of buffer for streaming reads", + hiveConfig.getOrcStreamBufferSize(), + false), + dataSizeProperty( + ORC_TINY_STRIPE_THRESHOLD, + "ORC: Threshold below which an ORC stripe or file will read in its entirety", + hiveConfig.getOrcTinyStripeThreshold(), + false), + dataSizeProperty( + ORC_MAX_READ_BLOCK_SIZE, + "ORC: Soft max size of Presto blocks produced by ORC reader", + hiveConfig.getOrcMaxReadBlockSize(), + false), + booleanProperty( + ORC_LAZY_READ_SMALL_RANGES, + "Experimental: ORC: Read small file segments lazily", + hiveConfig.isOrcLazyReadSmallRanges(), + false), + booleanProperty( + ORC_NESTED_LAZY_ENABLED, + "Experimental: ORC: Lazily read nested data", + true, + false), + dataSizeProperty( + ORC_STRING_STATISTICS_LIMIT, + "ORC: Maximum size of string statistics; drop if exceeding", + orcFileWriterConfig.getStringStatisticsLimit(), + false), + booleanProperty( + ORC_OPTIMIZED_WRITER_VALIDATE, + "Experimental: ORC: Force all validation for files", + hiveConfig.getOrcWriterValidationPercentage() > 0.0, + false), + new PropertyMetadata<>( + ORC_OPTIMIZED_WRITER_VALIDATE_PERCENTAGE, + "Experimental: ORC: sample percentage for validation for files", + DOUBLE, + Double.class, + hiveConfig.getOrcWriterValidationPercentage(), + false, + value -> { + double doubleValue = 
((Number) value).doubleValue(); + if (doubleValue < 0.0 || doubleValue > 100.0) { + throw new PrestoException( + INVALID_SESSION_PROPERTY, + format("%s must be between 0.0 and 100.0 inclusive: %s", + ORC_OPTIMIZED_WRITER_VALIDATE_PERCENTAGE, doubleValue)); + } + return doubleValue; + }, + value -> value), + stringProperty( + ORC_OPTIMIZED_WRITER_VALIDATE_MODE, + "Experimental: ORC: Level of detail in ORC validation", + hiveConfig.getOrcWriterValidationMode().toString(), + false), + dataSizeProperty( + ORC_OPTIMIZED_WRITER_MIN_STRIPE_SIZE, + "Experimental: ORC: Min stripe size", + orcFileWriterConfig.getStripeMinSize(), + false), + dataSizeProperty( + ORC_OPTIMIZED_WRITER_MAX_STRIPE_SIZE, + "Experimental: ORC: Max stripe size", + orcFileWriterConfig.getStripeMaxSize(), + false), + integerProperty( + ORC_OPTIMIZED_WRITER_MAX_STRIPE_ROWS, + "Experimental: ORC: Max stripe row count", + orcFileWriterConfig.getStripeMaxRowCount(), + false), + dataSizeProperty( + ORC_OPTIMIZED_WRITER_MAX_DICTIONARY_MEMORY, + "Experimental: ORC: Max dictionary memory", + orcFileWriterConfig.getDictionaryMaxMemory(), + false), + booleanProperty( + ORC_FILE_TAIL_CACHE_ENABLED, + "Cache Orc file tail", + hiveConfig.isOrcFileTailCacheEnabled(), + false), + booleanProperty( + ORC_STRIPE_FOOTER_CACHE_ENABLED, + "Cache Orc stripes' footer", + hiveConfig.isOrcStripeFooterCacheEnabled(), + false), + booleanProperty( + ORC_ROW_INDEX_CACHE_ENABLED, + "Cache Orc row index", + hiveConfig.isOrcRowIndexCacheEnabled(), + false), + booleanProperty( + ORC_BLOOM_FILTERS_CACHE_ENABLED, + "Cache Orc bloom filters", + hiveConfig.isOrcBloomFiltersCacheEnabled(), + false), + booleanProperty( + ORC_ROW_DATA_CACHE_ENABLED, + "Cache Orc row data", + hiveConfig.isOrcRowDataCacheEnabled(), + false), + stringProperty( + HIVE_STORAGE_FORMAT, + "Default storage format for new tables or partitions", + hiveConfig.getHiveStorageFormat().toString(), + false), + booleanProperty( + RESPECT_TABLE_FORMAT, + "Write new partitions using table format rather than default storage format", + hiveConfig.isRespectTableFormat(), + false), + booleanProperty( + CREATE_EMPTY_BUCKET_FILES, + "Create empty files for buckets that have no data", + hiveConfig.isCreateEmptyBucketFiles(), + false), + booleanProperty( + PARQUET_USE_COLUMN_NAME, + "Experimental: Parquet: Access Parquet columns using names from the file", + hiveConfig.isUseParquetColumnNames(), + false), + booleanProperty( + PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, + "Parquet: Fail when scanning Parquet files with corrupted statistics", + hiveConfig.isFailOnCorruptedParquetStatistics(), + false), + dataSizeProperty( + PARQUET_MAX_READ_BLOCK_SIZE, + "Parquet: Maximum size of a block to read", + hiveConfig.getParquetMaxReadBlockSize(), + false), + dataSizeProperty( + PARQUET_WRITER_BLOCK_SIZE, + "Parquet: Writer block size", + parquetFileWriterConfig.getBlockSize(), + false), + dataSizeProperty( + PARQUET_WRITER_PAGE_SIZE, + "Parquet: Writer page size", + parquetFileWriterConfig.getPageSize(), + false), + dataSizeProperty( + MAX_SPLIT_SIZE, + "Max split size", + hiveConfig.getMaxSplitSize(), + true), + dataSizeProperty( + MAX_INITIAL_SPLIT_SIZE, + "Max initial split size", + hiveConfig.getMaxInitialSplitSize(), + true), + booleanProperty( + RCFILE_OPTIMIZED_WRITER_VALIDATE, + "Experimental: RCFile: Validate writer files", + hiveConfig.isRcfileWriterValidate(), + false), + booleanProperty( + SORTED_WRITING_ENABLED, + "Enable writing to bucketed sorted tables", + hiveConfig.isSortedWritingEnabled(), + false), + 
booleanProperty( + STATISTICS_ENABLED, + "Experimental: Expose table statistics", + hiveConfig.isTableStatisticsEnabled(), + false), + integerProperty( + PARTITION_STATISTICS_SAMPLE_SIZE, + "Maximum sample size of the partitions column statistics", + hiveConfig.getPartitionStatisticsSampleSize(), + false), + booleanProperty( + IGNORE_CORRUPTED_STATISTICS, + "Experimental: Ignore corrupted statistics rather than failing", + hiveConfig.isIgnoreCorruptedStatistics(), + false), + booleanProperty( + COLLECT_COLUMN_STATISTICS_ON_WRITE, + "Experimental: Enables automatic column level statistics collection on write", + hiveConfig.isCollectColumnStatisticsOnWrite(), + false), + booleanProperty( + OPTIMIZE_MISMATCHED_BUCKET_COUNT, + "Experimenal: Enable optimization to avoid shuffle when bucket count is compatible but not the same", + hiveConfig.isOptimizeMismatchedBucketCount(), + false), + booleanProperty( + S3_SELECT_PUSHDOWN_ENABLED, + "S3 Select pushdown enabled", + hiveConfig.isS3SelectPushdownEnabled(), + false), + booleanProperty( + TEMPORARY_STAGING_DIRECTORY_ENABLED, + "Should use temporary staging directory for write operations", + hiveConfig.isTemporaryStagingDirectoryEnabled(), + false), + stringProperty( + TEMPORARY_STAGING_DIRECTORY_PATH, + "Temporary staging directory location", + hiveConfig.getTemporaryStagingDirectoryPath(), + false), + integerProperty( + METASTORE_WRITE_BATCH_SIZE, + "Batch size for requests to HMS for partition and partition statistics write operation", + hiveConfig.getMetastoreWriteBatchSize(), + false), + integerProperty( + DYNAMIC_FILTERING_ROW_FILTERING_THRESHOLD, + "Only enable row filtering with dynamic filter if the filter size is below this threshold", + hiveConfig.getDynamicFilteringRowFilteringThreshold(), + false), + booleanProperty( + DYNAMIC_FILTERING_SPLIT_FILTERING, + "Filter out hive splits early based on partition value using dynamic filter", + hiveConfig.isDynamicFilterPartitionFilteringEnabled(), + false), + booleanProperty( + ORC_PREDICATE_PUSHDOWN, + "Experimental: Consume deterministic predicates(conjucts: AND) for ORC scan.", + hiveConfig.isOrcPredicatePushdownEnabled(), + false), + booleanProperty( + ORC_DISJUCT_PREDICATE_PUSHDOWN, + "Experimental: Consume deterministic predicates(disjucts: OR) for ORC scan.", + true, + false), + booleanProperty( + ORC_PUSHDOWN_DATACACHE, + "Experimental: Enable data cache or result cache with predicate pushdown.", + true, + false), + booleanProperty( + FILTER_OFFLOAD, + "Enables offload filter operators to storage device.", + hiveConfig.isFilterOffloadEnabled(), + false), + booleanProperty( + AGGREGATOR_OFFLOAD, + "Enables offload aggregator operators to storage device.", + hiveConfig.isAggregatorOffloadEnabled(), + false), + booleanProperty( + OMNIDATA_ENABLE, + "Enables OmniData feature.", + hiveConfig.isOmniDataEnabled(), + false), + doubleProperty( + MIN_FILTER_OFFLOAD_FACTOR, + "The minimum data filtering threshold for predicate expression offload.", + hiveConfig.getMinFilterOffloadFactor(), + false), + doubleProperty( + MIN_AGGREGATOR_OFFLOAD_FACTOR, + "The minimum data aggregation threshold for aggregation expression offload.", + hiveConfig.getMinAggregatorOffloadFactor(), + false), + longProperty( + MIN_OFFLOAD_ROW_NUMBER, + "The minimum table size for operator offload.", + hiveConfig.getMinOffloadRowNumber(), + false)); + } + + public List> getSessionProperties() + { + return sessionProperties; + } + + public static boolean isBucketExecutionEnabled(ConnectorSession session) + { + return 
session.getProperty(BUCKET_EXECUTION_ENABLED, Boolean.class); + } + + public static boolean isWritePartitionDistributionEnabled(ConnectorSession session) + { + return session.getProperty(WRITE_PARTITION_DISTRIBUTION, Boolean.class); + } + + public static int getMetastoreWriteBatchSize(ConnectorSession session) + { + return session.getProperty(METASTORE_WRITE_BATCH_SIZE, Integer.class); + } + + public static boolean isForceLocalScheduling(ConnectorSession session) + { + return session.getProperty(FORCE_LOCAL_SCHEDULING, Boolean.class); + } + + public static InsertExistingPartitionsBehavior getInsertExistingPartitionsBehavior(ConnectorSession session) + { + return session.getProperty(INSERT_EXISTING_PARTITIONS_BEHAVIOR, InsertExistingPartitionsBehavior.class); + } + + public static boolean isOrcBloomFiltersEnabled(ConnectorSession session) + { + return session.getProperty(ORC_BLOOM_FILTERS_ENABLED, Boolean.class); + } + + public static DataSize getOrcMaxMergeDistance(ConnectorSession session) + { + return session.getProperty(ORC_MAX_MERGE_DISTANCE, DataSize.class); + } + + public static DataSize getOrcMaxBufferSize(ConnectorSession session) + { + return session.getProperty(ORC_MAX_BUFFER_SIZE, DataSize.class); + } + + public static DataSize getOrcStreamBufferSize(ConnectorSession session) + { + return session.getProperty(ORC_STREAM_BUFFER_SIZE, DataSize.class); + } + + public static DataSize getOrcTinyStripeThreshold(ConnectorSession session) + { + return session.getProperty(ORC_TINY_STRIPE_THRESHOLD, DataSize.class); + } + + public static DataSize getOrcMaxReadBlockSize(ConnectorSession session) + { + return session.getProperty(ORC_MAX_READ_BLOCK_SIZE, DataSize.class); + } + + public static boolean getOrcLazyReadSmallRanges(ConnectorSession session) + { + return session.getProperty(ORC_LAZY_READ_SMALL_RANGES, Boolean.class); + } + + public static boolean isOrcNestedLazy(ConnectorSession session) + { + return session.getProperty(ORC_NESTED_LAZY_ENABLED, Boolean.class); + } + + public static DataSize getOrcStringStatisticsLimit(ConnectorSession session) + { + return session.getProperty(ORC_STRING_STATISTICS_LIMIT, DataSize.class); + } + + public static boolean isOrcOptimizedWriterValidate(ConnectorSession session) + { + boolean validate = session.getProperty(ORC_OPTIMIZED_WRITER_VALIDATE, Boolean.class); + double percentage = session.getProperty(ORC_OPTIMIZED_WRITER_VALIDATE_PERCENTAGE, Double.class); + + checkArgument(percentage >= 0.0 && percentage <= 100.0); + + // session property can disable validation + if (!validate) { + return false; + } + + // session property cannot force validation when sampling is enabled + // todo change this if session properties support null + return ThreadLocalRandom.current().nextDouble(100) < percentage; + } + + public static OrcWriteValidationMode getOrcOptimizedWriterValidateMode(ConnectorSession session) + { + return OrcWriteValidationMode.valueOf( + session.getProperty(ORC_OPTIMIZED_WRITER_VALIDATE_MODE, String.class).toUpperCase(ENGLISH)); + } + + public static DataSize getOrcOptimizedWriterMinStripeSize(ConnectorSession session) + { + return session.getProperty(ORC_OPTIMIZED_WRITER_MIN_STRIPE_SIZE, DataSize.class); + } + + public static DataSize getOrcOptimizedWriterMaxStripeSize(ConnectorSession session) + { + return session.getProperty(ORC_OPTIMIZED_WRITER_MAX_STRIPE_SIZE, DataSize.class); + } + + public static int getOrcOptimizedWriterMaxStripeRows(ConnectorSession session) + { + return session.getProperty(ORC_OPTIMIZED_WRITER_MAX_STRIPE_ROWS, 
Integer.class); + } + + public static DataSize getOrcOptimizedWriterMaxDictionaryMemory(ConnectorSession session) + { + return session.getProperty(ORC_OPTIMIZED_WRITER_MAX_DICTIONARY_MEMORY, DataSize.class); + } + + public static boolean isOrcFileTailCacheEnabled(ConnectorSession session) + { + return session.getProperty(ORC_FILE_TAIL_CACHE_ENABLED, Boolean.class); + } + + public static boolean isOrcStripeFooterCacheEnabled(ConnectorSession session) + { + return session.getProperty(ORC_STRIPE_FOOTER_CACHE_ENABLED, Boolean.class); + } + + public static boolean isOrcRowIndexCacheEnabled(ConnectorSession session) + { + return session.getProperty(ORC_ROW_INDEX_CACHE_ENABLED, Boolean.class); + } + + public static boolean isOrcBloomFiltersCacheEnabled(ConnectorSession session) + { + return session.getProperty(ORC_BLOOM_FILTERS_CACHE_ENABLED, Boolean.class); + } + + public static boolean isOrcRowDataCacheEnabled(ConnectorSession session) + { + return session.getProperty(ORC_ROW_DATA_CACHE_ENABLED, Boolean.class); + } + + public static HiveStorageFormat getHiveStorageFormat(ConnectorSession session) + { + return HiveStorageFormat.valueOf(session.getProperty(HIVE_STORAGE_FORMAT, String.class).toUpperCase(ENGLISH)); + } + + public static boolean isRespectTableFormat(ConnectorSession session) + { + return session.getProperty(RESPECT_TABLE_FORMAT, Boolean.class); + } + + public static boolean isCreateEmptyBucketFiles(ConnectorSession session) + { + return session.getProperty(CREATE_EMPTY_BUCKET_FILES, Boolean.class); + } + + public static boolean isUseParquetColumnNames(ConnectorSession session) + { + return session.getProperty(PARQUET_USE_COLUMN_NAME, Boolean.class); + } + + public static boolean isFailOnCorruptedParquetStatistics(ConnectorSession session) + { + return session.getProperty(PARQUET_FAIL_WITH_CORRUPTED_STATISTICS, Boolean.class); + } + + public static DataSize getParquetMaxReadBlockSize(ConnectorSession session) + { + return session.getProperty(PARQUET_MAX_READ_BLOCK_SIZE, DataSize.class); + } + + public static DataSize getParquetWriterBlockSize(ConnectorSession session) + { + return session.getProperty(PARQUET_WRITER_BLOCK_SIZE, DataSize.class); + } + + public static DataSize getParquetWriterPageSize(ConnectorSession session) + { + return session.getProperty(PARQUET_WRITER_PAGE_SIZE, DataSize.class); + } + + public static DataSize getMaxSplitSize(ConnectorSession session) + { + return session.getProperty(MAX_SPLIT_SIZE, DataSize.class); + } + + public static DataSize getMaxInitialSplitSize(ConnectorSession session) + { + return session.getProperty(MAX_INITIAL_SPLIT_SIZE, DataSize.class); + } + + public static boolean isRcfileOptimizedWriterValidate(ConnectorSession session) + { + return session.getProperty(RCFILE_OPTIMIZED_WRITER_VALIDATE, Boolean.class); + } + + public static boolean isSortedWritingEnabled(ConnectorSession session) + { + return session.getProperty(SORTED_WRITING_ENABLED, Boolean.class); + } + + public static boolean isS3SelectPushdownEnabled(ConnectorSession session) + { + return session.getProperty(S3_SELECT_PUSHDOWN_ENABLED, Boolean.class); + } + + public static boolean isStatisticsEnabled(ConnectorSession session) + { + return session.getProperty(STATISTICS_ENABLED, Boolean.class); + } + + public static int getPartitionStatisticsSampleSize(ConnectorSession session) + { + int size = session.getProperty(PARTITION_STATISTICS_SAMPLE_SIZE, Integer.class); + if (size < 1) { + throw new PrestoException(INVALID_SESSION_PROPERTY, format("%s must be greater than 0: %s", 
PARTITION_STATISTICS_SAMPLE_SIZE, size)); + } + return size; + } + + public static boolean isIgnoreCorruptedStatistics(ConnectorSession session) + { + return session.getProperty(IGNORE_CORRUPTED_STATISTICS, Boolean.class); + } + + public static boolean isCollectColumnStatisticsOnWrite(ConnectorSession session) + { + return session.getProperty(COLLECT_COLUMN_STATISTICS_ON_WRITE, Boolean.class); + } + + public static boolean isOptimizedMismatchedBucketCount(ConnectorSession session) + { + return session.getProperty(OPTIMIZE_MISMATCHED_BUCKET_COUNT, Boolean.class); + } + + public static boolean isTemporaryStagingDirectoryEnabled(ConnectorSession session) + { + return session.getProperty(TEMPORARY_STAGING_DIRECTORY_ENABLED, Boolean.class); + } + + public static int getDynamicFilteringRowFilteringThreshold(ConnectorSession session) + { + return session.getProperty(DYNAMIC_FILTERING_ROW_FILTERING_THRESHOLD, Integer.class); + } + + public static boolean isDynamicFilteringSplitFilteringEnabled(ConnectorSession session) + { + return session.getProperty(DYNAMIC_FILTERING_SPLIT_FILTERING, Boolean.class); + } + + public static boolean isOrcPredicatePushdownEnabled(ConnectorSession session) + { + return session.getProperty(ORC_PREDICATE_PUSHDOWN, Boolean.class); + } + + public static boolean isOrcDisjunctPredicatePushdownEnabled(ConnectorSession session) + { + return session.getProperty(ORC_DISJUCT_PREDICATE_PUSHDOWN, Boolean.class); + } + + public static boolean isOrcPushdownDataCacheEnabled(ConnectorSession session) + { + return session.getProperty(ORC_PUSHDOWN_DATACACHE, Boolean.class); + } + + public static boolean isFilterOffloadEnabled(ConnectorSession session) + { + return session.getProperty(FILTER_OFFLOAD, Boolean.class); + } + + public static boolean isAggregatorOffloadEnabled(ConnectorSession session) + { + return session.getProperty(AGGREGATOR_OFFLOAD, Boolean.class); + } + + public static boolean isOmniDataEnabled(ConnectorSession session) + { + return session.getProperty(OMNIDATA_ENABLE, Boolean.class); + } + + public static double getMinFilterOffloadFactor(ConnectorSession session) + { + return session.getProperty(MIN_FILTER_OFFLOAD_FACTOR, Double.class); + } + + public static double getMinAggregatorOffloadFactor(ConnectorSession session) + { + return session.getProperty(MIN_AGGREGATOR_OFFLOAD_FACTOR, Double.class); + } + + public static long getMinOffloadRowNumber(ConnectorSession session) + { + return session.getProperty(MIN_OFFLOAD_ROW_NUMBER, Long.class); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplit.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplit.java new file mode 100644 index 00000000..f12038c2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplit.java @@ -0,0 +1,353 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.spi.HostAddress; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class HiveSplit +{ + private final String path; + private final long start; + private final long length; + private final long fileSize; + private final long lastModifiedTime; + private final Properties schema; + private final List partitionKeys; + private final List addresses; + private final String database; + private final String table; + private final String partitionName; + private final OptionalInt bucketNumber; + private final boolean forceLocalScheduling; + private final Map columnCoercions; // key: hiveColumnIndex + private final Optional bucketConversion; + private final boolean s3SelectPushdownEnabled; + private final Optional deleteDeltaLocations; + private final Optional startRowOffsetOfFile; + private final boolean cacheable; + private final Map customSplitInfo; + + @JsonCreator + public HiveSplit( + @JsonProperty("database") String database, + @JsonProperty("table") String table, + @JsonProperty("partitionName") String partitionName, + @JsonProperty("path") String path, + @JsonProperty("start") long start, + @JsonProperty("length") long length, + @JsonProperty("fileSize") long fileSize, + @JsonProperty("lastModifiedTime") long lastModifiedTime, + @JsonProperty("schema") Properties schema, + @JsonProperty("partitionKeys") List partitionKeys, + @JsonProperty("addresses") List addresses, + @JsonProperty("bucketNumber") OptionalInt bucketNumber, + @JsonProperty("forceLocalScheduling") boolean forceLocalScheduling, + @JsonProperty("columnCoercions") Map columnCoercions, + @JsonProperty("bucketConversion") Optional bucketConversion, + @JsonProperty("s3SelectPushdownEnabled") boolean s3SelectPushdownEnabled, + @JsonProperty("deleteDeltaLocations") Optional deleteDeltaLocations, + @JsonProperty("validWriteIdList") Optional startRowOffsetOfFile, + @JsonProperty("cacheable") boolean cacheable, + @JsonProperty("customSplitInfo") Map customSplitInfo) + { + checkArgument(start >= 0, "start must be positive"); + checkArgument(length >= 0, "length must be positive"); + checkArgument(fileSize >= 0, "fileSize must be positive"); + checkArgument(lastModifiedTime >= 0, "lastModifiedTime must be positive"); + requireNonNull(database, "database is null"); + requireNonNull(table, "table is null"); + requireNonNull(partitionName, "partitionName is null"); + requireNonNull(path, "path is null"); + requireNonNull(schema, "schema is null"); + requireNonNull(partitionKeys, "partitionKeys is null"); + requireNonNull(addresses, "addresses is null"); + requireNonNull(bucketNumber, "bucketNumber is null"); + requireNonNull(columnCoercions, "columnCoercions is null"); + requireNonNull(bucketConversion, "bucketConversion is null"); + requireNonNull(deleteDeltaLocations, "deleteDeltaLocations is null"); + + this.database = database; + this.table = table; + this.partitionName = partitionName; + this.path = path; + 
this.start = start; + this.length = length; + this.fileSize = fileSize; + this.lastModifiedTime = lastModifiedTime; + this.schema = schema; + this.partitionKeys = ImmutableList.copyOf(partitionKeys); + this.addresses = ImmutableList.copyOf(addresses); + this.bucketNumber = bucketNumber; + this.forceLocalScheduling = forceLocalScheduling; + this.columnCoercions = columnCoercions; + this.bucketConversion = bucketConversion; + this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; + this.deleteDeltaLocations = deleteDeltaLocations; + this.startRowOffsetOfFile = startRowOffsetOfFile; + this.cacheable = cacheable; + this.customSplitInfo = ImmutableMap.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null")); + } + + @JsonProperty + public String getDatabase() + { + return database; + } + + @JsonProperty + public String getTable() + { + return table; + } + + @JsonProperty + public String getPartitionName() + { + return partitionName; + } + + @JsonProperty + public String getPath() + { + return path; + } + + @JsonProperty + public long getStart() + { + return start; + } + + @JsonProperty + public long getLength() + { + return length; + } + + @JsonProperty + public long getFileSize() + { + return fileSize; + } + + @JsonProperty + public Properties getSchema() + { + return schema; + } + + @JsonProperty + public List getPartitionKeys() + { + return partitionKeys; + } + + @JsonProperty + public List getAddresses() + { + return addresses; + } + + @JsonProperty + public OptionalInt getBucketNumber() + { + return bucketNumber; + } + + @JsonProperty + public boolean isForceLocalScheduling() + { + return forceLocalScheduling; + } + + @JsonProperty + public Map getColumnCoercions() + { + return columnCoercions; + } + + @JsonProperty + public Optional getBucketConversion() + { + return bucketConversion; + } + + public boolean isRemotelyAccessible() + { + return !forceLocalScheduling; + } + + @JsonProperty + public boolean isS3SelectPushdownEnabled() + { + return s3SelectPushdownEnabled; + } + + @JsonProperty + public Optional getDeleteDeltaLocations() + { + return deleteDeltaLocations; + } + + //presto: default method to get split path for Split filter + public String getFilePath() + { + return path; + } + + public long getStartIndex() + { + return start; + } + + public long getEndIndex() + { + return start + length; + } + + @JsonProperty + public long getLastModifiedTime() + { + return lastModifiedTime; + } + + @JsonProperty + public boolean isCacheable() + { + return cacheable; + } + + @JsonProperty + public Map getCustomSplitInfo() + { + return customSplitInfo; + } + + public Object getInfo() + { + return ImmutableMap.builder() + .put("path", path) + .put("start", start) + .put("length", length) + .put("fileSize", fileSize) + .put("lastModifiedTime", lastModifiedTime) + .put("hosts", addresses) + .put("database", database) + .put("table", table) + .put("forceLocalScheduling", forceLocalScheduling) + .put("partitionName", partitionName) + .put("s3SelectPushdownEnabled", s3SelectPushdownEnabled) + .put("cacheable", cacheable) + .build(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(path) + .addValue(start) + .addValue(length) + .addValue(fileSize) + .toString(); + } + + public Optional getStartRowOffsetOfFile() + { + return startRowOffsetOfFile; + } + + public static class BucketConversion + { + private final BucketingVersion bucketingVersion; + private final int tableBucketCount; + private final int partitionBucketCount; + private final List 
bucketColumnNames; + // bucketNumber is needed, but can be found in bucketNumber field of HiveSplit. + + @JsonCreator + public BucketConversion( + @JsonProperty("bucketingVersion") BucketingVersion bucketingVersion, + @JsonProperty("tableBucketCount") int tableBucketCount, + @JsonProperty("partitionBucketCount") int partitionBucketCount, + @JsonProperty("bucketColumnHandles") List bucketColumnHandles) + { + this.bucketingVersion = requireNonNull(bucketingVersion, "bucketingVersion is null"); + this.tableBucketCount = tableBucketCount; + this.partitionBucketCount = partitionBucketCount; + this.bucketColumnNames = requireNonNull(bucketColumnHandles, "bucketColumnHandles is null"); + } + + @JsonProperty + public BucketingVersion getBucketingVersion() + { + return bucketingVersion; + } + + @JsonProperty + public int getTableBucketCount() + { + return tableBucketCount; + } + + @JsonProperty + public int getPartitionBucketCount() + { + return partitionBucketCount; + } + + @JsonProperty + public List getBucketColumnHandles() + { + return bucketColumnNames; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + BucketConversion that = (BucketConversion) o; + return tableBucketCount == that.tableBucketCount && + partitionBucketCount == that.partitionBucketCount && + Objects.equals(bucketColumnNames, that.bucketColumnNames); + } + + @Override + public int hashCode() + { + return Objects.hash(tableBucketCount, partitionBucketCount, bucketColumnNames); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitLoader.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitLoader.java new file mode 100644 index 00000000..60314672 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitLoader.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +interface HiveSplitLoader +{ + void start(HiveSplitSource splitSource); + + void stop(); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitManager.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitManager.java new file mode 100644 index 00000000..47fe70ac --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitManager.java @@ -0,0 +1,496 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import com.google.common.collect.Ordering; +import io.airlift.concurrent.BoundedExecutor; +import io.airlift.stats.CounterStat; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.VersionEmbedder; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplitManager; +import io.prestosql.spi.connector.ConnectorSplitSource; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.FixedSplitSource; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.resourcegroups.QueryType; +import io.prestosql.spi.type.TypeManager; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +import javax.annotation.Nullable; +import javax.inject.Inject; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.Executor; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.RejectedExecutionException; +import java.util.function.Function; +import java.util.function.Supplier; + +import static com.google.common.base.MoreObjects.firstNonNull; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.isNullOrEmpty; +import static com.google.common.collect.Iterables.concat; +import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.collect.Iterables.transform; +import static io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.StandardErrorCode.SERVER_SHUTTING_DOWN; +import static io.prestosql.spi.connector.ConnectorSplitManager.SplitSchedulingStrategy.GROUPED_SCHEDULING; +import static java.lang.Math.min; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HiveSplitManager + implements ConnectorSplitManager +{ + public static final String PRESTO_OFFLINE = "presto_offline"; + public static final String OBJECT_NOT_READABLE = 
"object_not_readable"; + + private final Function metastoreProvider; + private final HivePartitionManager partitionManager; + private final NamenodeStats namenodeStats; + private final HdfsEnvironment hdfsEnvironment; + private final DirectoryLister directoryLister; + private final Executor executor; + private final CoercionPolicy coercionPolicy; + private final int maxOutstandingSplits; + private final DataSize maxOutstandingSplitsSize; + private final int minPartitionBatchSize; + private final int maxPartitionBatchSize; + private final int maxInitialSplits; + private final int splitLoaderConcurrency; + private final int maxSplitsPerSecond; + private final boolean recursiveDfsWalkerEnabled; + private final CounterStat highMemorySplitSourceCounter; + private final TypeManager typeManager; + private final HiveConfig hiveConfig; + + @Inject + public HiveSplitManager( + HiveConfig hiveConfig, + Function metastoreProvider, + HivePartitionManager partitionManager, + NamenodeStats namenodeStats, + HdfsEnvironment hdfsEnvironment, + DirectoryLister directoryLister, + @ForHive ExecutorService executorService, + VersionEmbedder versionEmbedder, + TypeManager typeManager, + CoercionPolicy coercionPolicy) + { + this( + metastoreProvider, + partitionManager, + namenodeStats, + hdfsEnvironment, + directoryLister, + versionEmbedder.embedVersion(new BoundedExecutor(executorService, hiveConfig.getMaxSplitIteratorThreads())), + coercionPolicy, + new CounterStat(), + hiveConfig.getMaxOutstandingSplits(), + hiveConfig.getMaxOutstandingSplitsSize(), + hiveConfig.getMinPartitionBatchSize(), + hiveConfig.getMaxPartitionBatchSize(), + hiveConfig.getMaxInitialSplits(), + hiveConfig.getSplitLoaderConcurrency(), + hiveConfig.getMaxSplitsPerSecond(), + hiveConfig.getRecursiveDirWalkerEnabled(), + typeManager, + hiveConfig); + } + + public HiveSplitManager( + Function metastoreProvider, + HivePartitionManager partitionManager, + NamenodeStats namenodeStats, + HdfsEnvironment hdfsEnvironment, + DirectoryLister directoryLister, + Executor executor, + CoercionPolicy coercionPolicy, + CounterStat highMemorySplitSourceCounter, + int maxOutstandingSplits, + DataSize maxOutstandingSplitsSize, + int minPartitionBatchSize, + int maxPartitionBatchSize, + int maxInitialSplits, + int splitLoaderConcurrency, + @Nullable Integer maxSplitsPerSecond, + boolean recursiveDfsWalkerEnabled, + TypeManager typeManager, + HiveConfig hiveConfig) + { + this.metastoreProvider = requireNonNull(metastoreProvider, "metastore is null"); + this.partitionManager = requireNonNull(partitionManager, "partitionManager is null"); + this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.directoryLister = requireNonNull(directoryLister, "directoryLister is null"); + this.executor = new ErrorCodedExecutor(executor); + this.coercionPolicy = requireNonNull(coercionPolicy, "coercionPolicy is null"); + this.highMemorySplitSourceCounter = requireNonNull(highMemorySplitSourceCounter, "highMemorySplitSourceCounter is null"); + checkArgument(maxOutstandingSplits >= 1, "maxOutstandingSplits must be at least 1"); + this.maxOutstandingSplits = maxOutstandingSplits; + this.maxOutstandingSplitsSize = maxOutstandingSplitsSize; + this.minPartitionBatchSize = minPartitionBatchSize; + this.maxPartitionBatchSize = maxPartitionBatchSize; + this.maxInitialSplits = maxInitialSplits; + this.splitLoaderConcurrency = splitLoaderConcurrency; + this.maxSplitsPerSecond = 
firstNonNull(maxSplitsPerSecond, Integer.MAX_VALUE); + this.recursiveDfsWalkerEnabled = recursiveDfsWalkerEnabled; + this.typeManager = typeManager; + this.hiveConfig = hiveConfig; + } + + @Override + public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, + ConnectorSession session, + ConnectorTableHandle table, + SplitSchedulingStrategy splitSchedulingStrategy) + { + return getSplits(transaction, session, table, splitSchedulingStrategy, null, Optional.empty(), ImmutableMap.of(), ImmutableSet.of(), false); + } + + @Override + public ConnectorSplitSource getSplits( + ConnectorTransactionHandle transaction, + ConnectorSession session, + ConnectorTableHandle tableHandle, + SplitSchedulingStrategy splitSchedulingStrategy, + Supplier>> dynamicFilterSupplier, + Optional queryType, + Map queryInfo, + Set> userDefinedCachePredicates, + boolean partOfReuse) + { + HiveTableHandle hiveTable = (HiveTableHandle) tableHandle; + SchemaTableName tableName = hiveTable.getSchemaTableName(); + + // get table metadata + SemiTransactionalHiveMetastore metastore = metastoreProvider.apply((HiveTransactionHandle) transaction); + Table table = metastore.getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + if (table.getStorage().getStorageFormat().getInputFormat().contains("carbon")) { + throw new PrestoException(NOT_SUPPORTED, "Hive connector can't read carbondata tables"); + } + + // verify table is not marked as non-readable + String tableNotReadable = table.getParameters().get(OBJECT_NOT_READABLE); + if (!isNullOrEmpty(tableNotReadable)) { + throw new HiveNotReadableException(tableName, Optional.empty(), tableNotReadable); + } + + // get partitions + List partitions = partitionManager.getOrLoadPartitions(session, metastore, new HiveIdentity(session), hiveTable); + + // short circuit if we don't have any partitions + if (partitions.isEmpty()) { + return new FixedSplitSource(ImmutableList.of()); + } + + // get buckets from first partition (arbitrary) + Optional bucketFilter = hiveTable.getBucketFilter(); + + // validate bucket bucketed execution + Optional bucketHandle = hiveTable.getBucketHandle(); + if ((splitSchedulingStrategy == GROUPED_SCHEDULING) && !bucketHandle.isPresent()) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, "SchedulingPolicy is bucketed, but BucketHandle is not present"); + } + + // sort partitions + partitions = Ordering.natural().onResultOf(HivePartition::getPartitionId).reverse().sortedCopy(partitions); + + Iterable hivePartitions = getPartitionMetadata(session, metastore, table, tableName, partitions, bucketHandle.map(HiveBucketHandle::toTableBucketProperty)); + + HiveSplitLoader hiveSplitLoader = new BackgroundHiveSplitLoader( + table, + hivePartitions, + hiveTable.getCompactEffectivePredicate(), + BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(bucketHandle, bucketFilter), + session, + hdfsEnvironment, + namenodeStats, + directoryLister, + executor, + splitLoaderConcurrency, + recursiveDfsWalkerEnabled, + metastore.getValidWriteIds(session, hiveTable, queryType.map(t -> t == QueryType.VACUUM).orElse(false)) + .map(validTxnWriteIdList -> validTxnWriteIdList.getTableValidWriteIdList(table.getDatabaseName() + "." 
+ table.getTableName())), + dynamicFilterSupplier, + queryType, + queryInfo, + typeManager); + + HiveSplitSource splitSource; + HiveStorageFormat hiveStorageFormat = HiveMetadata.extractHiveStorageFormat(table); + switch (splitSchedulingStrategy) { + case UNGROUPED_SCHEDULING: + splitSource = HiveSplitSource.allAtOnce( + session, + table.getDatabaseName(), + table.getTableName(), + partOfReuse ? 0 : maxInitialSplits, //For reuse, we should make sure to have same split size all time for a table. + maxOutstandingSplits, + maxOutstandingSplitsSize, + maxSplitsPerSecond, + hiveSplitLoader, + executor, + new CounterStat(), + dynamicFilterSupplier, + userDefinedCachePredicates, + typeManager, + hiveConfig, + hiveStorageFormat); + break; + case GROUPED_SCHEDULING: + splitSource = HiveSplitSource.bucketed( + session, + table.getDatabaseName(), + table.getTableName(), + partOfReuse ? 0 : maxInitialSplits, //For reuse, we should make sure to have same split size all time for a table. + maxOutstandingSplits, + maxOutstandingSplitsSize, + maxSplitsPerSecond, + hiveSplitLoader, + executor, + new CounterStat(), + dynamicFilterSupplier, + userDefinedCachePredicates, + typeManager, + hiveConfig, + hiveStorageFormat); + break; + default: + throw new IllegalArgumentException("Unknown splitSchedulingStrategy: " + splitSchedulingStrategy); + } + hiveSplitLoader.start(splitSource); + + if (queryType.isPresent() && queryType.get() == QueryType.VACUUM) { + HdfsContext hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName()); + return new HiveVacuumSplitSource(splitSource, (HiveVacuumTableHandle) queryInfo.get("vacuumHandle"), hdfsEnvironment, hdfsContext, session); + } + + return splitSource; + } + + @Managed + @Nested + public CounterStat getHighMemorySplitSource() + { + return highMemorySplitSourceCounter; + } + + private Iterable getPartitionMetadata(ConnectorSession session, SemiTransactionalHiveMetastore metastore, Table table, SchemaTableName tableName, List hivePartitions, Optional bucketProperty) + { + if (hivePartitions.isEmpty()) { + return ImmutableList.of(); + } + + if (hivePartitions.size() == 1) { + HivePartition firstPartition = getOnlyElement(hivePartitions); + if (firstPartition.getPartitionId().equals(HivePartition.UNPARTITIONED_ID)) { + return ImmutableList.of(new HivePartitionMetadata(firstPartition, Optional.empty(), ImmutableMap.of())); + } + } + + Iterable> partitionNameBatches = partitionExponentially(hivePartitions, minPartitionBatchSize, maxPartitionBatchSize); + Iterable> partitionBatches = transform(partitionNameBatches, partitionBatch -> { + Map> batch = metastore.getPartitionsByNames( + new HiveIdentity(session), + tableName.getSchemaName(), + tableName.getTableName(), + Lists.transform(partitionBatch, HivePartition::getPartitionId)); + ImmutableMap.Builder partitionBuilder = ImmutableMap.builder(); + for (Map.Entry> entry : batch.entrySet()) { + if (!entry.getValue().isPresent()) { + throw new PrestoException(HiveErrorCode.HIVE_PARTITION_DROPPED_DURING_QUERY, "Partition no longer exists: " + entry.getKey()); + } + partitionBuilder.put(entry.getKey(), entry.getValue().get()); + } + Map partitions = partitionBuilder.build(); + if (partitionBatch.size() != partitions.size()) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, format("Expected %s partitions but found %s", partitionBatch.size(), partitions.size())); + } + + ImmutableList.Builder results = ImmutableList.builder(); + for (HivePartition hivePartition : partitionBatch) { + Partition partition 
= partitions.get(hivePartition.getPartitionId()); + if (partition == null) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, "Partition not loaded: " + hivePartition); + } + String partName = MetastoreUtil.makePartitionName(table.getPartitionColumns(), partition.getValues()); + + // verify partition is online + MetastoreUtil.verifyOnline(tableName, Optional.of(partName), MetastoreUtil.getProtectMode(partition), partition.getParameters()); + + // verify partition is not marked as non-readable + String partitionNotReadable = partition.getParameters().get(OBJECT_NOT_READABLE); + if (!isNullOrEmpty(partitionNotReadable)) { + throw new HiveNotReadableException(tableName, Optional.of(partName), partitionNotReadable); + } + + // Verify that the partition schema matches the table schema. + // Either adding or dropping columns from the end of the table + // without modifying existing partitions is allowed, but every + // column that exists in both the table and partition must have + // the same type. + List tableColumns = table.getDataColumns(); + List partitionColumns = partition.getColumns(); + if ((tableColumns == null) || (partitionColumns == null)) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, format("Table '%s' or partition '%s' has null columns", tableName, partName)); + } + ImmutableMap.Builder columnCoercions = ImmutableMap.builder(); + for (int i = 0; i < min(partitionColumns.size(), tableColumns.size()); i++) { + HiveType tableType = tableColumns.get(i).getType(); + HiveType partitionType = partitionColumns.get(i).getType(); + if (!tableType.equals(partitionType)) { + if (!coercionPolicy.canCoerce(partitionType, tableType)) { + throw new PrestoException(HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH, format("" + + "There is a mismatch between the table and partition schemas. " + + "The types are incompatible and cannot be coerced. 
" + + "The column '%s' in table '%s' is declared as type '%s', " + + "but partition '%s' declared column '%s' as type '%s'.", + tableColumns.get(i).getName(), + tableName, + tableType, + partName, + partitionColumns.get(i).getName(), + partitionType)); + } + columnCoercions.put(i, partitionType.getHiveTypeName()); + } + } + + if (bucketProperty.isPresent()) { + Optional partitionBucketProperty = partition.getStorage().getBucketProperty(); + if (!partitionBucketProperty.isPresent()) { + throw new PrestoException(HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH, format( + "Hive table (%s) is bucketed but partition (%s) is not bucketed", + hivePartition.getTableName(), + hivePartition.getPartitionId())); + } + int tableBucketCount = bucketProperty.get().getBucketCount(); + int partitionBucketCount = partitionBucketProperty.get().getBucketCount(); + List tableBucketColumns = bucketProperty.get().getBucketedBy(); + List partitionBucketColumns = partitionBucketProperty.get().getBucketedBy(); + if (!tableBucketColumns.equals(partitionBucketColumns) || !isBucketCountCompatible(tableBucketCount, partitionBucketCount)) { + throw new PrestoException(HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH, format( + "Hive table (%s) bucketing (columns=%s, buckets=%s) is not compatible with partition (%s) bucketing (columns=%s, buckets=%s)", + hivePartition.getTableName(), + tableBucketColumns, + tableBucketCount, + hivePartition.getPartitionId(), + partitionBucketColumns, + partitionBucketCount)); + } + } + + results.add(new HivePartitionMetadata(hivePartition, Optional.of(partition), columnCoercions.build())); + } + + return results.build(); + }); + return concat(partitionBatches); + } + + static boolean isBucketCountCompatible(int tableBucketCount, int partitionBucketCount) + { + checkArgument(tableBucketCount > 0 && partitionBucketCount > 0); + int larger = Math.max(tableBucketCount, partitionBucketCount); + int smaller = Math.min(tableBucketCount, partitionBucketCount); + if (larger % smaller != 0) { + // must be evenly divisible + return false; + } + if (Integer.bitCount(larger / smaller) != 1) { + // ratio must be power of two + return false; + } + return true; + } + + /** + * Partition the given list in exponentially (power of 2) increasing batch sizes starting at 1 up to maxBatchSize + */ + private static Iterable> partitionExponentially(List values, int minBatchSize, int maxBatchSize) + { + return () -> new AbstractIterator>() + { + private int currentSize = minBatchSize; + private final Iterator iterator = values.iterator(); + + @Override + protected List computeNext() + { + if (!iterator.hasNext()) { + return endOfData(); + } + + int count = 0; + ImmutableList.Builder builder = ImmutableList.builder(); + while (iterator.hasNext() && count < currentSize) { + builder.add(iterator.next()); + ++count; + } + + currentSize = min(maxBatchSize, currentSize * 2); + return builder.build(); + } + }; + } + + private static class ErrorCodedExecutor + implements Executor + { + private final Executor delegate; + + private ErrorCodedExecutor(Executor delegate) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + public void execute(Runnable command) + { + try { + delegate.execute(command); + } + catch (RejectedExecutionException e) { + throw new PrestoException(SERVER_SHUTTING_DOWN, "Server is shutting down", e); + } + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitSource.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitSource.java new file mode 100644 index 00000000..2bdb5654 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitSource.java @@ -0,0 +1,828 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Multimap; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import io.airlift.log.Logger; +import io.airlift.stats.CounterStat; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.util.AsyncQueue; +import io.prestosql.plugin.hive.util.ThrottledAsyncQueue; +import io.prestosql.spi.HostAddress; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorPartitionHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorSplitSource; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.TypeManager; + +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.OptionalInt; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executor; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.Maps.transformValues; +import static com.google.common.util.concurrent.Futures.immediateFuture; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.airlift.concurrent.MoreFutures.failedFuture; +import static io.airlift.concurrent.MoreFutures.toCompletableFuture; +import static io.airlift.units.DataSize.succinctBytes; +import static io.prestosql.plugin.hive.HiveSessionProperties.getMaxInitialSplitSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getMaxSplitSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.isDynamicFilteringSplitFilteringEnabled; +import static 
io.prestosql.plugin.hive.HiveSplitSource.StateKind.CLOSED; +import static io.prestosql.plugin.hive.HiveSplitSource.StateKind.FAILED; +import static io.prestosql.plugin.hive.HiveSplitSource.StateKind.INITIAL; +import static io.prestosql.plugin.hive.HiveSplitSource.StateKind.NO_MORE_SPLITS; +import static io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered; +import static io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +class HiveSplitSource + implements ConnectorSplitSource +{ + private static final Logger log = Logger.get(HiveSplitSource.class); + + private final String queryId; + private final String databaseName; + private final String tableName; + private final PerBucket queues; + private final AtomicInteger bufferedInternalSplitCount = new AtomicInteger(); + private final int maxOutstandingSplitsBytes; + + private final DataSize maxSplitSize; + private final DataSize maxInitialSplitSize; + private final AtomicInteger remainingInitialSplits; + + private final HiveSplitLoader splitLoader; + private final AtomicReference stateReference; + + private final AtomicLong estimatedSplitSizeInBytes = new AtomicLong(); + + private final CounterStat highMemorySplitSourceCounter; + private final AtomicBoolean loggedHighMemoryWarning = new AtomicBoolean(); + + private final Supplier>> dynamicFilterSupplier; + private final Set> userDefinedCachePredicates; + private final boolean isSplitFilteringEnabled; + + private final HiveConfig hiveConfig; + + private final TypeManager typeManager; + private final HiveStorageFormat hiveStorageFormat; + + private HiveSplitSource( + ConnectorSession session, + String databaseName, + String tableName, + PerBucket queues, + int maxInitialSplits, + DataSize maxOutstandingSplitsSize, + HiveSplitLoader splitLoader, + AtomicReference stateReference, + CounterStat highMemorySplitSourceCounter, + Supplier>> dynamicFilterSupplier, + Set> userDefinedCachedPredicates, + TypeManager typeManager, + HiveConfig hiveConfig, + HiveStorageFormat hiveStorageFormat) + { + requireNonNull(session, "session is null"); + this.queryId = session.getQueryId(); + this.databaseName = requireNonNull(databaseName, "databaseName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.queues = requireNonNull(queues, "queues is null"); + this.maxOutstandingSplitsBytes = toIntExact(maxOutstandingSplitsSize.toBytes()); + this.splitLoader = requireNonNull(splitLoader, "splitLoader is null"); + this.stateReference = requireNonNull(stateReference, "stateReference is null"); + this.highMemorySplitSourceCounter = requireNonNull(highMemorySplitSourceCounter, "highMemorySplitSourceCounter is null"); + + this.maxSplitSize = getMaxSplitSize(session); + this.maxInitialSplitSize = getMaxInitialSplitSize(session); + this.remainingInitialSplits = new AtomicInteger(maxInitialSplits); + + this.dynamicFilterSupplier = dynamicFilterSupplier; + this.isSplitFilteringEnabled = isDynamicFilteringSplitFilteringEnabled(session); + this.userDefinedCachePredicates = userDefinedCachedPredicates; + this.typeManager = typeManager; + this.hiveConfig = hiveConfig; + this.hiveStorageFormat = hiveStorageFormat; + } + + public static HiveSplitSource allAtOnce( + ConnectorSession session, + String databaseName, + String tableName, + int maxInitialSplits, + int maxOutstandingSplits, + DataSize 
maxOutstandingSplitsSize, + int maxSplitsPerSecond, + HiveSplitLoader splitLoader, + Executor executor, + CounterStat highMemorySplitSourceCounter, + Supplier>> dynamicFilterSupplier, + Set> userDefinedCachePredicates, + TypeManager typeManager, + HiveConfig hiveConfig, + HiveStorageFormat hiveStorageFormat) + { + AtomicReference stateReference = new AtomicReference<>(State.initial()); + return new HiveSplitSource( + session, + databaseName, + tableName, + new PerBucket() + { + private final AsyncQueue queue = new ThrottledAsyncQueue<>(maxSplitsPerSecond, maxOutstandingSplits, executor); + + @Override + public ListenableFuture offer(OptionalInt bucketNumber, InternalHiveSplit connectorSplit) + { + // bucketNumber can be non-empty because BackgroundHiveSplitLoader does not have knowledge of execution plan + return queue.offer(connectorSplit); + } + + @Override + public ListenableFuture borrowBatchAsync(OptionalInt bucketNumber, int maxSize, Function, AsyncQueue.BorrowResult> function) + { + checkArgument(!bucketNumber.isPresent()); + return queue.borrowBatchAsync(maxSize, function); + } + + @Override + public void finish() + { + queue.finish(); + } + + @Override + public boolean isFinished(OptionalInt bucketNumber) + { + checkArgument(!bucketNumber.isPresent()); + return queue.isFinished(); + } + }, + maxInitialSplits, + maxOutstandingSplitsSize, + splitLoader, + stateReference, + highMemorySplitSourceCounter, + dynamicFilterSupplier, + userDefinedCachePredicates, + typeManager, + hiveConfig, + hiveStorageFormat); + } + + public static HiveSplitSource bucketed( + ConnectorSession session, + String databaseName, + String tableName, + int estimatedOutstandingSplitsPerBucket, + int maxInitialSplits, + DataSize maxOutstandingSplitsSize, + int maxSplitsPerSecond, + HiveSplitLoader splitLoader, + Executor executor, + CounterStat highMemorySplitSourceCounter, + Supplier>> dynamicFilterSupplier, + Set> userDefinedCachePredicates, + TypeManager typeManager, + HiveConfig hiveConfig, + HiveStorageFormat hiveStorageFormat) + { + AtomicReference stateReference = new AtomicReference<>(State.initial()); + return new HiveSplitSource( + session, + databaseName, + tableName, + new PerBucket() + { + private final Map> queues = new ConcurrentHashMap<>(); + private final AtomicBoolean finished = new AtomicBoolean(); + + @Override + public ListenableFuture offer(OptionalInt bucketNumber, InternalHiveSplit connectorSplit) + { + AsyncQueue queue = queueFor(bucketNumber); + queue.offer(connectorSplit); + // Do not block "offer" when running split discovery in bucketed mode. + // A limit is enforced on estimatedSplitSizeInBytes. 
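+ // (The byte-based limit mentioned above is enforced in addToQueue, further down in this file:
+ // once estimatedSplitSizeInBytes exceeds maxOutstandingSplitsBytes the source fails with
+ // HIVE_EXCEEDED_SPLIT_BUFFERING_LIMIT, so the non-blocking offer here cannot buffer splits without bound.)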
+ return immediateFuture(null); + } + + @Override + public ListenableFuture borrowBatchAsync(OptionalInt bucketNumber, int maxSize, Function, AsyncQueue.BorrowResult> function) + { + return queueFor(bucketNumber).borrowBatchAsync(maxSize, function); + } + + @Override + public void finish() + { + if (finished.compareAndSet(false, true)) { + queues.values().forEach(AsyncQueue::finish); + } + } + + @Override + public boolean isFinished(OptionalInt bucketNumber) + { + return queueFor(bucketNumber).isFinished(); + } + + public AsyncQueue queueFor(OptionalInt bucketNumber) + { + checkArgument(bucketNumber.isPresent()); + AtomicBoolean isNew = new AtomicBoolean(); + AsyncQueue queue = queues.computeIfAbsent(bucketNumber.getAsInt(), ignored -> { + isNew.set(true); + return new ThrottledAsyncQueue<>(maxSplitsPerSecond, estimatedOutstandingSplitsPerBucket, executor); + }); + if (isNew.get() && finished.get()) { + // Check `finished` and invoke `queue.finish` after the `queue` is added to the map. + // Otherwise, `queue.finish` may not be invoked if `finished` is set while the lambda above is being evaluated. + queue.finish(); + } + return queue; + } + }, + maxInitialSplits, + maxOutstandingSplitsSize, + splitLoader, + stateReference, + highMemorySplitSourceCounter, + dynamicFilterSupplier, + userDefinedCachePredicates, + typeManager, + hiveConfig, + hiveStorageFormat); + } + + /** + * The upper bound of outstanding split count. + * It might be larger than the actual number when called concurrently with other methods. + */ + @VisibleForTesting + int getBufferedInternalSplitCount() + { + return bufferedInternalSplitCount.get(); + } + + ListenableFuture addToQueue(List splits) + { + ListenableFuture lastResult = immediateFuture(null); + for (InternalHiveSplit split : splits) { + lastResult = addToQueue(split); + } + return lastResult; + } + + ListenableFuture addToQueue(InternalHiveSplit split) + { + if (stateReference.get().getKind() != INITIAL) { + return immediateFuture(null); + } + if (estimatedSplitSizeInBytes.addAndGet(split.getEstimatedSizeInBytes()) > maxOutstandingSplitsBytes) { + // TODO: investigate alternative split discovery strategies when this error is hit. + // This limit should never be hit given there is a limit of maxOutstandingSplits. + // If it's hit, it means individual splits are huge. + if (loggedHighMemoryWarning.compareAndSet(false, true)) { + highMemorySplitSourceCounter.update(1); + log.warn("Split buffering for %s.%s in query %s exceeded memory limit (%s). %s splits are buffered.", + databaseName, tableName, queryId, succinctBytes(maxOutstandingSplitsBytes), getBufferedInternalSplitCount()); + } + throw new PrestoException(HiveErrorCode.HIVE_EXCEEDED_SPLIT_BUFFERING_LIMIT, format( + "Split buffering for %s.%s exceeded memory limit (%s). %s splits are buffered.", + databaseName, tableName, succinctBytes(maxOutstandingSplitsBytes), getBufferedInternalSplitCount())); + } + bufferedInternalSplitCount.incrementAndGet(); + OptionalInt bucketNumber = split.getBucketNumber(); + return queues.offer(bucketNumber, split); + } + + void noMoreSplits() + { + if (setIf(stateReference, State.noMoreSplits(), state -> state.getKind() == INITIAL)) { + // Stop the split loader before finishing the queue. + // Once the queue is finished, it will always return a completed future to avoid blocking any caller. + // This could lead to a short period of busy loop in splitLoader (although unlikely in general setup). 
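+ // The same stop-the-loader-then-finish-the-queues ordering is repeated in fail() and close() below.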
+ splitLoader.stop(); + queues.finish(); + } + } + + void fail(Throwable e) + { + // The error must be recorded before setting the finish marker to make sure + // isFinished will observe failure instead of successful completion. + // Only record the first error message. + if (setIf(stateReference, State.failed(e), state -> state.getKind() == INITIAL)) { + // Stop the split loader before finishing the queue. + // Once the queue is finished, it will always return a completed future to avoid blocking any caller. + // This could lead to a short period of busy loop in splitLoader (although unlikely in general setup). + splitLoader.stop(); + queues.finish(); + } + } + + @Override + public CompletableFuture getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) + { + boolean noMoreSplits; + State state = stateReference.get(); + switch (state.getKind()) { + case INITIAL: + noMoreSplits = false; + break; + case NO_MORE_SPLITS: + noMoreSplits = true; + break; + case FAILED: + return failedFuture(state.getThrowable()); + case CLOSED: + throw new IllegalStateException("HiveSplitSource is already closed"); + default: + throw new UnsupportedOperationException(); + } + + OptionalInt bucketNumber = toBucketNumber(partitionHandle); + ListenableFuture> future = queues.borrowBatchAsync(bucketNumber, maxSize, internalSplits -> { + ImmutableList.Builder splitsToInsertBuilder = ImmutableList.builder(); + ImmutableList.Builder resultBuilder = ImmutableList.builder(); + int removedEstimatedSizeInBytes = 0; + for (InternalHiveSplit internalSplit : internalSplits) { + long maxSplitBytes = getMaxSplitBytes(); + + InternalHiveSplit.InternalHiveBlock block = internalSplit.currentBlock(); + long splitBytes; + if (internalSplit.isSplittable()) { + splitBytes = min(maxSplitBytes, block.getEnd() - internalSplit.getStart()); + } + else { + splitBytes = internalSplit.getEnd() - internalSplit.getStart(); + } + boolean splitCacheable = matchesUserDefinedCachedPredicates(internalSplit.getPartitionKeys()); + + resultBuilder.add(HiveSplitWrapper.wrap(new HiveSplit( + databaseName, + tableName, + internalSplit.getPartitionName(), + internalSplit.getPath(), + internalSplit.getStart(), + splitBytes, + internalSplit.getFileSize(), + internalSplit.getLastModifiedTime(), + internalSplit.getSchema(), + internalSplit.getPartitionKeys(), + block.getAddresses(), + internalSplit.getBucketNumber(), + internalSplit.isForceLocalScheduling(), + transformValues(internalSplit.getColumnCoercions(), HiveTypeName::toHiveType), + internalSplit.getBucketConversion(), + internalSplit.isS3SelectPushdownEnabled(), + internalSplit.getDeleteDeltaLocations(), + internalSplit.getStartRowOffsetOfFile(), + splitCacheable, + internalSplit.getCustomSplitInfo()))); + + internalSplit.increaseStart(splitBytes); + + if (internalSplit.isDone()) { + removedEstimatedSizeInBytes += internalSplit.getEstimatedSizeInBytes(); + } + else { + splitsToInsertBuilder.add(internalSplit); + } + } + estimatedSplitSizeInBytes.addAndGet(-removedEstimatedSizeInBytes); + + List splitsToInsert = splitsToInsertBuilder.build(); + List result = resultBuilder.build(); + bufferedInternalSplitCount.addAndGet(splitsToInsert.size() - result.size()); + + return new AsyncQueue.BorrowResult<>(splitsToInsert, result); + }); + + ListenableFuture transform = Futures.transform(future, splits -> { + requireNonNull(splits, "splits is null"); + + // Filter out splits if dynamic filter is available + if (dynamicFilterSupplier != null && isSplitFilteringEnabled) { + splits = splits.stream() + 
.filter(split -> !isPartitionFiltered(HiveSplitWrapper.getOnlyHiveSplit(split).getPartitionKeys(), dynamicFilterSupplier.get(), typeManager)) + .collect(Collectors.toList()); + } + + if (noMoreSplits) { + // Checking splits.isEmpty() here is required for thread safety. + // Let's say there are 10 splits left, and max number of splits per batch is 5. + // The futures constructed in two getNextBatch calls could each fetch 5, resulting in zero splits left. + // After fetching the splits, both futures reach this line at the same time. + // Without the isEmpty check, both will claim they are the last. + // Side note 1: In such a case, it doesn't actually matter which one gets to claim it's the last. + // But having both claim they are the last would be a surprising behavior. + // Side note 2: One could argue that the isEmpty check is overly conservative. + // The caller of getNextBatch will likely need to make an extra invocation. + // But an extra invocation likely doesn't matter. + return new ConnectorSplitBatch(splits, splits.isEmpty() && queues.isFinished(bucketNumber)); + } + else { + return new ConnectorSplitBatch(splits, false); + } + }, directExecutor()); + + return toCompletableFuture(transform); + } + + /** + * Validate the partitions key against all the user defined predicates + * to determine whether or not that split should be cached. + * + * @return true if partition key matches the user defined cache predicates + * false otherwise + */ + private boolean matchesUserDefinedCachedPredicates(List partitionKeys) + { + if (userDefinedCachePredicates == null || userDefinedCachePredicates.isEmpty() || partitionKeys == null || partitionKeys.isEmpty()) { + return false; + } + + try { + Map hivePartitionKeyMap = partitionKeys.stream().collect(Collectors.toMap(HivePartitionKey::getName, Function.identity())); + for (TupleDomain tupleDomain : userDefinedCachePredicates) { + if (!tupleDomain.getDomains().isPresent()) { + continue; + } + + Map domainMap = tupleDomain.getDomains().get(); + Collection columnsDefinedInPredicate = domainMap.keySet().stream().map(ColumnMetadata::getName).collect(Collectors.toList()); + if (!hivePartitionKeyMap.keySet().containsAll(columnsDefinedInPredicate)) { + continue; + } + + boolean allMatches = domainMap.entrySet().stream().allMatch(entry -> { + ColumnMetadata columnMetadata = entry.getKey(); + Domain domain = entry.getValue(); + String partitionStringValue = hivePartitionKeyMap.get(columnMetadata.getName()).getValue(); + NullableValue nullableValue; + if (partitionStringValue.equals("\\N")) { + nullableValue = NullableValue.asNull(columnMetadata.getType()); + } + else { + nullableValue = HiveUtil.parsePartitionValue(columnMetadata.getName(), partitionStringValue, columnMetadata.getType()); + } + return domain.includesNullableValue(nullableValue.getValue()); + }); + + if (allMatches) { + return true; + } + } + } + catch (Exception ex) { + log.warn(ex, "Unable to match partition keys %s with cached predicates. Ignoring this partition key. 
Error = %s", partitionKeys, ex.getMessage()); + } + return false; + } + + @Override + public boolean isFinished() + { + State state = stateReference.get(); + + switch (state.getKind()) { + case INITIAL: + return false; + case NO_MORE_SPLITS: + return bufferedInternalSplitCount.get() == 0; + case FAILED: + throw propagatePrestoException(state.getThrowable()); + case CLOSED: + throw new IllegalStateException("HiveSplitSource is already closed"); + default: + throw new UnsupportedOperationException(); + } + } + + @Override + public void close() + { + if (setIf(stateReference, State.closed(), state -> state.getKind() == INITIAL || state.getKind() == NO_MORE_SPLITS)) { + // Stop the split loader before finishing the queue. + // Once the queue is finished, it will always return a completed future to avoid blocking any caller. + // This could lead to a short period of busy loop in splitLoader (although unlikely in general setup). + splitLoader.stop(); + queues.finish(); + } + } + + private static OptionalInt toBucketNumber(ConnectorPartitionHandle partitionHandle) + { + if (partitionHandle == NOT_PARTITIONED) { + return OptionalInt.empty(); + } + return OptionalInt.of(((HivePartitionHandle) partitionHandle).getBucket()); + } + + private static boolean setIf(AtomicReference atomicReference, T newValue, Predicate predicate) + { + while (true) { + T current = atomicReference.get(); + if (!predicate.test(current)) { + return false; + } + if (atomicReference.compareAndSet(current, newValue)) { + return true; + } + } + } + + static RuntimeException propagatePrestoException(Throwable throwable) + { + if (throwable instanceof PrestoException) { + throw (PrestoException) throwable; + } + if (throwable instanceof FileNotFoundException) { + throw new PrestoException(HiveErrorCode.HIVE_FILE_NOT_FOUND, throwable); + } + throw new PrestoException(HiveErrorCode.HIVE_UNKNOWN_ERROR, throwable); + } + + interface PerBucket + { + ListenableFuture offer(OptionalInt bucketNumber, InternalHiveSplit split); + + ListenableFuture borrowBatchAsync(OptionalInt bucketNumber, int maxSize, Function, AsyncQueue.BorrowResult> function); + + void finish(); + + boolean isFinished(OptionalInt bucketNumber); + } + + static class State + { + private final StateKind kind; + private final Throwable throwable; + + private State(StateKind kind, Throwable throwable) + { + this.kind = kind; + this.throwable = throwable; + } + + public StateKind getKind() + { + return kind; + } + + public Throwable getThrowable() + { + checkState(throwable != null); + return throwable; + } + + public static State initial() + { + return new State(INITIAL, null); + } + + public static State noMoreSplits() + { + return new State(NO_MORE_SPLITS, null); + } + + public static State failed(Throwable throwable) + { + return new State(FAILED, throwable); + } + + public static State closed() + { + return new State(CLOSED, null); + } + } + + enum StateKind + { + INITIAL, + NO_MORE_SPLITS, + FAILED, + CLOSED, + } + + @Override + public List groupSmallSplits(List splitList, int maxGroupSize) + { + if (splitList.isEmpty()) { + return splitList; + } + + int maxSmallSplitsCanBeGrouped = Math.max(hiveConfig.getMaxSplitsToGroup(), maxGroupSize); + if (maxSmallSplitsCanBeGrouped < 2) { + return splitList; + } + + if (hiveStorageFormat != HiveStorageFormat.ORC) { + return splitList; + } + + ImmutableList.Builder connectorSplitList = ImmutableList.builder(); + List hiveSplitWrappers = new ArrayList<>(); + splitList.forEach(pendingSplit -> 
hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + + long maxSplitBytes = getMaxSplitBytes(); + + /* + 1) selecting splits that are less than MaxSplitSize and Not cached + 2) using MultiMap bucketNumToHiveSplitsMap separating splits based on bucked Id + 3) in every same bucket ID list, using MultiMap separating splits based on locations + 4) Sorting the List base on size, in descending order + 5) grouping first element from list (large file size) + last element from list (small file size) + i) till it reaches maxSplitSize and other conditions. + ii) Number of files should not cross MinValue(max-splits-to-group, Total number of files with location / Number of selected locations). + */ + + Multimap bucketNumToHiveSplitsMap = HashMultimap.create(); + int replicationFactor = hiveSplitWrappers.get(0).getSplits().get(0).getAddresses().size(); + boolean bucketNumberPresent = hiveSplitWrappers.get(0).getBucketNumber().isPresent(); + + //1> add to MultiMap bucketNumToHiveSplitsMap small files base of bucket number. + if (false == getSmallerSplits(hiveSplitWrappers, bucketNumToHiveSplitsMap, maxSplitBytes, replicationFactor, connectorSplitList)) { + return splitList; + } + + // 3> in each and very Bucket, Multimap based on location, key first 3 location & value List + for (Integer bucketNumber : bucketNumToHiveSplitsMap.keySet()) { + Multimap hostAddressHiveSplits = HashMultimap.create(); + + Collection hiveSplits = bucketNumToHiveSplitsMap.get(bucketNumber); + //Location base Multimap + groupBaseOnLocation(hiveSplits, hostAddressHiveSplits); + + /* + Number of files should not cross MinValue(max-splits-to-group, Total number of files with location / Number of selected locations). + */ + + for (String hostAddressText : hostAddressHiveSplits.keySet()) { + List locationBaseHiveSplits = new ArrayList<>(); + hostAddressHiveSplits.get(hostAddressText).forEach(split1 -> locationBaseHiveSplits.add(split1)); + + //4> sort in descending order by file size + //locationBaseHiveSplits.sort(new HiveSplitSortBySize()); + locationBaseHiveSplits.sort((split1, split2) -> ((int) (split2.getFileSize() - split1.getFileSize()))); + + int numberOfSplitsPerLocation = locationBaseHiveSplits.size(); + + //when number of flies are less than number of replication factor,splits are grouped in to single group. + int avgSplitsPerNode = ((replicationFactor != 0) && (numberOfSplitsPerLocation >= replicationFactor)) ? numberOfSplitsPerLocation / replicationFactor : numberOfSplitsPerLocation; + + List groupedHiveSplit = new ArrayList<>(); + long totalSize = 0; + int numberOfSplitsGrouped = 0; + + // to mean the size , add one big size file + one small size file present in the list + while (!locationBaseHiveSplits.isEmpty()) { + int i = 0; + // add bigger file + totalSize += locationBaseHiveSplits.get(i).getFileSize(); + numberOfSplitsGrouped += 1; + if ((maxSplitBytes < totalSize) || (avgSplitsPerNode < numberOfSplitsGrouped) || (maxSmallSplitsCanBeGrouped < numberOfSplitsGrouped)) { + connectorSplitList.add(HiveSplitWrapper.wrap(groupedHiveSplit, bucketNumberPresent ? 
OptionalInt.of(bucketNumber) : OptionalInt.empty())); + log.debug("info table %s, groupedHiveSplit size %d, maxSplitBytes %d, totalSize %d, avgSplitsPerNode %d, numberOfSplitsGrouped %d, maxSmallSplitsCanBeGrouped %d, numberOfSplitsGrouped %d ", + groupedHiveSplit.get(0).getTable(), groupedHiveSplit.size(), maxSplitBytes, totalSize, avgSplitsPerNode, numberOfSplitsGrouped, maxSmallSplitsCanBeGrouped, numberOfSplitsGrouped); + totalSize = 0; + numberOfSplitsGrouped = 0; + groupedHiveSplit = new ArrayList<>(); + continue; + } + groupedHiveSplit.add(locationBaseHiveSplits.get(i)); + locationBaseHiveSplits.remove(i); + if (locationBaseHiveSplits.isEmpty()) { + break; + } + + // add smaller file + int lastSplitLocation = locationBaseHiveSplits.size() - 1; + totalSize += locationBaseHiveSplits.get(lastSplitLocation).getFileSize(); + numberOfSplitsGrouped += 1; + if ((maxSplitBytes < totalSize) || (avgSplitsPerNode < numberOfSplitsGrouped) || (maxSmallSplitsCanBeGrouped < numberOfSplitsGrouped)) { + connectorSplitList.add(HiveSplitWrapper.wrap(groupedHiveSplit, bucketNumberPresent ? OptionalInt.of(bucketNumber) : OptionalInt.empty())); + log.debug("info table %s, groupedHiveSplit size %d, maxSplitBytes %d, totalSize %d, avgSplitsPerNode %d, numberOfSplitsGrouped %d, maxSmallSplitsCanBeGrouped %d, numberOfSplitsGrouped %d ", + groupedHiveSplit.get(0).getTable(), groupedHiveSplit.size(), maxSplitBytes, totalSize, avgSplitsPerNode, numberOfSplitsGrouped, maxSmallSplitsCanBeGrouped, numberOfSplitsGrouped); + totalSize = 0; + numberOfSplitsGrouped = 0; + groupedHiveSplit = new ArrayList<>(); + continue; + } + + groupedHiveSplit.add(locationBaseHiveSplits.get(lastSplitLocation)); + locationBaseHiveSplits.remove(lastSplitLocation); + } + if (!groupedHiveSplit.isEmpty()) { + connectorSplitList.add(HiveSplitWrapper.wrap(groupedHiveSplit, bucketNumberPresent ? OptionalInt.of(bucketNumber) : OptionalInt.empty())); + } + } + } + List resultConnectorSplits = connectorSplitList.build(); + log.debug("info resultBuilder size %d", resultConnectorSplits.size()); + return resultConnectorSplits; + } + + private boolean getSmallerSplits(List hiveSplitWrappers, Multimap bucketNumberHiveSplits, + long maxSplitBytes, int replicationFactor, ImmutableList.Builder connectorSplitList) + { + int numSmallSplits = 0; + for (HiveSplitWrapper hiveSplitWrapper : hiveSplitWrappers) { + HiveSplit hiveSplit = hiveSplitWrapper.getSplits().get(0); + long fileSize = hiveSplit.getFileSize(); + + if (hiveSplit.isCacheable() || (replicationFactor != hiveSplit.getAddresses().size())) { + //if different files have different replication factor or cache table, if will not be grouped. + return false; + } + + // 1) filtering small files. + if (fileSize < maxSplitBytes) { + //2) using MultiMap separating splits based on bucked Id. + bucketNumberHiveSplits.put(hiveSplit.getBucketNumber().isPresent() ? 
hiveSplit.getBucketNumber().getAsInt() : 0, hiveSplit); + numSmallSplits++; + } + else { + connectorSplitList.add(HiveSplitWrapper.wrap(hiveSplit)); + } + } + + if (0 == numSmallSplits) { + // There are no small files to group + return false; + } + + log.info("info total Split %d, numSmallSplits %d ", hiveSplitWrappers.size(), numSmallSplits); + return true; + } + + private void groupBaseOnLocation(Collection bucketBasedHiveSplits, Multimap hostAddressHiveSplits) + { + for (HiveSplit hiveSplit : bucketBasedHiveSplits) { + List hostAddresses = new ArrayList<>(); + hostAddresses.addAll(hiveSplit.getAddresses()); + hostAddresses.sort((host1, host2) -> (host1.getHostText().compareTo(host2.getHostText()))); + + StringBuilder hostAddressText = new StringBuilder(); + hostAddresses.forEach(hostAddress -> hostAddressText.append(hostAddress.getHostText())); + hostAddressHiveSplits.put(hostAddressText.toString(), hiveSplit); + } + } + + private long getMaxSplitBytes() + { + long maxSplitBytes = maxSplitSize.toBytes(); + if (remainingInitialSplits.get() > 0) { + if (remainingInitialSplits.getAndDecrement() > 0) { + maxSplitBytes = maxInitialSplitSize.toBytes(); + } + } + return maxSplitBytes; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitWrapper.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitWrapper.java new file mode 100644 index 00000000..33ef1acf --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveSplitWrapper.java @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.HostAddress; +import io.prestosql.spi.connector.ConnectorSplit; + +import java.util.List; +import java.util.OptionalInt; +import java.util.stream.Collectors; + +import static com.google.common.collect.Iterables.getOnlyElement; +import static java.util.Objects.requireNonNull; + +public class HiveSplitWrapper + implements ConnectorSplit +{ + private final List splits; + private final OptionalInt bucketNumber; + + @JsonCreator + public HiveSplitWrapper( + @JsonProperty("splits") List splits, + @JsonProperty("bucketNumber") OptionalInt bucketNumber) + { + this.splits = requireNonNull(splits, "split lists is null"); + this.bucketNumber = bucketNumber; + } + + @Override + public String getFilePath() + { + return splits.stream().findFirst().orElseThrow(IllegalArgumentException::new).getFilePath(); + } + + @Override + public long getStartIndex() + { + return splits.stream().findFirst().orElseThrow(IllegalArgumentException::new).getStartIndex(); + } + + @Override + public long getEndIndex() + { + return splits.stream().findFirst().orElseThrow(IllegalArgumentException::new).getEndIndex(); + } + + @Override + public long getLastModifiedTime() + { + return splits.stream().findFirst().orElseThrow(IllegalArgumentException::new).getLastModifiedTime(); + } + + @Override + public boolean isCacheable() + { + return splits.stream().findFirst().orElseThrow(IllegalAccessError::new).isCacheable(); + } + + @JsonProperty + public List getSplits() + { + return splits; + } + + @Override + public List getUnwrappedSplits() + { + /* + * Splits are unwrapped here when called by Split#getSplits() + * for Split Assignment processing in Reuse Exchange, where the Consumer splits are assigned + * to same node as Producer. Split by Split comparison is done for the same + * and therefore, wrapped splits cannot be used. 
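+     * Each contained HiveSplit is therefore re-wrapped into its own single-split HiveSplitWrapper before being returned.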
+ * */ + return splits.stream().map(x -> wrap(x)).collect(Collectors.toList()); + } + + @JsonProperty + public OptionalInt getBucketNumber() + { + return bucketNumber; + } + + @Override + public boolean isRemotelyAccessible() + { + return true; + } + + @Override + public List getAddresses() + { + return splits.stream() + .flatMap(s -> s.getAddresses().stream()) + .distinct() + .collect(Collectors.toList()); + } + + @Override + public Object getInfo() + { + if (splits.isEmpty()) { + return ImmutableMap.of(); + } + HiveSplit split = splits.get(0); + return ImmutableMap.builder() + .put("hosts", getAddresses()) + .put("database", split.getDatabase()) + .put("table", split.getTable()) + .put("partitionName", split.getPartitionName()) + .build(); + } + + public static HiveSplitWrapper wrap(HiveSplit hiveSplit) + { + return new HiveSplitWrapper(ImmutableList.of(hiveSplit), hiveSplit.getBucketNumber()); + } + + public static HiveSplitWrapper wrap(List hiveSplitList, OptionalInt bucketNumber) + { + return new HiveSplitWrapper(hiveSplitList, bucketNumber); + } + + public static HiveSplit getOnlyHiveSplit(ConnectorSplit connectorSplit) + { + return getOnlyElement(((HiveSplitWrapper) connectorSplit).getSplits()); + } + + @Override + public int getSplitCount() + { + return splits.size(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveStorageFormat.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveStorageFormat.java new file mode 100644 index 00000000..74da4a24 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveStorageFormat.java @@ -0,0 +1,173 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.airlift.units.DataSize; +import io.airlift.units.DataSize.Unit; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat; +import org.apache.hadoop.hive.ql.io.RCFileInputFormat; +import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; +import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat; +import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde2.OpenCSVSerde; +import org.apache.hadoop.hive.serde2.avro.AvroSerDe; +import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; +import org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.mapred.SequenceFileInputFormat; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hive.hcatalog.data.JsonSerDe; + +import java.util.List; + +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public enum HiveStorageFormat + implements BaseStorageFormat +{ + ORC( + OrcSerde.class.getName(), + OrcInputFormat.class.getName(), + OrcOutputFormat.class.getName(), + new DataSize(256, Unit.MEGABYTE)), + PARQUET( + ParquetHiveSerDe.class.getName(), + MapredParquetInputFormat.class.getName(), + MapredParquetOutputFormat.class.getName(), + new DataSize(128, Unit.MEGABYTE)), + AVRO( + AvroSerDe.class.getName(), + AvroContainerInputFormat.class.getName(), + AvroContainerOutputFormat.class.getName(), + new DataSize(64, Unit.MEGABYTE)), + RCBINARY( + LazyBinaryColumnarSerDe.class.getName(), + RCFileInputFormat.class.getName(), + RCFileOutputFormat.class.getName(), + new DataSize(8, Unit.MEGABYTE)), + RCTEXT( + ColumnarSerDe.class.getName(), + RCFileInputFormat.class.getName(), + RCFileOutputFormat.class.getName(), + new DataSize(8, Unit.MEGABYTE)), + SEQUENCEFILE( + LazySimpleSerDe.class.getName(), + SequenceFileInputFormat.class.getName(), + HiveSequenceFileOutputFormat.class.getName(), + new DataSize(8, Unit.MEGABYTE)), + JSON( + JsonSerDe.class.getName(), + TextInputFormat.class.getName(), + HiveIgnoreKeyTextOutputFormat.class.getName(), + new DataSize(8, Unit.MEGABYTE)), + TEXTFILE( + LazySimpleSerDe.class.getName(), + TextInputFormat.class.getName(), + HiveIgnoreKeyTextOutputFormat.class.getName(), + new DataSize(8, Unit.MEGABYTE)), + CSV( + OpenCSVSerde.class.getName(), + TextInputFormat.class.getName(), + HiveIgnoreKeyTextOutputFormat.class.getName(), + new DataSize(8, Unit.MEGABYTE)); + + private final String serde; + private final String inputFormat; + private final String outputFormat; + private 
final DataSize estimatedWriterSystemMemoryUsage; + + HiveStorageFormat(String serde, String inputFormat, String outputFormat, DataSize estimatedWriterSystemMemoryUsage) + { + this.serde = requireNonNull(serde, "serde is null"); + this.inputFormat = requireNonNull(inputFormat, "inputFormat is null"); + this.outputFormat = requireNonNull(outputFormat, "outputFormat is null"); + this.estimatedWriterSystemMemoryUsage = requireNonNull(estimatedWriterSystemMemoryUsage, "estimatedWriterSystemMemoryUsage is null"); + } + + public String getSerDe() + { + return serde; + } + + public String getInputFormat() + { + return inputFormat; + } + + public String getOutputFormat() + { + return outputFormat; + } + + public DataSize getEstimatedWriterSystemMemoryUsage() + { + return estimatedWriterSystemMemoryUsage; + } + + public void validateColumns(List handles) + { + if (this == AVRO) { + for (HiveColumnHandle handle : handles) { + if (!handle.isPartitionKey()) { + validateAvroType(handle.getHiveType().getTypeInfo(), handle.getName()); + } + } + } + } + + private static void validateAvroType(TypeInfo type, String columnName) + { + if (type.getCategory() == Category.MAP) { + TypeInfo keyType = mapTypeInfo(type).getMapKeyTypeInfo(); + if ((keyType.getCategory() != Category.PRIMITIVE) || + (primitiveTypeInfo(keyType).getPrimitiveCategory() != PrimitiveCategory.STRING)) { + throw new PrestoException(NOT_SUPPORTED, format("Column %s has a non-varchar map key, which is not supported by Avro", columnName)); + } + } + else if (type.getCategory() == Category.PRIMITIVE) { + PrimitiveCategory primitive = primitiveTypeInfo(type).getPrimitiveCategory(); + if (primitive == PrimitiveCategory.BYTE) { + throw new PrestoException(NOT_SUPPORTED, format("Column %s is tinyint, which is not supported by Avro. Use integer instead.", columnName)); + } + if (primitive == PrimitiveCategory.SHORT) { + throw new PrestoException(NOT_SUPPORTED, format("Column %s is smallint, which is not supported by Avro. Use integer instead.", columnName)); + } + } + } + + private static PrimitiveTypeInfo primitiveTypeInfo(TypeInfo typeInfo) + { + return (PrimitiveTypeInfo) typeInfo; + } + + private static MapTypeInfo mapTypeInfo(TypeInfo typeInfo) + { + return (MapTypeInfo) typeInfo; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableHandle.java new file mode 100644 index 00000000..be7c7568 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableHandle.java @@ -0,0 +1,472 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import org.apache.hadoop.hive.ql.io.AcidUtils; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static io.prestosql.plugin.hive.HiveMetadata.STORAGE_FORMAT; +import static java.util.Objects.requireNonNull; + +public class HiveTableHandle + implements ConnectorTableHandle +{ + private final String schemaName; + private final String tableName; + private final Optional> tableParameters; + private final List partitionColumns; + private final Optional> partitions; + private final TupleDomain compactEffectivePredicate; + private final TupleDomain enforcedConstraint; + private final Optional bucketHandle; + private final Optional bucketFilter; + private final Optional>> analyzePartitionValues; + private final Map predicateColumns; + private final Optional>> disjunctCompactEffectivePredicate; + private final boolean suitableToPush; +// private final RowExpression remainingPredicate; //For Complex Expression. + private final HiveOffloadExpression offloadExpression; + + @JsonCreator + public HiveTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("partitionColumns") List partitionColumns, + @JsonProperty("compactEffectivePredicate") TupleDomain compactEffectivePredicate, + @JsonProperty("enforcedConstraint") TupleDomain enforcedConstraint, + @JsonProperty("bucketHandle") Optional bucketHandle, + @JsonProperty("bucketFilter") Optional bucketFilter, + @JsonProperty("analyzePartitionValues") Optional>> analyzePartitionValues, + @JsonProperty("predicateColumns") Map predicateColumns, + @JsonProperty("additionaPredicates") Optional>> disjunctCompactEffectivePredicate, + @JsonProperty("suitableToPush") boolean suitableToPush, + @JsonProperty("offloadExpression") HiveOffloadExpression offloadExpression) + { + this( + schemaName, + tableName, + Optional.empty(), + partitionColumns, + Optional.empty(), + compactEffectivePredicate, + enforcedConstraint, + bucketHandle, + bucketFilter, + analyzePartitionValues, + predicateColumns, + disjunctCompactEffectivePredicate, + suitableToPush, + offloadExpression); + } + + public HiveTableHandle( + String schemaName, + String tableName, + Map tableParameters, + List partitionColumns, + Optional bucketHandle) + { + this( + schemaName, + tableName, + Optional.of(tableParameters), + partitionColumns, + Optional.empty(), + TupleDomain.all(), + TupleDomain.all(), + bucketHandle, + Optional.empty(), + Optional.empty(), + Collections.emptyMap(), + Optional.empty(), + false, + new HiveOffloadExpression()); + } + + public HiveTableHandle( + String schemaName, + String tableName, + Optional> tableParameters, + List partitionColumns, + Optional> partitions, + TupleDomain compactEffectivePredicate, + TupleDomain enforcedConstraint, + Optional bucketHandle, + Optional 
bucketFilter, + Optional>> analyzePartitionValues, + Map predicateColumns, + Optional>> disjunctCompactEffectivePredicate, + boolean suitableToPush, + HiveOffloadExpression offloadExpression) + { + this.schemaName = requireNonNull(schemaName, "schemaName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.tableParameters = requireNonNull(tableParameters, "tableParameters is null").map(ImmutableMap::copyOf); + this.partitionColumns = ImmutableList.copyOf(requireNonNull(partitionColumns, "partitionColumns is null")); + this.partitions = requireNonNull(partitions, "partitions is null").map(ImmutableList::copyOf); + this.compactEffectivePredicate = requireNonNull(compactEffectivePredicate, "compactEffectivePredicate is null"); + this.enforcedConstraint = requireNonNull(enforcedConstraint, "enforcedConstraint is null"); + this.bucketHandle = requireNonNull(bucketHandle, "bucketHandle is null"); + this.bucketFilter = requireNonNull(bucketFilter, "bucketFilter is null"); + this.analyzePartitionValues = requireNonNull(analyzePartitionValues, "analyzePartitionValues is null"); + this.predicateColumns = predicateColumns; + this.disjunctCompactEffectivePredicate = requireNonNull(disjunctCompactEffectivePredicate, "disjunctCompactEffectivePredicate is null"); + this.suitableToPush = suitableToPush; + this.offloadExpression = offloadExpression; + } + + public HiveTableHandle withAnalyzePartitionValues(Optional>> analyzePartitionValues) + { + return new HiveTableHandle( + schemaName, + tableName, + tableParameters, + partitionColumns, + partitions, + compactEffectivePredicate, + enforcedConstraint, + bucketHandle, + bucketFilter, + analyzePartitionValues, + predicateColumns, + Optional.empty(), + suitableToPush, + offloadExpression); + } + + public HiveTableHandle withOffloadExpression(HiveOffloadExpression offloadExpression) + { + return new HiveTableHandle( + schemaName, + tableName, + tableParameters, + partitionColumns, + partitions, + compactEffectivePredicate, + enforcedConstraint, + bucketHandle, + bucketFilter, + analyzePartitionValues, + predicateColumns, + disjunctCompactEffectivePredicate, + suitableToPush, + offloadExpression); + } + + @JsonProperty + public String getSchemaName() + { + return schemaName; + } + + @JsonProperty + public String getTableName() + { + return tableName; + } + + // do not serialize tableParameters as they are not needed on workers + @JsonIgnore + public Optional> getTableParameters() + { + return tableParameters; + } + + //hetu: add implementation to get qualified name + @Override + public String getSchemaPrefixedTableName() + { + return String.format("%s.%s", schemaName, tableName); + } + + //hetu: return true for isFilterSupported for hive connector + @Override + public boolean isFilterSupported() + { + return true; + } + + @JsonProperty + public List getPartitionColumns() + { + return partitionColumns; + } + + // do not serialize partitions as they are not needed on workers + @JsonIgnore + public Optional> getPartitions() + { + return partitions; + } + + @JsonProperty + public TupleDomain getCompactEffectivePredicate() + { + return compactEffectivePredicate; + } + + @JsonProperty + public Map getPredicateColumns() + { + return predicateColumns; + } + + @JsonProperty + public TupleDomain getEnforcedConstraint() + { + return enforcedConstraint; + } + + @JsonProperty + public Optional getBucketHandle() + { + return bucketHandle; + } + + @JsonProperty + public Optional getBucketFilter() + { + return bucketFilter; + } + + 
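HiveTableHandle is immutable: the with* helpers above copy every field into a new handle and swap in a single value. A minimal usage sketch, assuming a planner-side caller already has a handle and a computed HiveOffloadExpression in scope (the variable names `handle` and `expression` are hypothetical):

    HiveTableHandle offloaded = handle.withOffloadExpression(expression);
    // Identifying fields are carried over unchanged; only the offload expression differs.
    assert offloaded.getSchemaName().equals(handle.getSchemaName());
    assert offloaded.getTableName().equals(handle.getTableName());
    assert offloaded.getOffloadExpression() == expression;

Note that withAnalyzePartitionValues, unlike withOffloadExpression, passes Optional.empty() for the disjunct compact predicates, so that copy does not carry them forward.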
@JsonProperty + public Optional>> getAnalyzePartitionValues() + { + return analyzePartitionValues; + } + + public SchemaTableName getSchemaTableName() + { + return new SchemaTableName(schemaName, tableName); + } + + @JsonProperty + public Optional>> getDisjunctCompactEffectivePredicate() + { + return disjunctCompactEffectivePredicate; + } + + @JsonProperty + public boolean isSuitableToPush() + { + return suitableToPush; + } + + @JsonProperty + public HiveOffloadExpression getOffloadExpression() + { + return offloadExpression; + } + + /** + * Hetu execution plan caching functionality requires a method to update + * {@link ConnectorTableHandle} from a previous execution plan with new info from + * a new query. This is a workaround for hetu-main module to modify jdbc connector table + * handles in a generic way without needing access to classes in hetu-hive package or + * knowledge of connector specific constructor implementations. Connectors must override this + * method to support execution plan caching. + * + * @param oldConnectorTableHandle connector table handle containing information + * to be passed into a new {@link HiveTableHandle} + * @return new {@link HiveTableHandle} containing the constraints, limit, + * and subquery from an old {@link HiveTableHandle} + */ + @Override + public ConnectorTableHandle createFrom(ConnectorTableHandle oldConnectorTableHandle) + { + HiveTableHandle oldHiveConnectorTableHandle = (HiveTableHandle) oldConnectorTableHandle; + return new HiveTableHandle(oldHiveConnectorTableHandle.getSchemaName(), + oldHiveConnectorTableHandle.getTableName(), + oldHiveConnectorTableHandle.getTableParameters(), + oldHiveConnectorTableHandle.getPartitionColumns(), + oldHiveConnectorTableHandle.getPartitions(), + oldHiveConnectorTableHandle.getCompactEffectivePredicate(), + oldHiveConnectorTableHandle.getEnforcedConstraint(), + oldHiveConnectorTableHandle.getBucketHandle(), + oldHiveConnectorTableHandle.getBucketFilter(), + oldHiveConnectorTableHandle.getAnalyzePartitionValues(), + oldHiveConnectorTableHandle.getPredicateColumns(), + oldHiveConnectorTableHandle.getDisjunctCompactEffectivePredicate(), + oldHiveConnectorTableHandle.isSuitableToPush(), + oldHiveConnectorTableHandle.getOffloadExpression()); + } + + @Override + public boolean hasDisjunctFiltersPushdown() + { + return disjunctCompactEffectivePredicate.isPresent() + && disjunctCompactEffectivePredicate.get().size() > 0; + } + + private String formatPredicate(Function printer, TupleDomain predicate) + { + return predicate.getDomains().get().entrySet().stream() + .map(filter -> filter.getKey().getColumnName() + " <- " + printer.apply(filter.getValue())) + .collect(Collectors.joining(" AND ", "{", "}")); + } + + @Override + public String getDisjunctFilterConditions(Function printer) + { + if (disjunctCompactEffectivePredicate.isPresent()) { + return disjunctCompactEffectivePredicate.get().stream() + .map(predicate -> "[ " + formatPredicate(printer, predicate) + " ]") + .collect(Collectors.joining(" OR ")); + } + + return ""; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HiveTableHandle that = (HiveTableHandle) o; + return Objects.equals(schemaName, that.schemaName) && + Objects.equals(tableName, that.tableName) && + Objects.equals(tableParameters, that.tableParameters) && + Objects.equals(partitionColumns, that.partitionColumns) && + Objects.equals(partitions, that.partitions) && + 
Objects.equals(compactEffectivePredicate, that.compactEffectivePredicate) && + Objects.equals(enforcedConstraint, that.enforcedConstraint) && + Objects.equals(bucketHandle, that.bucketHandle) && + Objects.equals(bucketFilter, that.bucketFilter) && + Objects.equals(analyzePartitionValues, that.analyzePartitionValues) && + Objects.equals(predicateColumns, that.predicateColumns) && + Objects.equals(disjunctCompactEffectivePredicate, that.disjunctCompactEffectivePredicate) && + suitableToPush == that.suitableToPush; + } + + @Override + public boolean basicEquals(ConnectorTableHandle o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HiveTableHandle that = (HiveTableHandle) o; + return Objects.equals(schemaName, that.schemaName) && + Objects.equals(tableName, that.tableName) && + Objects.equals(tableParameters, that.tableParameters) && + Objects.equals(partitionColumns, that.partitionColumns) && + Objects.equals(bucketHandle, that.bucketHandle); + } + + @Override + public int hashCode() + { + return Objects.hash(schemaName, tableName); + } + + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + builder.append(schemaName).append(":").append(tableName); + bucketHandle.ifPresent(bucket -> + builder.append(" bucket=").append(bucket.getReadBucketCount())); + builder.append(offloadExpression.toString()); + return builder.toString(); + } + + @Override + public boolean isDeleteAsInsertSupported() + { + return AcidUtils.isTransactionalTable(getTableParameters().get()) && !AcidUtils.isInsertOnlyTable(getTableParameters().get()); + } + + @Override + public boolean isUpdateAsInsertSupported() + { + return true; + } + + @Override + public boolean isSuitableForPushdown() + { + return this.suitableToPush; + } + + @Override + public boolean isTableCacheable() + { + return HiveStorageFormat.ORC.getOutputFormat().equals(tableParameters.get().get(STORAGE_FORMAT)); + } + + /** + * ORC is the only format supported to create heuristic index now + * We will add more formats in the future. + */ + @Override + public boolean isHeuristicIndexSupported() + { + return Stream.of(HiveStorageFormat.ORC) + .anyMatch(storageFormat -> storageFormat.getOutputFormat().equals(tableParameters.get().get(STORAGE_FORMAT))); + } + + /** + * Create heuristic index... 
where predicate = xxx + * The predicate column only support partition columns + */ + @Override + public boolean isPartitionColumn(String column) + { + return partitionColumns.stream().map(HiveColumnHandle::getColumnName).collect(Collectors.toSet()).contains(column); + } + + /* This method checks if reuse table scan can be used*/ + @Override + public boolean isReuseTableScanSupported() + { + return true; + } + + /* This method checks if table properties caching supported*/ + @Override + public boolean isTablePropertiesCacheSupported() + { + return true; + } + + @Override + public boolean isSortBasedAggregationSupported() + { + return true; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableProperties.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableProperties.java new file mode 100644 index 00000000..84bce23f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTableProperties.java @@ -0,0 +1,307 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.session.PropertyMetadata; +import io.prestosql.spi.type.TypeManager; + +import javax.inject.Inject; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V1; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V2; +import static io.prestosql.spi.StandardErrorCode.INVALID_TABLE_PROPERTY; +import static io.prestosql.spi.session.PropertyMetadata.booleanProperty; +import static io.prestosql.spi.session.PropertyMetadata.doubleProperty; +import static io.prestosql.spi.session.PropertyMetadata.enumProperty; +import static io.prestosql.spi.session.PropertyMetadata.integerProperty; +import static io.prestosql.spi.session.PropertyMetadata.stringProperty; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static java.lang.String.format; +import static java.util.Locale.ENGLISH; + +public class HiveTableProperties +{ + @Deprecated + public static final String EXTERNAL_LOCATION_PROPERTY = "external_location"; + + public static final String LOCATION_PROPERTY = "location"; + public static final String IS_EXTERNAL_TABLE = "external"; + public static final String STORAGE_FORMAT_PROPERTY = "format"; + public static final String PARTITIONED_BY_PROPERTY = "partitioned_by"; + public static final String BUCKETED_BY_PROPERTY = "bucketed_by"; + 
public static final String BUCKETING_VERSION = "bucketing_version"; + public static final String BUCKET_COUNT_PROPERTY = "bucket_count"; + public static final String SORTED_BY_PROPERTY = "sorted_by"; + public static final String ORC_BLOOM_FILTER_COLUMNS = "orc_bloom_filter_columns"; + public static final String ORC_BLOOM_FILTER_FPP = "orc_bloom_filter_fpp"; + public static final String AVRO_SCHEMA_URL = "avro_schema_url"; + public static final String TEXTFILE_SKIP_HEADER_LINE_COUNT = "textfile_skip_header_line_count"; + public static final String TEXTFILE_SKIP_FOOTER_LINE_COUNT = "textfile_skip_footer_line_count"; + public static final String CSV_SEPARATOR = "csv_separator"; + public static final String CSV_QUOTE = "csv_quote"; + public static final String CSV_ESCAPE = "csv_escape"; + public static final String TRANSACTIONAL = "transactional"; + + public static final Set NON_INHERITABLE_PROPERTIES = ImmutableSet.of(EXTERNAL_LOCATION_PROPERTY, IS_EXTERNAL_TABLE, LOCATION_PROPERTY); + + private final List> tableProperties; + + @Inject + public HiveTableProperties(TypeManager typeManager, HiveConfig config) + { + tableProperties = ImmutableList.of( + stringProperty( + EXTERNAL_LOCATION_PROPERTY, + format("Deprecated, use '%s' and '%s' table properties instead", LOCATION_PROPERTY, IS_EXTERNAL_TABLE), + null, + false), + stringProperty( + LOCATION_PROPERTY, + "File system location URI for the table", + null, + false), + booleanProperty( + IS_EXTERNAL_TABLE, + "Is the table external Hive table", + false, + false), + enumProperty( + STORAGE_FORMAT_PROPERTY, + "Hive storage format for the table", + HiveStorageFormat.class, + config.getHiveStorageFormat(), + false), + new PropertyMetadata<>( + PARTITIONED_BY_PROPERTY, + "Partition columns", + typeManager.getType(parseTypeSignature("array(varchar)")), + List.class, + ImmutableList.of(), + false, + value -> ImmutableList.copyOf(((Collection) value).stream() + .map(name -> ((String) name).toLowerCase(ENGLISH)) + .collect(Collectors.toList())), + value -> value), + new PropertyMetadata<>( + BUCKETED_BY_PROPERTY, + "Bucketing columns", + typeManager.getType(parseTypeSignature("array(varchar)")), + List.class, + ImmutableList.of(), + false, + value -> ImmutableList.copyOf(((Collection) value).stream() + .map(name -> ((String) name).toLowerCase(ENGLISH)) + .collect(Collectors.toList())), + value -> value), + new PropertyMetadata<>( + SORTED_BY_PROPERTY, + "Bucket sorting columns", + typeManager.getType(parseTypeSignature("array(varchar)")), + List.class, + ImmutableList.of(), + false, + value -> ((Collection) value).stream() + .map(String.class::cast) + .map(name -> name.toLowerCase(ENGLISH)) + .map(HiveTableProperties::sortingColumnFromString) + .collect(toImmutableList()), + value -> ((Collection) value).stream() + .map(SortingColumn.class::cast) + .map(HiveTableProperties::sortingColumnToString) + .collect(toImmutableList())), + new PropertyMetadata<>( + ORC_BLOOM_FILTER_COLUMNS, + "ORC Bloom filter index columns", + typeManager.getType(parseTypeSignature("array(varchar)")), + List.class, + ImmutableList.of(), + false, + value -> ((Collection) value).stream() + .map(String.class::cast) + .map(name -> name.toLowerCase(ENGLISH)) + .collect(toImmutableList()), + value -> value), + doubleProperty( + ORC_BLOOM_FILTER_FPP, + "ORC Bloom filter false positive probability", + config.getOrcDefaultBloomFilterFpp(), + false), + integerProperty(BUCKETING_VERSION, "Bucketing version", null, false), + integerProperty(BUCKET_COUNT_PROPERTY, "Number of buckets", 0, 
false), + stringProperty(AVRO_SCHEMA_URL, "URI pointing to Avro schema for the table", null, false), + integerProperty(TEXTFILE_SKIP_HEADER_LINE_COUNT, "Number of header lines", null, false), + integerProperty(TEXTFILE_SKIP_FOOTER_LINE_COUNT, "Number of footer lines", null, false), + stringProperty(CSV_SEPARATOR, "CSV separator character", null, false), + stringProperty(CSV_QUOTE, "CSV quote character", null, false), + stringProperty(CSV_ESCAPE, "CSV escape character", null, false), + booleanProperty( + TRANSACTIONAL, + "Is transactional property enabled", + false, + false)); + } + + public List> getTableProperties() + { + return tableProperties; + } + + public static String getExternalLocation(Map tableProperties) + { + return (String) tableProperties.get(EXTERNAL_LOCATION_PROPERTY); + } + + public static Optional getLocation(Map tableProperties) + { + return Optional.ofNullable((String) tableProperties.get(LOCATION_PROPERTY)); + } + + public static boolean isExternalTable(Map tableProperties) + { + return Boolean.TRUE.equals(tableProperties.get(IS_EXTERNAL_TABLE)); + } + + public static String getAvroSchemaUrl(Map tableProperties) + { + return (String) tableProperties.get(AVRO_SCHEMA_URL); + } + + public static Optional getTextHeaderSkipCount(Map tableProperties) + { + return Optional.ofNullable((Integer) tableProperties.get(TEXTFILE_SKIP_HEADER_LINE_COUNT)); + } + + public static Optional getTextFooterSkipCount(Map tableProperties) + { + return Optional.ofNullable((Integer) tableProperties.get(TEXTFILE_SKIP_FOOTER_LINE_COUNT)); + } + + public static HiveStorageFormat getHiveStorageFormat(Map tableProperties) + { + return (HiveStorageFormat) tableProperties.get(STORAGE_FORMAT_PROPERTY); + } + + @SuppressWarnings("unchecked") + public static List getPartitionedBy(Map tableProperties) + { + List partitionedBy = (List) tableProperties.get(PARTITIONED_BY_PROPERTY); + return partitionedBy == null ? 
ImmutableList.of() : ImmutableList.copyOf(partitionedBy); + } + + public static Optional getBucketProperty(Map tableProperties) + { + List bucketedBy = getBucketedBy(tableProperties); + List sortedBy = getSortedBy(tableProperties); + int bucketCount = (Integer) tableProperties.get(BUCKET_COUNT_PROPERTY); + if ((bucketedBy.isEmpty()) && (bucketCount == 0)) { + if (!sortedBy.isEmpty()) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s may be specified only when %s is specified", SORTED_BY_PROPERTY, BUCKETED_BY_PROPERTY)); + } + return Optional.empty(); + } + if (bucketCount < 0) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s must be greater than zero", BUCKET_COUNT_PROPERTY)); + } + if (bucketedBy.isEmpty() || bucketCount == 0) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s and %s must be specified together", BUCKETED_BY_PROPERTY, BUCKET_COUNT_PROPERTY)); + } + BucketingVersion bucketingVersion = getBucketingVersion(tableProperties); + return Optional.of(new HiveBucketProperty(bucketedBy, bucketingVersion, bucketCount, sortedBy)); + } + + public static BucketingVersion getBucketingVersion(Map tableProperties) + { + Integer property = (Integer) tableProperties.get(BUCKETING_VERSION); + if (property == null || property == 1) { + return BUCKETING_V1; + } + if (property == 2) { + return BUCKETING_V2; + } + throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s must be between 1 and 2 (inclusive): %s", BUCKETING_VERSION, property)); + } + + @SuppressWarnings("unchecked") + private static List getBucketedBy(Map tableProperties) + { + return (List) tableProperties.get(BUCKETED_BY_PROPERTY); + } + + @SuppressWarnings("unchecked") + private static List getSortedBy(Map tableProperties) + { + return (List) tableProperties.get(SORTED_BY_PROPERTY); + } + + @SuppressWarnings("unchecked") + public static List getOrcBloomFilterColumns(Map tableProperties) + { + return (List) tableProperties.get(ORC_BLOOM_FILTER_COLUMNS); + } + + public static Double getOrcBloomFilterFpp(Map tableProperties) + { + return (Double) tableProperties.get(ORC_BLOOM_FILTER_FPP); + } + + public static Optional getCsvProperty(Map tableProperties, String key) + { + Object value = tableProperties.get(key); + if (value == null) { + return Optional.empty(); + } + String csvValue = (String) value; + if (csvValue.length() != 1) { + throw new PrestoException(INVALID_TABLE_PROPERTY, format("%s must be a single character string, but was: '%s'", key, csvValue)); + } + return Optional.of(csvValue.charAt(0)); + } + + private static SortingColumn sortingColumnFromString(String name) + { + SortingColumn.Order order = SortingColumn.Order.ASCENDING; + String lower = name.toUpperCase(ENGLISH); + if (lower.endsWith(" ASC")) { + name = name.substring(0, name.length() - 4).trim(); + } + else if (lower.endsWith(" DESC")) { + name = name.substring(0, name.length() - 5).trim(); + order = SortingColumn.Order.DESCENDING; + } + return new SortingColumn(name, order); + } + + private static String sortingColumnToString(SortingColumn column) + { + return column.getColumnName() + ((column.getOrder() == SortingColumn.Order.DESCENDING) ? 
" DESC" : ""); + } + + public static boolean getTransactionalValue(Map tableProperties) + { + return Boolean.TRUE.equals(tableProperties.get(TRANSACTIONAL)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionHandle.java new file mode 100644 index 00000000..e009b686 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionHandle.java @@ -0,0 +1,71 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.spi.connector.ConnectorTransactionHandle; + +import java.util.Objects; +import java.util.UUID; + +import static java.util.Objects.requireNonNull; + +public class HiveTransactionHandle + implements ConnectorTransactionHandle +{ + private final UUID uuid; + + public HiveTransactionHandle() + { + this(UUID.randomUUID()); + } + + @JsonCreator + public HiveTransactionHandle(@JsonProperty("uuid") UUID uuid) + { + this.uuid = requireNonNull(uuid, "uuid is null"); + } + + @JsonProperty + public UUID getUuid() + { + return uuid; + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if ((obj == null) || (getClass() != obj.getClass())) { + return false; + } + HiveTransactionHandle other = (HiveTransactionHandle) obj; + return Objects.equals(uuid, other.uuid); + } + + @Override + public int hashCode() + { + return Objects.hash(uuid); + } + + @Override + public String toString() + { + return uuid.toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionManager.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionManager.java new file mode 100644 index 00000000..530adbbf --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTransactionManager.java @@ -0,0 +1,43 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorTransactionHandle; + +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +import static com.google.common.base.Preconditions.checkState; + +public class HiveTransactionManager +{ + private final ConcurrentMap transactions = new ConcurrentHashMap<>(); + + public TransactionalMetadata get(ConnectorTransactionHandle transactionHandle) + { + return transactions.get(transactionHandle); + } + + public TransactionalMetadata remove(ConnectorTransactionHandle transactionHandle) + { + return transactions.remove(transactionHandle); + } + + public void put(ConnectorTransactionHandle transactionHandle, TransactionalMetadata metadata) + { + ConnectorMetadata previousValue = transactions.putIfAbsent(transactionHandle, metadata); + checkState(previousValue == null); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveType.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveType.java new file mode 100644 index 00000000..93f59fd5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveType.java @@ -0,0 +1,291 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonValue; +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.type.NamedTypeSignature; +import io.prestosql.spi.type.RowFieldName; +import io.prestosql.spi.type.StandardTypes; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.TypeSignature; +import io.prestosql.spi.type.TypeSignatureParameter; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; + +import java.util.List; +import java.util.Locale; +import java.util.Optional; + +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.CharType.createCharType; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DecimalType.createDecimalType; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.spi.type.VarcharType.createVarcharType; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.binaryTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.booleanTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.byteTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.dateTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.doubleTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.floatTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.intTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.longTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.shortTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.stringTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.timestampTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfoFromTypeString; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfosFromTypeString; + +public final class HiveType +{ + public static final HiveType HIVE_BOOLEAN = new HiveType(booleanTypeInfo); + public static final HiveType 
HIVE_BYTE = new HiveType(byteTypeInfo); + public static final HiveType HIVE_SHORT = new HiveType(shortTypeInfo); + public static final HiveType HIVE_INT = new HiveType(intTypeInfo); + public static final HiveType HIVE_LONG = new HiveType(longTypeInfo); + public static final HiveType HIVE_FLOAT = new HiveType(floatTypeInfo); + public static final HiveType HIVE_DOUBLE = new HiveType(doubleTypeInfo); + public static final HiveType HIVE_STRING = new HiveType(stringTypeInfo); + public static final HiveType HIVE_TIMESTAMP = new HiveType(timestampTypeInfo); + public static final HiveType HIVE_DATE = new HiveType(dateTypeInfo); + public static final HiveType HIVE_BINARY = new HiveType(binaryTypeInfo); + + private final HiveTypeName hiveTypeName; + private final TypeInfo typeInfo; + + private HiveType(TypeInfo typeInfo) + { + requireNonNull(typeInfo, "typeInfo is null"); + this.hiveTypeName = new HiveTypeName(typeInfo.getTypeName()); + this.typeInfo = typeInfo; + } + + static HiveType createHiveType(TypeInfo typeInfo) + { + return new HiveType(typeInfo); + } + + public HiveTypeName getHiveTypeName() + { + return hiveTypeName; + } + + public Category getCategory() + { + return typeInfo.getCategory(); + } + + public TypeInfo getTypeInfo() + { + return typeInfo; + } + + public TypeSignature getTypeSignature() + { + return getTypeSignature(typeInfo); + } + + public Type getType(TypeManager typeManager) + { + return typeManager.getType(getTypeSignature()); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + HiveType hiveType = (HiveType) o; + + if (!hiveTypeName.equals(hiveType.hiveTypeName)) { + return false; + } + + return true; + } + + @Override + public int hashCode() + { + return hiveTypeName.hashCode(); + } + + @JsonValue + @Override + public String toString() + { + return hiveTypeName.toString(); + } + + public boolean isSupportedType() + { + return isSupportedType(getTypeInfo()); + } + + public static boolean isSupportedType(TypeInfo typeInfo) + { + switch (typeInfo.getCategory()) { + case PRIMITIVE: + return getPrimitiveType((PrimitiveTypeInfo) typeInfo) != null; + case MAP: + MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; + return isSupportedType(mapTypeInfo.getMapKeyTypeInfo()) && isSupportedType(mapTypeInfo.getMapValueTypeInfo()); + case LIST: + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + return isSupportedType(listTypeInfo.getListElementTypeInfo()); + case STRUCT: + StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; + return structTypeInfo.getAllStructFieldTypeInfos().stream() + .allMatch(HiveType::isSupportedType); + } + return false; + } + + @JsonCreator + public static HiveType valueOf(String hiveTypeName) + { + requireNonNull(hiveTypeName, "hiveTypeName is null"); + return toHiveType(getTypeInfoFromTypeString(hiveTypeName)); + } + + public static List toHiveTypes(String hiveTypes) + { + requireNonNull(hiveTypes, "hiveTypes is null"); + return ImmutableList.copyOf(getTypeInfosFromTypeString(hiveTypes).stream() + .map(HiveType::toHiveType) + .collect(toList())); + } + + private static HiveType toHiveType(TypeInfo typeInfo) + { + requireNonNull(typeInfo, "typeInfo is null"); + return new HiveType(typeInfo); + } + + public static HiveType toHiveType(TypeTranslator typeTranslator, Type type) + { + requireNonNull(typeTranslator, "typeTranslator is null"); + requireNonNull(type, "type is null"); + return new HiveType(typeTranslator.translate(type)); + } + + 
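+    // Maps a Hive TypeInfo to the equivalent Presto TypeSignature, recursing into MAP, LIST (array)
+    // and STRUCT types; a TypeInfo that cannot be mapped falls through to the NOT_SUPPORTED exception below.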
private static TypeSignature getTypeSignature(TypeInfo typeInfo) + { + switch (typeInfo.getCategory()) { + case PRIMITIVE: + Type primitiveType = getPrimitiveType((PrimitiveTypeInfo) typeInfo); + if (primitiveType == null) { + break; + } + return primitiveType.getTypeSignature(); + case MAP: + MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; + TypeSignature keyType = getTypeSignature(mapTypeInfo.getMapKeyTypeInfo()); + TypeSignature valueType = getTypeSignature(mapTypeInfo.getMapValueTypeInfo()); + return new TypeSignature( + StandardTypes.MAP, + ImmutableList.of(TypeSignatureParameter.of(keyType), TypeSignatureParameter.of(valueType))); + case LIST: + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + TypeSignature elementType = getTypeSignature(listTypeInfo.getListElementTypeInfo()); + return new TypeSignature( + StandardTypes.ARRAY, + ImmutableList.of(TypeSignatureParameter.of(elementType))); + case STRUCT: + StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; + List structFieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos(); + List structFieldNames = structTypeInfo.getAllStructFieldNames(); + if (structFieldTypeInfos.size() != structFieldNames.size()) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, format("Invalid Hive struct type: %s", typeInfo)); + } + ImmutableList.Builder typeSignatureBuilder = ImmutableList.builder(); + for (int i = 0; i < structFieldTypeInfos.size(); i++) { + TypeSignature typeSignature = getTypeSignature(structFieldTypeInfos.get(i)); + // Lower case the struct field names. + // Otherwise, Presto will refuse to write to columns whose struct type has field names containing upper case characters. + // Users can't work around this by casting in their queries because Presto parser always lower case types. + // TODO: This is a hack. Presto engine should be able to handle identifiers in a case insensitive way where necessary. 
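+                    // Using Locale.US keeps the lowercasing independent of the JVM's default locale.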
+ String rowFieldName = structFieldNames.get(i).toLowerCase(Locale.US); + typeSignatureBuilder.add(TypeSignatureParameter.of(new NamedTypeSignature(Optional.of(new RowFieldName(rowFieldName, false)), typeSignature))); + } + return new TypeSignature(StandardTypes.ROW, typeSignatureBuilder.build()); + } + throw new PrestoException(NOT_SUPPORTED, format("Unsupported Hive type: %s", typeInfo)); + } + + public static Type getPrimitiveType(PrimitiveTypeInfo typeInfo) + { + switch (typeInfo.getPrimitiveCategory()) { + case BOOLEAN: + return BOOLEAN; + case BYTE: + return TINYINT; + case SHORT: + return SMALLINT; + case INT: + return INTEGER; + case LONG: + return BIGINT; + case FLOAT: + return REAL; + case DOUBLE: + return DOUBLE; + case STRING: + return createUnboundedVarcharType(); + case VARCHAR: + return createVarcharType(((VarcharTypeInfo) typeInfo).getLength()); + case CHAR: + return createCharType(((CharTypeInfo) typeInfo).getLength()); + case DATE: + return DATE; + case TIMESTAMP: + return TIMESTAMP; + case BINARY: + return VARBINARY; + case DECIMAL: + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; + return createDecimalType(decimalTypeInfo.precision(), decimalTypeInfo.scale()); + default: + return null; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeName.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeName.java new file mode 100644 index 00000000..e4e65ba9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeName.java @@ -0,0 +1,68 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import org.openjdk.jol.info.ClassLayout; + +import java.util.Objects; + +import static java.util.Objects.requireNonNull; + +public final class HiveTypeName +{ + private static final int INSTANCE_SIZE = ClassLayout.parseClass(HivePartitionKey.class).instanceSize() + + ClassLayout.parseClass(String.class).instanceSize(); + + private final String value; + + public HiveTypeName(String value) + { + this.value = requireNonNull(value, "value is null"); + } + + @Override + public String toString() + { + return value; + } + + public HiveType toHiveType() + { + return HiveType.valueOf(value); + } + + public int getEstimatedSizeInBytes() + { + return INSTANCE_SIZE + value.length() * Character.BYTES; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HiveTypeName that = (HiveTypeName) o; + return Objects.equals(value, that.value); + } + + @Override + public int hashCode() + { + return Objects.hash(value); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeTranslator.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeTranslator.java new file mode 100644 index 00000000..b917c512 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveTypeTranslator.java @@ -0,0 +1,150 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.NamedTypeSignature; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignatureParameter; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import static io.prestosql.plugin.hive.HiveType.HIVE_BINARY; +import static io.prestosql.plugin.hive.HiveType.HIVE_BOOLEAN; +import static io.prestosql.plugin.hive.HiveType.HIVE_BYTE; +import static io.prestosql.plugin.hive.HiveType.HIVE_DATE; +import static io.prestosql.plugin.hive.HiveType.HIVE_DOUBLE; +import static io.prestosql.plugin.hive.HiveType.HIVE_FLOAT; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.plugin.hive.HiveType.HIVE_SHORT; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.HiveType.HIVE_TIMESTAMP; +import static io.prestosql.plugin.hive.HiveUtil.isArrayType; +import static io.prestosql.plugin.hive.HiveUtil.isMapType; +import static io.prestosql.plugin.hive.HiveUtil.isRowType; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static java.lang.String.format; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getListTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getMapTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getStructTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getVarcharTypeInfo; + +public class HiveTypeTranslator + implements TypeTranslator +{ + @Override + public TypeInfo translate(Type type) + { + if (BOOLEAN.equals(type)) { + return HIVE_BOOLEAN.getTypeInfo(); + } + if (BIGINT.equals(type)) { + return HIVE_LONG.getTypeInfo(); + } + if (INTEGER.equals(type)) { + return HIVE_INT.getTypeInfo(); + } + if (SMALLINT.equals(type)) { + return HIVE_SHORT.getTypeInfo(); + } + if (TINYINT.equals(type)) { + return HIVE_BYTE.getTypeInfo(); + } + if (REAL.equals(type)) { + return HIVE_FLOAT.getTypeInfo(); + } + if (DOUBLE.equals(type)) { + return HIVE_DOUBLE.getTypeInfo(); + } + if (type instanceof VarcharType) { + VarcharType varcharType = (VarcharType) type; + if (varcharType.isUnbounded()) { + return HIVE_STRING.getTypeInfo(); + } + if (varcharType.getBoundedLength() <= HiveVarchar.MAX_VARCHAR_LENGTH) { + return getVarcharTypeInfo(varcharType.getBoundedLength()); + } 
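+            // A bounded VARCHAR longer than HiveVarchar.MAX_VARCHAR_LENGTH has no Hive counterpart, so it is rejected.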
+ throw new PrestoException(NOT_SUPPORTED, format("Unsupported Hive type: %s. Supported VARCHAR types: VARCHAR(<=%d), VARCHAR.", type, HiveVarchar.MAX_VARCHAR_LENGTH)); + } + if (type instanceof CharType) { + CharType charType = (CharType) type; + int charLength = charType.getLength(); + if (charLength <= HiveChar.MAX_CHAR_LENGTH) { + return getCharTypeInfo(charLength); + } + throw new PrestoException(NOT_SUPPORTED, format("Unsupported Hive type: %s. Supported CHAR types: CHAR(<=%d).", + type, HiveChar.MAX_CHAR_LENGTH)); + } + if (VARBINARY.equals(type)) { + return HIVE_BINARY.getTypeInfo(); + } + if (DATE.equals(type)) { + return HIVE_DATE.getTypeInfo(); + } + if (TIMESTAMP.equals(type)) { + return HIVE_TIMESTAMP.getTypeInfo(); + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + return new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale()); + } + if (isArrayType(type)) { + TypeInfo elementType = translate(type.getTypeParameters().get(0)); + return getListTypeInfo(elementType); + } + if (isMapType(type)) { + TypeInfo keyType = translate(type.getTypeParameters().get(0)); + TypeInfo valueType = translate(type.getTypeParameters().get(1)); + return getMapTypeInfo(keyType, valueType); + } + if (isRowType(type)) { + ImmutableList.Builder fieldNames = ImmutableList.builder(); + for (TypeSignatureParameter parameter : type.getTypeSignature().getParameters()) { + if (!parameter.isNamedTypeSignature()) { + throw new IllegalArgumentException(format("Expected all parameters to be named type, but got %s", parameter)); + } + NamedTypeSignature namedTypeSignature = parameter.getNamedTypeSignature(); + if (!namedTypeSignature.getName().isPresent()) { + throw new PrestoException(NOT_SUPPORTED, format("Anonymous row type is not supported in Hive. Please give each field a name: %s", type)); + } + fieldNames.add(namedTypeSignature.getName().get()); + } + return getStructTypeInfo( + fieldNames.build(), + type.getTypeParameters().stream() + .map(this::translate) + .collect(toList())); + } + throw new PrestoException(NOT_SUPPORTED, format("Unsupported Hive type: %s", type)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUpdateTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUpdateTableHandle.java new file mode 100644 index 00000000..c7805163 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUpdateTableHandle.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.spi.connector.ConnectorUpdateTableHandle; + +import java.util.List; +import java.util.Optional; + +public class HiveUpdateTableHandle + extends HiveWritableTableHandle + implements ConnectorUpdateTableHandle +{ + @JsonCreator + public HiveUpdateTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("inputColumns") List inputColumns, + @JsonProperty("pageSinkMetadata") HivePageSinkMetadata pageSinkMetadata, + @JsonProperty("locationHandle") LocationHandle locationHandle, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("tableStorageFormat") HiveStorageFormat tableStorageFormat, + @JsonProperty("partitionStorageFormat") HiveStorageFormat partitionStorageFormat) + { + super( + schemaName, + tableName, + inputColumns, + pageSinkMetadata, + locationHandle, + bucketProperty, + tableStorageFormat, + partitionStorageFormat, + false); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUtil.java new file mode 100644 index 00000000..744b8432 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveUtil.java @@ -0,0 +1,1215 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.base.Predicate; +import com.google.common.base.Splitter; +import com.google.common.base.VerifyException; +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.ImmutableList; +import io.airlift.compress.lzo.LzoCodec; +import io.airlift.compress.lzo.LzopCodec; +import io.airlift.json.JsonCodec; +import io.airlift.json.JsonCodecFactory; +import io.airlift.json.ObjectMapperProvider; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.airlift.slice.SliceUtf8; +import io.airlift.slice.Slices; +import io.prestosql.hadoop.TextLineLengthLimitExceededException; +import io.prestosql.plugin.hive.avro.PrestoAvroSerDe; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.util.ConfigurationUtils; +import io.prestosql.plugin.hive.util.FooterAwareRecordReader; +import io.prestosql.plugin.hive.util.HudiRealtimeSplitConverter; +import io.prestosql.plugin.hive.util.MergingPageIterator; +import io.prestosql.spi.ErrorCodeSupplier; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorViewDefinition; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.dynamicfilter.BloomFilterDynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.type.AbstractVariableWidthType; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Decimals; +import io.prestosql.spi.type.StandardTypes; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.JavaUtils; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.IOConstants; +import org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.util.ReflectionUtils; +import 
org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.DateTimeFormatterBuilder; +import org.joda.time.format.DateTimeParser; +import org.joda.time.format.DateTimePrinter; +import org.joda.time.format.ISODateTimeFormat; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.Arrays; +import java.util.Base64; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import static com.google.common.base.MoreObjects.firstNonNull; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.Iterables.filter; +import static com.google.common.collect.Lists.newArrayList; +import static com.google.common.collect.Lists.transform; +import static io.prestosql.plugin.hive.HiveBucketing.bucketedOnTimestamp; +import static io.prestosql.plugin.hive.HiveBucketing.getHiveBucketHandle; +import static io.prestosql.plugin.hive.HiveColumnHandle.bucketColumnHandle; +import static io.prestosql.plugin.hive.util.CustomSplitConversionUtils.recreateSplitWithCustomInfo; +import static io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.Chars.isCharType; +import static io.prestosql.spi.type.Chars.trimTrailingSpaces; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DecimalType.createDecimalType; +import static io.prestosql.spi.type.Decimals.isLongDecimal; +import static io.prestosql.spi.type.Decimals.isShortDecimal; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static java.lang.Byte.parseByte; +import static java.lang.Double.parseDouble; +import static java.lang.Float.floatToRawIntBits; +import static java.lang.Float.parseFloat; +import static java.lang.Integer.parseInt; +import static java.lang.Long.parseLong; +import static java.lang.Short.parseShort; +import static java.lang.String.format; +import static java.math.BigDecimal.ROUND_UNNECESSARY; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.common.FileUtils.unescapePathName; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; 
+import static org.apache.hadoop.hive.serde.serdeConstants.DECIMAL_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_ALL_COLUMNS; +import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR; +import static org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +public final class HiveUtil +{ + public static final Logger log = Logger.get(HiveUtil.class); + + public static final String PRESTO_VIEW_FLAG = "presto_view"; + + private static final String VIEW_PREFIX = "/* Presto View: "; + private static final String VIEW_SUFFIX = " */"; + private static final JsonCodec VIEW_CODEC = + new JsonCodecFactory(new ObjectMapperProvider()).jsonCodec(ConnectorViewDefinition.class); + + private static final DateTimeFormatter HIVE_DATE_PARSER = ISODateTimeFormat.date().withZoneUTC(); + private static final DateTimeFormatter HIVE_TIMESTAMP_PARSER; + private static final Field COMPRESSION_CODECS_FIELD; + + private static final Pattern SUPPORTED_DECIMAL_TYPE = Pattern.compile(DECIMAL_TYPE_NAME + "\\((\\d+),(\\d+)\\)"); + private static final int DECIMAL_PRECISION_GROUP = 1; + private static final int DECIMAL_SCALE_GROUP = 2; + + private static final String BIG_DECIMAL_POSTFIX = "BD"; + + private static final Splitter COLUMN_NAMES_SPLITTER = Splitter.on(',').trimResults().omitEmptyStrings(); + + private static final String DEFAULT_PARTITION_VALUE = "\\N"; + + private static final Iterable BUCKET_PATTERNS = ImmutableList.of( + // Hive naming pattern per `org.apache.hadoop.hive.ql.exec.Utilities#getBucketIdFromFile()` + Pattern.compile("(0\\d+)_\\d+.*"), + // Hive ACID + Pattern.compile("bucket_(\\d+)"), + // legacy Hetu naming pattern (current version matches Hive) + Pattern.compile("\\d{8}_\\d{6}_\\d{5}_[a-z0-9]{5}_bucket-(\\d+)(?:[-_.].*)?")); + + static { + DateTimeParser[] timestampWithoutTimeZoneParser = { + DateTimeFormat.forPattern("yyyy-M-d").getParser(), + DateTimeFormat.forPattern("yyyy-M-d H:m").getParser(), + DateTimeFormat.forPattern("yyyy-M-d H:m:s").getParser(), + DateTimeFormat.forPattern("yyyy-M-d H:m:s.SSS").getParser(), + DateTimeFormat.forPattern("yyyy-M-d H:m:s.SSSSSSS").getParser(), + DateTimeFormat.forPattern("yyyy-M-d H:m:s.SSSSSSSSS").getParser(), + }; + DateTimePrinter timestampWithoutTimeZonePrinter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSSSSSSSS").getPrinter(); + HIVE_TIMESTAMP_PARSER = new DateTimeFormatterBuilder().append(timestampWithoutTimeZonePrinter, timestampWithoutTimeZoneParser).toFormatter().withZoneUTC(); + + try { + COMPRESSION_CODECS_FIELD = TextInputFormat.class.getDeclaredField("compressionCodecs"); + COMPRESSION_CODECS_FIELD.setAccessible(true); + } + catch (ReflectiveOperationException e) { + throw new AssertionError(e); + } + } + + private HiveUtil() + { + } + + public static RecordReader createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List columns, Map customSplitInfo) + { + // determine which hive columns we will read + List readColumns = ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == HiveColumnHandle.ColumnType.REGULAR)); + List readHiveColumnIndexes = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex)); + + // Tell hive the columns we would like to read, this lets hive optimize reading 
column oriented files + setReadColumns(configuration, readHiveColumnIndexes); + + // Only propagate serialization schema configs by default + Predicate schemaFilter = schemaProperty -> schemaProperty.startsWith("serialization."); + + JobConf jobConf = ConfigurationUtils.toJobConf(configuration); + InputFormat inputFormat = getInputFormat(configuration, schema, true, jobConf); + FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null); + + if (!customSplitInfo.isEmpty() && isHudiRealtimeSplit(customSplitInfo)) { + fileSplit = recreateSplitWithCustomInfo(fileSplit, customSplitInfo); + + // Add additional column information for record reader + List readHiveColumnNames = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getName)); + jobConf.set(READ_COLUMN_NAMES_CONF_STR, Joiner.on(',').join(readHiveColumnNames)); + + // Remove filter when using customSplitInfo as the record reader requires complete schema configs + schemaFilter = schemaProperty -> true; + } + + schema.stringPropertyNames().stream() + .filter(schemaFilter) + .forEach(name -> jobConf.set(name, schema.getProperty(name))); + + // add Airlift LZO and LZOP to head of codecs list so as to not override existing entries + List codecs = newArrayList(Splitter.on(",").trimResults().omitEmptyStrings().split(jobConf.get("io.compression.codecs", ""))); + if (!codecs.contains(LzoCodec.class.getName())) { + codecs.add(0, LzoCodec.class.getName()); + } + if (!codecs.contains(LzopCodec.class.getName())) { + codecs.add(0, LzopCodec.class.getName()); + } + jobConf.set("io.compression.codecs", codecs.stream().collect(joining(","))); + + try { + RecordReader recordReader = (RecordReader) inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL); + + int headerCount = getHeaderCount(schema); + if (headerCount > 0) { + Utilities.skipHeader(recordReader, headerCount, recordReader.createKey(), recordReader.createValue()); + } + + int footerCount = getFooterCount(schema); + if (footerCount > 0) { + recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf); + } + + return recordReader; + } + catch (IOException e) { + if (e instanceof TextLineLengthLimitExceededException) { + throw new PrestoException(HiveErrorCode.HIVE_BAD_DATA, "Line too long in text file: " + path, e); + } + + throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", + path, + start, + length, + getInputFormatName(schema), + firstNonNull(e.getMessage(), e.getClass().getName())), + e); + } + } + + private static boolean isHudiRealtimeSplit(Map customSplitInfo) + { + String customSplitClass = customSplitInfo.get(HudiRealtimeSplitConverter.CUSTOM_SPLIT_CLASS_KEY); + return HoodieRealtimeFileSplit.class.getName().equals(customSplitClass); + } + + public static void setReadColumns(Configuration configuration, List readHiveColumnIndexes) + { + configuration.set(READ_COLUMN_IDS_CONF_STR, Joiner.on(',').join(readHiveColumnIndexes)); + configuration.setBoolean(READ_ALL_COLUMNS, false); + } + + public static Optional getCompressionCodec(TextInputFormat inputFormat, Path file) + { + CompressionCodecFactory compressionCodecFactory; + + try { + compressionCodecFactory = (CompressionCodecFactory) COMPRESSION_CODECS_FIELD.get(inputFormat); + } + catch (IllegalAccessException e) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, "Failed to find compressionCodec for inputFormat: " + inputFormat.getClass().getName(), e); + } + + if (compressionCodecFactory == null) 
{ + return Optional.empty(); + } + + return Optional.ofNullable(compressionCodecFactory.getCodec(file)); + } + + static InputFormat getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget, JobConf jobConf) + { + String inputFormatName = getInputFormatName(schema); + try { + Class> inputFormatClass = getInputFormatClass(jobConf, inputFormatName); + if (symlinkTarget && (inputFormatClass == SymlinkTextInputFormat.class)) { + // symlink targets are always TextInputFormat + inputFormatClass = TextInputFormat.class; + } + + return ReflectionUtils.newInstance(inputFormatClass, jobConf); + } + catch (ClassNotFoundException | RuntimeException e) { + throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, "Unable to create input format " + inputFormatName, e); + } + } + + public static boolean shouldUseRecordReaderFromInputFormat(Configuration configuration, Properties schema) + { + JobConf jobConf = ConfigurationUtils.toJobConf(configuration); + InputFormat inputFormat = HiveUtil.getInputFormat(configuration, schema, false, jobConf); + return Arrays.stream(inputFormat.getClass().getAnnotations()) + .map(Annotation::annotationType) + .map(Class::getSimpleName) + .anyMatch(name -> name.equals("UseRecordReaderFromInputFormat")); + } + + @SuppressWarnings({"unchecked", "RedundantCast"}) + private static Class> getInputFormatClass(JobConf conf, String inputFormatName) + throws ClassNotFoundException + { + // CDH uses different names for Parquet + if ("parquet.hive.DeprecatedParquetInputFormat".equals(inputFormatName) || + "parquet.hive.MapredParquetInputFormat".equals(inputFormatName)) { + return MapredParquetInputFormat.class; + } + + Class clazz = conf.getClassByName(inputFormatName); + return (Class>) clazz.asSubclass(InputFormat.class); + } + + static String getInputFormatName(Properties schema) + { + String name = schema.getProperty(FILE_INPUT_FORMAT); + checkCondition(name != null, HiveErrorCode.HIVE_INVALID_METADATA, "Table or partition is missing Hive input format property: %s", FILE_INPUT_FORMAT); + return name; + } + + public static long parseHiveDate(String value) + { + long millis = HIVE_DATE_PARSER.parseMillis(value); + return TimeUnit.MILLISECONDS.toDays(millis); + } + + public static long parseHiveTimestamp(String value) + { + return HIVE_TIMESTAMP_PARSER.parseMillis(value); + } + + public static boolean isSplittable(InputFormat inputFormat, FileSystem fileSystem, Path path) + { + // ORC uses a custom InputFormat but is always splittable + if (inputFormat.getClass().getSimpleName().equals("OrcInputFormat")) { + return true; + } + + // use reflection to get isSplittable method on FileInputFormat + Method method = null; + for (Class clazz = inputFormat.getClass(); clazz != null; clazz = clazz.getSuperclass()) { + try { + method = clazz.getDeclaredMethod("isSplitable", FileSystem.class, Path.class); + break; + } + catch (NoSuchMethodException ignored) { + } + } + + if (method == null) { + return false; + } + try { + method.setAccessible(true); + return (boolean) method.invoke(inputFormat, fileSystem, path); + } + catch (InvocationTargetException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + public static StructObjectInspector getTableObjectInspector(Deserializer deserializer) + { + try { + ObjectInspector inspector = deserializer.getObjectInspector(); + checkArgument(inspector.getCategory() == Category.STRUCT, "expected STRUCT: %s", inspector.getCategory()); + return (StructObjectInspector) inspector; + } + catch 
(SerDeException e) { + throw new RuntimeException(e); + } + } + + public static boolean isDeserializerClass(Properties schema, Class deserializerClass) + { + return getDeserializerClassName(schema).equals(deserializerClass.getName()); + } + + public static String getDeserializerClassName(Properties schema) + { + String name = schema.getProperty(SERIALIZATION_LIB); + checkCondition(name != null, HiveErrorCode.HIVE_INVALID_METADATA, "Table or partition is missing Hive deserializer property: %s", SERIALIZATION_LIB); + return name; + } + + public static Deserializer getDeserializer(Configuration configuration, Properties schema) + { + String name = getDeserializerClassName(schema); + + Deserializer deserializer = createDeserializer(getDeserializerClass(name)); + initializeDeserializer(configuration, deserializer, schema); + return deserializer; + } + + private static Class getDeserializerClass(String name) + { + // CDH uses different names for Parquet + if ("parquet.hive.serde.ParquetHiveSerDe".equals(name)) { + return ParquetHiveSerDe.class; + } + + if ("org.apache.hadoop.hive.serde2.avro.AvroSerDe".equals(name)) { + return PrestoAvroSerDe.class; + } + + try { + return Class.forName(name, true, JavaUtils.getClassLoader()).asSubclass(Deserializer.class); + } + catch (ClassNotFoundException e) { + throw new PrestoException(HiveErrorCode.HIVE_SERDE_NOT_FOUND, "deserializer does not exist: " + name); + } + catch (ClassCastException e) { + throw new RuntimeException("invalid deserializer class: " + name); + } + } + + private static Deserializer createDeserializer(Class clazz) + { + try { + return clazz.getConstructor().newInstance(); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException("error creating deserializer: " + clazz.getName(), e); + } + } + + private static void initializeDeserializer(Configuration configuration, Deserializer deserializer, Properties schema) + { + try { + configuration = ConfigurationUtils.copy(configuration); // Some SerDes (e.g. 
Avro) modify passed configuration + deserializer.initialize(configuration, schema); + validate(deserializer); + } + catch (SerDeException | RuntimeException e) { + throw new RuntimeException("error initializing deserializer: " + deserializer.getClass().getName(), e); + } + } + + private static void validate(Deserializer deserializer) + { + if (deserializer instanceof AbstractSerDe && !((AbstractSerDe) deserializer).getConfigurationErrors().isEmpty()) { + throw new RuntimeException("There are configuration errors: " + ((AbstractSerDe) deserializer).getConfigurationErrors()); + } + } + + public static boolean isHiveNull(byte[] bytes) + { + return bytes.length == 2 && bytes[0] == '\\' && bytes[1] == 'N'; + } + + public static void verifyPartitionTypeSupported(String partitionName, Type type) + { + if (!isValidPartitionType(type)) { + throw new PrestoException(NOT_SUPPORTED, format("Unsupported type [%s] for partition: %s", type, partitionName)); + } + } + + private static boolean isValidPartitionType(Type type) + { + return type instanceof DecimalType || + BOOLEAN.equals(type) || + TINYINT.equals(type) || + SMALLINT.equals(type) || + INTEGER.equals(type) || + BIGINT.equals(type) || + REAL.equals(type) || + DOUBLE.equals(type) || + DATE.equals(type) || + TIMESTAMP.equals(type) || + isVarcharType(type) || + isCharType(type); + } + + public static NullableValue parsePartitionValue(String partitionName, String value, Type type) + { + verifyPartitionTypeSupported(partitionName, type); + + boolean isNull = HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION.equals(value); + + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + if (isNull) { + return NullableValue.asNull(decimalType); + } + if (decimalType.isShort()) { + if (value.isEmpty()) { + return NullableValue.of(decimalType, 0L); + } + return NullableValue.of(decimalType, shortDecimalPartitionKey(value, decimalType, partitionName)); + } + else { + if (value.isEmpty()) { + return NullableValue.of(decimalType, Decimals.encodeUnscaledValue(BigInteger.ZERO)); + } + return NullableValue.of(decimalType, longDecimalPartitionKey(value, decimalType, partitionName)); + } + } + + if (BOOLEAN.equals(type)) { + if (isNull) { + return NullableValue.asNull(BOOLEAN); + } + if (value.isEmpty()) { + return NullableValue.of(BOOLEAN, false); + } + return NullableValue.of(BOOLEAN, booleanPartitionKey(value, partitionName)); + } + + if (TINYINT.equals(type)) { + if (isNull) { + return NullableValue.asNull(TINYINT); + } + if (value.isEmpty()) { + return NullableValue.of(TINYINT, 0L); + } + return NullableValue.of(TINYINT, tinyintPartitionKey(value, partitionName)); + } + + if (SMALLINT.equals(type)) { + if (isNull) { + return NullableValue.asNull(SMALLINT); + } + if (value.isEmpty()) { + return NullableValue.of(SMALLINT, 0L); + } + return NullableValue.of(SMALLINT, smallintPartitionKey(value, partitionName)); + } + + if (INTEGER.equals(type)) { + if (isNull) { + return NullableValue.asNull(INTEGER); + } + if (value.isEmpty()) { + return NullableValue.of(INTEGER, 0L); + } + return NullableValue.of(INTEGER, integerPartitionKey(value, partitionName)); + } + + if (BIGINT.equals(type)) { + if (isNull) { + return NullableValue.asNull(BIGINT); + } + if (value.isEmpty()) { + return NullableValue.of(BIGINT, 0L); + } + return NullableValue.of(BIGINT, bigintPartitionKey(value, partitionName)); + } + + if (DATE.equals(type)) { + if (isNull) { + return NullableValue.asNull(DATE); + } + return NullableValue.of(DATE, datePartitionKey(value, 
partitionName)); + } + + if (TIMESTAMP.equals(type)) { + if (isNull) { + return NullableValue.asNull(TIMESTAMP); + } + return NullableValue.of(TIMESTAMP, timestampPartitionKey(value, partitionName)); + } + + if (REAL.equals(type)) { + if (isNull) { + return NullableValue.asNull(REAL); + } + if (value.isEmpty()) { + return NullableValue.of(REAL, (long) floatToRawIntBits(0.0f)); + } + return NullableValue.of(REAL, floatPartitionKey(value, partitionName)); + } + + if (DOUBLE.equals(type)) { + if (isNull) { + return NullableValue.asNull(DOUBLE); + } + if (value.isEmpty()) { + return NullableValue.of(DOUBLE, 0.0); + } + return NullableValue.of(DOUBLE, doublePartitionKey(value, partitionName)); + } + + if (isVarcharType(type)) { + if (isNull) { + return NullableValue.asNull(type); + } + return NullableValue.of(type, varcharPartitionKey(value, partitionName, type)); + } + + if (isCharType(type)) { + if (isNull) { + return NullableValue.asNull(type); + } + return NullableValue.of(type, charPartitionKey(value, partitionName, type)); + } + + throw new VerifyException(format("Unhandled type [%s] for partition: %s", type, partitionName)); + } + + public static boolean isPrestoView(Table table) + { + return "true".equals(table.getParameters().get(PRESTO_VIEW_FLAG)); + } + + public static boolean isHiveView(Table table) + { + return table.getTableType().equals(TableType.VIRTUAL_VIEW.name()); + } + + public static boolean isView(Table table) + { + return isPrestoView(table) || isHiveView(table); + } + + public static String encodeViewData(ConnectorViewDefinition definition) + { + byte[] bytes = VIEW_CODEC.toJsonBytes(definition); + String data = Base64.getEncoder().encodeToString(bytes); + return VIEW_PREFIX + data + VIEW_SUFFIX; + } + + public static ConnectorViewDefinition decodeViewData(String data) + { + checkCondition(data.startsWith(VIEW_PREFIX), HiveErrorCode.HIVE_INVALID_VIEW_DATA, "View data missing prefix: %s", data); + checkCondition(data.endsWith(VIEW_SUFFIX), HiveErrorCode.HIVE_INVALID_VIEW_DATA, "View data missing suffix: %s", data); + data = data.substring(VIEW_PREFIX.length()); + data = data.substring(0, data.length() - VIEW_SUFFIX.length()); + byte[] bytes = Base64.getDecoder().decode(data); + return VIEW_CODEC.fromJson(bytes); + } + + public static Optional getDecimalType(HiveType hiveType) + { + return getDecimalType(hiveType.getHiveTypeName().toString()); + } + + public static Optional getDecimalType(String hiveTypeName) + { + Matcher matcher = SUPPORTED_DECIMAL_TYPE.matcher(hiveTypeName); + if (matcher.matches()) { + int precision = parseInt(matcher.group(DECIMAL_PRECISION_GROUP)); + int scale = parseInt(matcher.group(DECIMAL_SCALE_GROUP)); + return Optional.of(createDecimalType(precision, scale)); + } + else { + return Optional.empty(); + } + } + + public static boolean isArrayType(Type type) + { + return type.getTypeSignature().getBase().equals(StandardTypes.ARRAY); + } + + public static boolean isMapType(Type type) + { + return type.getTypeSignature().getBase().equals(StandardTypes.MAP); + } + + public static boolean isRowType(Type type) + { + return type.getTypeSignature().getBase().equals(StandardTypes.ROW); + } + + public static boolean isStructuralType(Type type) + { + String baseName = type.getTypeSignature().getBase(); + return baseName.equals(StandardTypes.MAP) || baseName.equals(StandardTypes.ARRAY) || baseName.equals(StandardTypes.ROW); + } + + public static boolean isStructuralType(HiveType hiveType) + { + return hiveType.getCategory() == Category.LIST || 
hiveType.getCategory() == Category.MAP || hiveType.getCategory() == Category.STRUCT; + } + + public static boolean booleanPartitionKey(String value, String name) + { + if (value.equalsIgnoreCase("true")) { + return true; + } + if (value.equalsIgnoreCase("false")) { + return false; + } + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for BOOLEAN partition key: %s", value, name)); + } + + public static long bigintPartitionKey(String value, String name) + { + try { + return parseLong(value); + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for BIGINT partition key: %s", value, name)); + } + } + + public static long integerPartitionKey(String value, String name) + { + try { + return parseInt(value); + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for INTEGER partition key: %s", value, name)); + } + } + + public static long smallintPartitionKey(String value, String name) + { + try { + return parseShort(value); + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for SMALLINT partition key: %s", value, name)); + } + } + + public static long tinyintPartitionKey(String value, String name) + { + try { + return parseByte(value); + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for TINYINT partition key: %s", value, name)); + } + } + + public static long floatPartitionKey(String value, String name) + { + try { + return floatToRawIntBits(parseFloat(value)); + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for FLOAT partition key: %s", value, name)); + } + } + + public static double doublePartitionKey(String value, String name) + { + try { + return parseDouble(value); + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for DOUBLE partition key: %s", value, name)); + } + } + + public static long datePartitionKey(String value, String name) + { + try { + return parseHiveDate(value); + } + catch (IllegalArgumentException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for DATE partition key: %s", value, name)); + } + } + + public static long timestampPartitionKey(String value, String name) + { + try { + return parseHiveTimestamp(value); + } + catch (IllegalArgumentException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for TIMESTAMP partition key: %s", value, name)); + } + } + + public static long shortDecimalPartitionKey(String value, DecimalType type, String name) + { + return decimalPartitionKey(value, type, name).unscaledValue().longValue(); + } + + public static Slice longDecimalPartitionKey(String value, DecimalType type, String name) + { + return Decimals.encodeUnscaledValue(decimalPartitionKey(value, type, name).unscaledValue()); + } + + private static BigDecimal decimalPartitionKey(String value, DecimalType type, String name) + { + try { + if (value.endsWith(BIG_DECIMAL_POSTFIX)) { + value = value.substring(0, value.length() 
- BIG_DECIMAL_POSTFIX.length()); + } + + BigDecimal decimal = new BigDecimal(value); + decimal = decimal.setScale(type.getScale(), ROUND_UNNECESSARY); + if (decimal.precision() > type.getPrecision()) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for %s partition key: %s", value, type.toString(), name)); + } + return decimal; + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for %s partition key: %s", value, type.toString(), name)); + } + } + + public static Slice varcharPartitionKey(String value, String name, Type columnType) + { + Slice partitionKey = Slices.utf8Slice(value); + VarcharType varcharType = (VarcharType) columnType; + if (!varcharType.isUnbounded() && SliceUtf8.countCodePoints(partitionKey) > varcharType.getBoundedLength()) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for %s partition key: %s", value, columnType.toString(), name)); + } + return partitionKey; + } + + public static Slice charPartitionKey(String value, String name, Type columnType) + { + Slice partitionKey = trimTrailingSpaces(Slices.utf8Slice(value)); + CharType charType = (CharType) columnType; + if (SliceUtf8.countCodePoints(partitionKey) > charType.getLength()) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, format("Invalid partition value '%s' for %s partition key: %s", value, columnType.toString(), name)); + } + return partitionKey; + } + + public static List hiveColumnHandles(Table table) + { + ImmutableList.Builder columns = ImmutableList.builder(); + + // add the data fields first + columns.addAll(getRegularColumnHandles(table)); + + // add the partition keys last (like Hive does) + columns.addAll(getPartitionKeyColumnHandles(table)); + + // add hidden columns + columns.add(HiveColumnHandle.pathColumnHandle()); + if (getHiveBucketHandle(table).isPresent()) { + if (!bucketedOnTimestamp(table.getStorage().getBucketProperty().get(), table)) { + columns.add(bucketColumnHandle()); + } + } + + return columns.build(); + } + + public static List getRegularColumnHandles(Table table) + { + ImmutableList.Builder columns = ImmutableList.builder(); + + Optional bucketProperty = table.getStorage().getBucketProperty(); + Set bucketSortColumns = new HashSet<>(); + if (bucketProperty.isPresent()) { + bucketSortColumns.addAll(bucketProperty.get().getBucketedBy()); + bucketProperty.get().getSortedBy().stream().map(SortingColumn::getColumnName).forEach(bucketSortColumns::add); + } + int hiveColumnIndex = 0; + for (Column field : table.getDataColumns()) { + // ignore unsupported types rather than failing + HiveType hiveType = field.getType(); + if (hiveType.isSupportedType()) { + columns.add(new HiveColumnHandle(field.getName(), hiveType, hiveType.getTypeSignature(), hiveColumnIndex, HiveColumnHandle.ColumnType.REGULAR, field.getComment(), + bucketSortColumns.contains(field.getName()))); + } + hiveColumnIndex++; + } + + return columns.build(); + } + + public static List getPartitionKeyColumnHandles(Table table) + { + ImmutableList.Builder columns = ImmutableList.builder(); + + List partitionKeys = table.getPartitionColumns(); + for (Column field : partitionKeys) { + HiveType hiveType = field.getType(); + if (!hiveType.isSupportedType()) { + throw new PrestoException(NOT_SUPPORTED, format("Unsupported Hive type %s found in partition keys of table %s.%s", hiveType, 
table.getDatabaseName(), table.getTableName())); + } + columns.add(new HiveColumnHandle(field.getName(), hiveType, hiveType.getTypeSignature(), -1, HiveColumnHandle.ColumnType.PARTITION_KEY, field.getComment(), true)); + } + + return columns.build(); + } + + public static void checkCondition(boolean condition, ErrorCodeSupplier errorCode, String formatString, Object... args) + { + if (!condition) { + throw new PrestoException(errorCode, format(formatString, args)); + } + } + + @Nullable + public static String columnExtraInfo(boolean partitionKey) + { + return partitionKey ? "partition key" : null; + } + + public static List toPartitionValues(String partitionName) + { + // mimics Warehouse.makeValsFromName + ImmutableList.Builder resultBuilder = ImmutableList.builder(); + int start = 0; + while (true) { + while (start < partitionName.length() && partitionName.charAt(start) != '=') { + start++; + } + start++; + int end = start; + while (end < partitionName.length() && partitionName.charAt(end) != '/') { + end++; + } + if (start > partitionName.length()) { + break; + } + resultBuilder.add(unescapePathName(partitionName.substring(start, end))); + start = end + 1; + } + return resultBuilder.build(); + } + + public static String getPrefilledColumnValue(HiveColumnHandle columnHandle, HivePartitionKey partitionKey, Path path, OptionalInt bucketNumber) + { + if (partitionKey != null) { + return partitionKey.getValue(); + } + if (HiveColumnHandle.isPathColumnHandle(columnHandle)) { + return path.toString(); + } + if (HiveColumnHandle.isBucketColumnHandle(columnHandle)) { + return String.valueOf(bucketNumber.getAsInt()); + } + throw new PrestoException(NOT_SUPPORTED, "unsupported hidden column: " + columnHandle); + } + + public static void closeWithSuppression(RecordCursor recordCursor, Throwable throwable) + { + requireNonNull(recordCursor, "recordCursor is null"); + requireNonNull(throwable, "throwable is null"); + try { + recordCursor.close(); + } + catch (RuntimeException e) { + // Self-suppression not permitted + if (throwable != e) { + throwable.addSuppressed(e); + } + } + } + + public static List extractStructFieldTypes(HiveType hiveType) + { + return ((StructTypeInfo) hiveType.getTypeInfo()).getAllStructFieldTypeInfos().stream() + .map(typeInfo -> HiveType.valueOf(typeInfo.getTypeName())) + .collect(toImmutableList()); + } + + public static int getHeaderCount(Properties schema) + { + return getPositiveIntegerValue(schema, "skip.header.line.count", "0"); + } + + public static int getFooterCount(Properties schema) + { + return getPositiveIntegerValue(schema, "skip.footer.line.count", "0"); + } + + private static int getPositiveIntegerValue(Properties schema, String key, String defaultValue) + { + String value = schema.getProperty(key, defaultValue); + try { + int intValue = Integer.parseInt(value); + if (intValue < 0) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, format("Invalid value for %s property: %s", key, value)); + } + return intValue; + } + catch (NumberFormatException e) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, format("Invalid value for %s property: %s", key, value)); + } + } + + public static Object typedPartitionKey(String value, Type type, String name) + { + byte[] bytes = value.getBytes(UTF_8); + + if (isHiveNull(bytes)) { + return null; + } + else if (type.equals(BOOLEAN)) { + return booleanPartitionKey(value, name); + } + else if (type.equals(BIGINT)) { + return bigintPartitionKey(value, name); + } + else if (type.equals(INTEGER)) { 
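+            // integerPartitionKey parses with Integer.parseInt and widens the result to long,
+            // which is how Presto represents INTEGER values internally.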
+ return integerPartitionKey(value, name); + } + else if (type.equals(SMALLINT)) { + return smallintPartitionKey(value, name); + } + else if (type.equals(TINYINT)) { + return tinyintPartitionKey(value, name); + } + else if (type.equals(REAL)) { + return floatPartitionKey(value, name); + } + else if (type.equals(DOUBLE)) { + return doublePartitionKey(value, name); + } + else if (isVarcharType(type)) { + return varcharPartitionKey(value, name, type); + } + else if (isCharType(type)) { + return charPartitionKey(value, name, type); + } + else if (type.equals(DATE)) { + return datePartitionKey(value, name); + } + else if (type.equals(TIMESTAMP)) { + return timestampPartitionKey(value, name); + } + else if (isShortDecimal(type)) { + return shortDecimalPartitionKey(value, (DecimalType) type, name); + } + else if (isLongDecimal(type)) { + return longDecimalPartitionKey(value, (DecimalType) type, name); + } + else { + throw new PrestoException(NOT_SUPPORTED, format("Unsupported column type %s for partition column: %s", type.getDisplayName(), name)); + } + } + + public static List getColumnNames(Properties schema) + { + return COLUMN_NAMES_SPLITTER.splitToList(schema.getProperty(IOConstants.COLUMNS, "")); + } + + public static List getColumnTypes(Properties schema) + { + return HiveType.toHiveTypes(schema.getProperty(IOConstants.COLUMNS_TYPES, "")); + } + + public static boolean isPartitionFiltered(List partitionKeys, List> dynamicFilterList, TypeManager typeManager) + { + if (partitionKeys == null || dynamicFilterList == null || dynamicFilterList.isEmpty()) { + return false; + } + + Map partitions = partitionKeys.stream() + .collect(Collectors.toMap(HivePartitionKey::getName, HivePartitionKey::getValue)); + + boolean result = false; + for (int i = 0; i < dynamicFilterList.size(); i++) { + for (DynamicFilter dynamicFilter : dynamicFilterList.get(i)) { + final ColumnHandle columnHandle = dynamicFilter.getColumnHandle(); + + // If the dynamic filter contains no data there can't be any match + if (dynamicFilter.isEmpty()) { + result = true; + } + + // No need to check non-partition columns + if (!((HiveColumnHandle) columnHandle).isPartitionKey()) { + continue; + } + + String partitionValue = partitions.get(columnHandle.getColumnName()); + if (partitionValue == null) { + continue; + } + + // Skip partitions with null value + if (DEFAULT_PARTITION_VALUE.equals(partitionValue)) { + continue; + } + + try { + Object realObjectValue = getValueAsType(((HiveColumnHandle) columnHandle) + .getColumnMetadata(typeManager).getType(), partitionValue); + // FIXME: Remove this check once BloomFilter type conversion is removed + if (dynamicFilter instanceof BloomFilterDynamicFilter && !(realObjectValue instanceof Long)) { + realObjectValue = partitionValue; + } + if (!dynamicFilter.contains(realObjectValue)) { + result = true; + } + } + catch (PrestoException | ClassCastException e) { + log.error("cannot cast class" + e.getMessage()); + return false; + } + //return if this dynamic filter is not filtering + if (!result) { + return false; + } + } + } + return result; + } + + public static Iterator getMergeSortedPages(List pageSources, + List columnTypes, List sortFields, + List sortOrders) + { + List> sourceIterators = getPageSourceIterators(pageSources); + + return new MergingPageIterator(sourceIterators, columnTypes, sortFields, sortOrders); + } + + static List> getPageSourceIterators(List pageSources) + { + return pageSources.stream().map(source -> new AbstractIterator() + { + @Override + protected Page computeNext() 
+ { + Page nextPage; + do { + nextPage = source.getNextPage(); + if (nextPage == null) { + return endOfData(); + } + } + while (nextPage.getPositionCount() == 0); + + nextPage = nextPage.getLoadedPage(); + return nextPage; + } + }).collect(toList()); + } + + @VisibleForTesting + static OptionalInt getBucketNumber(String name) + { + for (Pattern pattern : BUCKET_PATTERNS) { + Matcher matcher = pattern.matcher(name); + if (matcher.matches()) { + return OptionalInt.of(parseInt(matcher.group(1))); + } + } + return OptionalInt.empty(); + } + + private static Object getValueAsType(Type type, String value) + throws ClassCastException, PrestoException + { + Class javaType = type.getJavaType(); + if (javaType == long.class) { + if (type.equals(BIGINT) || type.equals(INTEGER)) { + return Long.valueOf(value); + } + else { + throw new PrestoException(GENERIC_INTERNAL_ERROR, + "Unhandled type for " + javaType.getSimpleName() + ":" + type.getTypeSignature()); + } + } + else if (javaType == boolean.class) { + return Boolean.valueOf(value); + } + else if (javaType == double.class) { + return Double.valueOf(value); + } + else if (type instanceof AbstractVariableWidthType || javaType == Slice.class) { + return Slices.utf8Slice(value); + } + throw new PrestoException(GENERIC_INTERNAL_ERROR, + "Unhandled type for " + javaType.getSimpleName() + ":" + type.getTypeSignature()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumSplitSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumSplitSource.java new file mode 100644 index 00000000..ada34afb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumSplitSource.java @@ -0,0 +1,272 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
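The dynamic-filter partition pruning above converts the textual partition value into the engine's Java representation (via getValueAsType) before probing the filter. The following standalone sketch is illustrative only: it uses plain Java instead of the Presto SPI types, and the Set-based stand-in for a dynamic filter plus all names in it are assumptions, not the connector's actual API.

    import java.nio.charset.StandardCharsets;
    import java.util.Set;

    // Illustrative only: mirrors the getValueAsType() conversion rules with a
    // simple Set standing in for a dynamic filter.
    public class PartitionPruningSketch
    {
        static Object toEngineValue(String hiveTypeName, String partitionValue)
        {
            switch (hiveTypeName) {
                case "bigint":
                case "int":
                    return Long.valueOf(partitionValue);       // integral types are probed as Long
                case "boolean":
                    return Boolean.valueOf(partitionValue);
                case "double":
                    return Double.valueOf(partitionValue);
                default:
                    // varchar/char values are probed as UTF-8 bytes (a Slice in the real code)
                    return partitionValue.getBytes(StandardCharsets.UTF_8);
            }
        }

        public static void main(String[] args)
        {
            Set<Object> dynamicFilterValues = Set.of(20220101L, 20220102L);   // hypothetical filter content
            Object probe = toEngineValue("int", "20220103");
            boolean partitionFiltered = !dynamicFilterValues.contains(probe); // true -> the split can be skipped
            System.out.println("partition filtered out: " + partitionFiltered);
        }
    }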
+ */
+package io.prestosql.plugin.hive;
+
+import com.google.common.collect.ImmutableList;
+import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext;
+import io.prestosql.plugin.hive.HiveVacuumTableHandle.Range;
+import io.prestosql.spi.PrestoException;
+import io.prestosql.spi.connector.ConnectorPartitionHandle;
+import io.prestosql.spi.connector.ConnectorSession;
+import io.prestosql.spi.connector.ConnectorSplit;
+import io.prestosql.spi.connector.ConnectorSplitSource;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.OptionalInt;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+
+import static com.google.common.collect.Iterables.getOnlyElement;
+import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT;
+import static java.util.concurrent.CompletableFuture.completedFuture;
+
+/**
+ * This class is used for the vacuum operation, where all related (same bucket) splits should be scheduled
+ * to the same worker together to start the vacuum operation. The class therefore wraps all such splits
+ * together and schedules them as a single split.
+ */
+class HiveVacuumSplitSource
+        implements ConnectorSplitSource
+{
+    private HiveSplitSource splitSource;
+    /*
+     * Vacuum operations are parallelized to the maximum extent possible, as described below.
+     * Minor vacuums will also have delete_delta splits separately, which can be run in parallel.
+     * Each group of splits is scheduled separately and runs in parallel if enough workers are available.
+     * Splits are grouped as below to execute in parallel.
+ * partition->buckets->(type of delta)->List of HiveSplits + */ + private Map>>> splitsMap = new HashMap<>(); + private HiveVacuumTableHandle vacuumTableHandle; + private HdfsEnvironment hdfsEnvironment; + private HdfsContext hdfsContext; + + HiveVacuumSplitSource(HiveSplitSource splitSource, HiveVacuumTableHandle vacuumTableHandle, HdfsEnvironment hdfsEnvironment, HdfsContext hdfsContext, ConnectorSession session) + { + this.splitSource = splitSource; + this.vacuumTableHandle = vacuumTableHandle; + this.hdfsContext = hdfsContext; + this.hdfsEnvironment = hdfsEnvironment; + } + + private int getBucketNumber(HiveSplit hiveSplit) + { + if (hiveSplit.getBucketNumber().isPresent()) { + return hiveSplit.getBucketNumber().getAsInt(); + } + + Path bucketFile = new Path(hiveSplit.getFilePath()); + OptionalInt bucketNumber = HiveUtil.getBucketNumber(bucketFile.getName()); + return bucketNumber.orElse(0); + } + + private boolean isDeleteDelta(HiveSplit hiveSplit) + { + Path bucketFile = new Path(hiveSplit.getPath()); + return AcidUtils.isDeleteDelta(bucketFile.getParent()); + } + + private List getHiveSplitsFor(int bucketNumber, String partition, boolean isDeleteDelta) + { + if (partition == null) { + partition = "default"; + } + Map>> bucketToSplits = splitsMap.get(partition); + if (bucketToSplits == null) { + bucketToSplits = new HashMap<>(); + splitsMap.put(partition, bucketToSplits); + } + Map> partitionMap = getDeltaTypeToSplitsMap(bucketNumber, bucketToSplits); + return getSplitsFromPartition(isDeleteDelta, partitionMap); + } + + private Map> getDeltaTypeToSplitsMap(int bucketNumber, Map>> bucketsToSplits) + { + Map> deltaTypeToSplits = bucketsToSplits.get(bucketNumber); + if (deltaTypeToSplits == null) { + deltaTypeToSplits = new HashMap<>(); + bucketsToSplits.put(bucketNumber, deltaTypeToSplits); + } + return deltaTypeToSplits; + } + + private List getSplitsFromPartition(boolean isDeleteDelta, Map> partitionMap) + { + List hiveSplits = partitionMap.get(isDeleteDelta); + if (hiveSplits == null) { + hiveSplits = new ArrayList<>(); + partitionMap.put(isDeleteDelta, hiveSplits); + } + return hiveSplits; + } + + @Override + public CompletableFuture getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize) + { + do { + CompletableFuture nextBatch = splitSource.getNextBatch(partitionHandle, maxSize); + try { + ConnectorSplitBatch splitBatch = nextBatch.get(); + List splits = splitBatch.getSplits(); + for (ConnectorSplit split : splits) { + HiveSplit hiveSplit = ((HiveSplitWrapper) split).getSplits().get(0); + int bucketNumber = vacuumTableHandle.isUnifyVacuum() ? 0 : getBucketNumber(hiveSplit); //In case of unify there are no bucket numbers + boolean isDeleteDelta = isDeleteDelta(hiveSplit); + List hiveSplits = getHiveSplitsFor(bucketNumber, hiveSplit.getPartitionName(), isDeleteDelta); + hiveSplits.add(hiveSplit); + } + if (splitBatch.isNoMoreSplits()) { + break; + } + } + catch (InterruptedException e) { + HiveSplitSource.propagatePrestoException(e); + } + catch (ExecutionException e) { + HiveSplitSource.propagatePrestoException(e.getCause()); + } + } + while (true); + + ConnectorSplitBatch splitBatch = getCurrentBatch(partitionHandle); + return completedFuture(splitBatch); + } + + private ConnectorSplitBatch getCurrentBatch(ConnectorPartitionHandle partitionHandle) + { + /* + * All Splits are grouped based on partition->bucketNumber->(delta_type in case of Minor vacuum). 
+ */ + List bucketedSplits = null; + int bucketNumber = 0; + + /* + * If partition handle is passed splits will be chosen only for that bucket, else will be choosen from available buckets. + */ + int bucketToChoose = (partitionHandle instanceof HivePartitionHandle) ? ((HivePartitionHandle) partitionHandle).getBucket() : -1; + Iterator>>>> partitions = splitsMap.entrySet().iterator(); + while (partitions.hasNext()) { + Entry>>> currentPartitionEntry = partitions.next(); + String currentPartition = currentPartitionEntry.getKey(); + if (vacuumTableHandle.isUnifyVacuum() && currentPartition.contains(HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION)) { + //skip the dynamic partition for now. + partitions.remove(); + continue; + } + Map>> buckets = currentPartitionEntry.getValue(); + Map> deltaTypeToSplits = null; + if (bucketToChoose != -1) { + deltaTypeToSplits = buckets.get(bucketToChoose); + bucketNumber = bucketToChoose; + } + else { + Iterator>>> deltaTypeIterator = buckets.entrySet().iterator(); + if (deltaTypeIterator.hasNext()) { + Entry>> entry = deltaTypeIterator.next(); + deltaTypeToSplits = entry.getValue(); + bucketNumber = entry.getKey(); + } + } + if (deltaTypeToSplits == null) { + if (buckets.size() == 0) { + partitions.remove(); + } + continue; + } + Iterator>> splitsIterator = deltaTypeToSplits.entrySet().iterator(); + boolean type; + if (splitsIterator.hasNext()) { + //Choose the splits and remove the entry + Entry> entry = splitsIterator.next(); + bucketedSplits = entry.getValue(); + splitsIterator.remove(); + } + if (!splitsIterator.hasNext()) { + buckets.remove(bucketNumber); + if (buckets.size() == 0) { + partitions.remove(); + } + } + if (bucketedSplits == null || bucketedSplits.isEmpty()) { + continue; + } + + if (bucketedSplits != null) { + //check whether splits are selected for already compacted files. + if (bucketedSplits.size() == 1) { + HiveSplit split = bucketedSplits.get(0); + + Path bucketFile = new Path(split.getPath()); + Range range = getRange(bucketFile); + Range suitableRange = getOnlyElement(vacuumTableHandle.getSuitableRange(currentPartition, range)); + if (range.equals(suitableRange)) { + //Already compacted + // or only one participant, which makes no sense to compact + //Check for some more. + bucketedSplits = null; + continue; + } + } + break; + } + } + if (bucketedSplits != null && !bucketedSplits.isEmpty()) { + HiveSplitWrapper multiSplit = new HiveSplitWrapper(bucketedSplits, OptionalInt.of(bucketNumber)); + //All of the splits for this bucket inside one partition are grouped. + //There may be some more splits for same bucket in different partition. 
+ return new ConnectorSplitBatch(ImmutableList.of(multiSplit), false); + } + else { + return new ConnectorSplitBatch(ImmutableList.of(), true); + } + } + + private Range getRange(Path bucketFile) + { + Range range; + try { + Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, bucketFile); + Options options = hdfsEnvironment.doAs(hdfsContext.getIdentity().getUser(), () -> + AcidUtils.parseBaseOrDeltaBucketFilename(bucketFile, configuration)); + range = new Range(options.getMinimumWriteId(), options.getMaximumWriteId()); + } + catch (IOException e) { + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, "Error while parsing split info for vacuum", e); + } + return range; + } + + @Override + public void close() + { + splitSource.close(); + } + + @Override + public boolean isFinished() + { + return splitSource.isFinished(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumTableHandle.java new file mode 100644 index 00000000..363573e6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveVacuumTableHandle.java @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.spi.connector.ConnectorVacuumTableHandle; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class HiveVacuumTableHandle + extends HiveWritableTableHandle + implements ConnectorVacuumTableHandle +{ + private boolean full; + private boolean unify; + Map> ranges; + + @JsonCreator + public HiveVacuumTableHandle( + @JsonProperty("schemaName") String schemaName, + @JsonProperty("tableName") String tableName, + @JsonProperty("inputColumns") List inputColumns, + @JsonProperty("pageSinkMetadata") HivePageSinkMetadata pageSinkMetadata, + @JsonProperty("locationHandle") LocationHandle locationHandle, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("tableStorageFormat") HiveStorageFormat tableStorageFormat, + @JsonProperty("partitionStorageFormat") HiveStorageFormat partitionStorageFormat, + @JsonProperty("full") boolean full, + @JsonProperty("unify") boolean unify, + @JsonProperty("ranges") Map> ranges) + { + super( + schemaName, + tableName, + inputColumns, + pageSinkMetadata, + locationHandle, + bucketProperty, + tableStorageFormat, + partitionStorageFormat, + false); + this.full = full; + this.unify = unify; + this.ranges = ranges; + } + + /** + * This will add the specified range of compacted files to be written. + * There is no way to identify the range of delta files involved in compaction for all batches in worker side. + * Especially useful, when bucketed files are not present in all bucketed batches + * Usually there will be only one range per partition. When multiple partitions are involved, + * ranges has to be corrected involving all together. + */ + synchronized void addRange(String partitionName, Range range) + { + requireNonNull(partitionName, "Partition name is null"); + if (ranges == null) { + ranges = new HashMap<>(); + } + List partitionRanges = ranges.get(partitionName); + if (partitionRanges == null) { + partitionRanges = new ArrayList<>(); + ranges.put(partitionName, partitionRanges); + } + addRange(range, partitionRanges); + } + + static void addRange(Range range, List ranges) + { + List suitableRange = getSuitableRange(range, ranges); + if (!suitableRange.isEmpty()) { + //Already there is an entry to cover this. + return; + } + /* + * Filter out the entries which are overalaps with (part of) current entry and expand such entry to cover both + * ranges, and later remove all other entries which are covered by expanded entry. 
+ */ + List expandableEntries = ranges.stream() + .filter(r -> ((range.getMin() >= r.getMin() + && range.getMax() > r.getMax() + && range.getMin() <= r.getMax()) + || (range.getMax() <= r.getMax() + && range.getMin() < r.getMin() + && range.getMax() >= r.getMin()))) + .collect(toList()); + if (expandableEntries.isEmpty()) { + ranges.add(range); + } + else { + long min = range.getMin(); + long max = range.getMax(); + for (Range expandableRange : expandableEntries) { + min = Math.min(expandableRange.getMin(), min); + max = Math.max(expandableRange.getMax(), max); + ranges.remove(expandableRange); + } + Range expandedRange = new Range(min, max); + ranges.add(expandedRange); + } + //Remove duplicate ranges. + Collections.sort(ranges); + long current = 0; + for (Iterator it = ranges.iterator(); it.hasNext(); ) { + Range next = it.next(); + if (next.getMax() > current) { + current = next.getMax(); + } + else { + it.remove(); + } + } + } + + List getSuitableRange(String partitionName, Range range) + { + List partitionRanges = this.ranges.get(partitionName); + if (partitionRanges == null || partitionRanges.isEmpty()) { + return partitionRanges; + } + return getSuitableRange(range, partitionRanges); + } + + static List getSuitableRange(Range range, List ranges) + { + return ranges.stream() + .filter(r -> (range.getMin() >= r.getMin() + && range.getMax() <= r.getMax())) + .collect(toList()); + } + + @JsonProperty("full") + public boolean isFullVacuum() + { + return full; + } + + @JsonProperty("unify") + public boolean isUnifyVacuum() + { + return unify; + } + + @JsonProperty("ranges") + public synchronized Map> getRanges() + { + return ranges; + } + + public static class Range + implements Comparable + { + private final long min; + private final long max; + + @JsonCreator + public Range( + @JsonProperty("min") long min, + @JsonProperty("max") long max) + { + this.min = min; + this.max = max; + } + + @JsonProperty("min") + public long getMin() + { + return min; + } + + @JsonProperty("max") + public long getMax() + { + return max; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Range range = (Range) o; + return min == range.min && + max == range.max; + } + + @Override + public int hashCode() + { + return Objects.hash(min, max); + } + + /** + * Sorted in Ascending order of min and Descending order of Max to find out the overalapping ranges. + */ + @Override + public int compareTo(Object o) + { + Range other = (Range) o; + return min < other.getMin() ? -1 : min > other.getMin() ? 1 : + max > other.getMax() ? -1 : 1; + } + + @Override + public String toString() + { + return "Range{" + + "min=" + min + + ", max=" + max + + '}'; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveViewNotSupportedException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveViewNotSupportedException.java new file mode 100644 index 00000000..226f0a5f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveViewNotSupportedException.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
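To make the addRange() merging rule above concrete: adding [1,5] then [4,9] expands to a single [1,9], and a later [2,3] is ignored because it is already covered. The sketch below is a simplified standalone version of that rule (long[]{min, max} stands in for HiveVacuumTableHandle.Range, and the overlap handling is condensed into one pass), not the connector's actual implementation.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.List;

    // Illustrative only: a simplified version of the write-id range merging in addRange().
    public class RangeMergeSketch
    {
        static void addRange(long[] range, List<long[]> ranges)
        {
            // Already fully covered by an existing entry: nothing to do.
            if (ranges.stream().anyMatch(r -> range[0] >= r[0] && range[1] <= r[1])) {
                return;
            }
            // Expand the new entry over everything it overlaps, then drop the overlapped entries.
            long min = range[0];
            long max = range[1];
            for (Iterator<long[]> it = ranges.iterator(); it.hasNext(); ) {
                long[] r = it.next();
                if (range[0] <= r[1] && range[1] >= r[0]) {
                    min = Math.min(min, r[0]);
                    max = Math.max(max, r[1]);
                    it.remove();
                }
            }
            ranges.add(new long[] {min, max});
            ranges.sort(Comparator.comparingLong((long[] r) -> r[0]));
        }

        public static void main(String[] args)
        {
            List<long[]> ranges = new ArrayList<>();
            addRange(new long[] {1, 5}, ranges);
            addRange(new long[] {4, 9}, ranges);   // overlaps [1,5] -> merged into [1,9]
            addRange(new long[] {2, 3}, ranges);   // already covered -> ignored
            ranges.forEach(r -> System.out.println("[" + r[0] + ", " + r[1] + "]"));
        }
    }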
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.NotFoundException; +import io.prestosql.spi.connector.SchemaTableName; + +import static java.lang.String.format; + +public class HiveViewNotSupportedException + extends NotFoundException +{ + private final SchemaTableName tableName; + + public HiveViewNotSupportedException(SchemaTableName tableName) + { + this(tableName, format("Hive views are not supported: '%s'", tableName)); + } + + public HiveViewNotSupportedException(SchemaTableName tableName, String message) + { + super(message); + this.tableName = tableName; + } + + public SchemaTableName getTableName() + { + return tableName; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWritableTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWritableTableHandle.java new file mode 100644 index 00000000..7b22c909 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWritableTableHandle.java @@ -0,0 +1,126 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.spi.connector.SchemaTableName; + +import java.util.List; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public class HiveWritableTableHandle +{ + private final String schemaName; + private final String tableName; + private final List inputColumns; + private final HivePageSinkMetadata pageSinkMetadata; + private final LocationHandle locationHandle; + private final Optional bucketProperty; + private final HiveStorageFormat tableStorageFormat; + private final HiveStorageFormat partitionStorageFormat; + private final boolean isOverwrite; + + public HiveWritableTableHandle( + String schemaName, + String tableName, + List inputColumns, + HivePageSinkMetadata pageSinkMetadata, + LocationHandle locationHandle, + Optional bucketProperty, + HiveStorageFormat tableStorageFormat, + HiveStorageFormat partitionStorageFormat, + boolean isOverwrite) + { + this.schemaName = requireNonNull(schemaName, "schemaName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.inputColumns = ImmutableList.copyOf(requireNonNull(inputColumns, "inputColumns is null")); + this.pageSinkMetadata = requireNonNull(pageSinkMetadata, "pageSinkMetadata is null"); + this.locationHandle = requireNonNull(locationHandle, "locationHandle is null"); + this.bucketProperty = requireNonNull(bucketProperty, "bucketProperty is null"); + this.tableStorageFormat = requireNonNull(tableStorageFormat, "tableStorageFormat is null"); + this.partitionStorageFormat = requireNonNull(partitionStorageFormat, "partitionStorageFormat is null"); + this.isOverwrite = isOverwrite; + } + + @JsonProperty + public String getSchemaName() + { + return schemaName; + } + + @JsonProperty + public String getTableName() + { + return tableName; + } + + @JsonIgnore + public SchemaTableName getSchemaTableName() + { + return new SchemaTableName(schemaName, tableName); + } + + @JsonProperty + public List getInputColumns() + { + return inputColumns; + } + + @JsonProperty + public HivePageSinkMetadata getPageSinkMetadata() + { + return pageSinkMetadata; + } + + @JsonProperty + public LocationHandle getLocationHandle() + { + return locationHandle; + } + + @JsonProperty + public Optional getBucketProperty() + { + return bucketProperty; + } + + @JsonProperty + public HiveStorageFormat getTableStorageFormat() + { + return tableStorageFormat; + } + + @JsonProperty + public HiveStorageFormat getPartitionStorageFormat() + { + return partitionStorageFormat; + } + + @JsonProperty + public boolean getIsOverwrite() + { + return isOverwrite; + } + + @Override + public String toString() + { + return schemaName + "." + tableName; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriteUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriteUtils.java new file mode 100644 index 00000000..6bf6cc7a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriteUtils.java @@ -0,0 +1,738 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.base.CharMatcher; +import com.google.common.collect.ImmutableList; +import com.google.common.primitives.Shorts; +import com.google.common.primitives.SignedBytes; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.parquet.ParquetRecordWriter; +import io.prestosql.plugin.hive.s3.HiveS3Module; +import io.prestosql.plugin.hive.s3.PrestoS3FileSystem; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.StandardErrorCode; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaNotFoundException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.BooleanType; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DateType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Decimals; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.IntegerType; +import io.prestosql.spi.type.RealType; +import io.prestosql.spi.type.SmallintType; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.TinyintType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.VarbinaryType; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FilterFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.viewfs.ViewFileSystem; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.ProtectMode; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.Serializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import 
org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Reporter; + +import java.io.IOException; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; + +import static com.google.common.base.Strings.padEnd; +import static com.google.common.io.BaseEncoding.base16; +import static io.prestosql.plugin.hive.HiveUtil.checkCondition; +import static io.prestosql.plugin.hive.HiveUtil.isArrayType; +import static io.prestosql.plugin.hive.HiveUtil.isMapType; +import static io.prestosql.plugin.hive.HiveUtil.isRowType; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.type.Chars.isCharType; +import static java.lang.Float.intBitsToFloat; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.UUID.randomUUID; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT; +import static org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE; +import static org.apache.hadoop.hive.ql.io.AcidUtils.isTransactionalTable; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; +import static 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableByteObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableDateObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableFloatObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableHiveCharObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableIntObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableLongObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableShortObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableStringObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getVarcharTypeInfo; + +public final class HiveWriteUtils +{ + @SuppressWarnings("OctalInteger") + private static final FsPermission ALL_PERMISSIONS = new FsPermission((short) 0777); + + private HiveWriteUtils() + { + } + + public enum OpertionType { + CREATE_TABLE, + CREATE_TABLE_AS, + INSERT, + INSERT_OVERWRITE + } + + public static RecordWriter createRecordWriter(Path target, JobConf conf, Properties properties, String outputFormatName, ConnectorSession session) + { + try { + boolean compress = HiveConf.getBoolVar(conf, COMPRESSRESULT); + if (outputFormatName.equals(MapredParquetOutputFormat.class.getName())) { + return ParquetRecordWriter.create(target, conf, properties, session); + } + Object writer = Class.forName(outputFormatName).getConstructor().newInstance(); + return ((HiveOutputFormat) writer).getHiveRecordWriter(conf, target, Text.class, compress, properties, Reporter.NULL); + } + catch (IOException | ReflectiveOperationException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_DATA_ERROR, e); + } + } + + public static Serializer initializeSerializer(Configuration conf, Properties properties, String serializerName) + { + try { + Serializer result = (Serializer) Class.forName(serializerName).getConstructor().newInstance(); + result.initialize(conf, properties); + return result; + } + catch (ClassNotFoundException e) { + throw new PrestoException(HiveErrorCode.HIVE_SERDE_NOT_FOUND, "Serializer does not exist: " + serializerName); + } + catch (SerDeException | ReflectiveOperationException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_DATA_ERROR, e); + } + } + + public static ObjectInspector getJavaObjectInspector(Type type) + { + if (type.equals(BooleanType.BOOLEAN)) { + return javaBooleanObjectInspector; + } + if (type.equals(BigintType.BIGINT)) { + return javaLongObjectInspector; + } + if (type.equals(IntegerType.INTEGER)) { + return javaIntObjectInspector; + } + if (type.equals(SmallintType.SMALLINT)) { + return javaShortObjectInspector; + } + if (type.equals(TinyintType.TINYINT)) { + return 
javaByteObjectInspector; + } + if (type.equals(RealType.REAL)) { + return javaFloatObjectInspector; + } + if (type.equals(DoubleType.DOUBLE)) { + return javaDoubleObjectInspector; + } + if (type instanceof VarcharType) { + return writableStringObjectInspector; + } + if (type instanceof CharType) { + return writableHiveCharObjectInspector; + } + if (type.equals(VarbinaryType.VARBINARY)) { + return javaByteArrayObjectInspector; + } + if (type.equals(DateType.DATE)) { + return javaDateObjectInspector; + } + if (type.equals(TimestampType.TIMESTAMP)) { + return javaTimestampObjectInspector; + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + return getPrimitiveJavaObjectInspector(new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale())); + } + if (isArrayType(type)) { + return ObjectInspectorFactory.getStandardListObjectInspector(getJavaObjectInspector(type.getTypeParameters().get(0))); + } + if (isMapType(type)) { + ObjectInspector keyObjectInspector = getJavaObjectInspector(type.getTypeParameters().get(0)); + ObjectInspector valueObjectInspector = getJavaObjectInspector(type.getTypeParameters().get(1)); + return ObjectInspectorFactory.getStandardMapObjectInspector(keyObjectInspector, valueObjectInspector); + } + if (isRowType(type)) { + return ObjectInspectorFactory.getStandardStructObjectInspector( + type.getTypeSignature().getParameters().stream() + .map(parameter -> parameter.getNamedTypeSignature().getName().get()) + .collect(toList()), + type.getTypeParameters().stream() + .map(HiveWriteUtils::getJavaObjectInspector) + .collect(toList())); + } + throw new IllegalArgumentException("unsupported type: " + type); + } + + public static List createPartitionValues(List partitionColumnTypes, Page partitionColumns, int position) + { + ImmutableList.Builder partitionValues = ImmutableList.builder(); + for (int field = 0; field < partitionColumns.getChannelCount(); field++) { + Object value = getField(partitionColumnTypes.get(field), partitionColumns.getBlock(field), position); + if (value == null) { + partitionValues.add(HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION); + } + else { + String valueString = value.toString(); + if (!CharMatcher.inRange((char) 0x20, (char) 0x7E).matchesAllOf(valueString)) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_PARTITION_VALUE, + "Hive partition keys can only contain printable ASCII characters (0x20 - 0x7E). 
Invalid value: " + + base16().withSeparator(" ", 2).encode(valueString.getBytes(UTF_8))); + } + partitionValues.add(valueString); + } + } + return partitionValues.build(); + } + + public static Object getField(Type type, Block block, int position) + { + if (block.isNull(position)) { + return null; + } + if (BooleanType.BOOLEAN.equals(type)) { + return type.getBoolean(block, position); + } + if (BigintType.BIGINT.equals(type)) { + return type.getLong(block, position); + } + if (IntegerType.INTEGER.equals(type)) { + return toIntExact(type.getLong(block, position)); + } + if (SmallintType.SMALLINT.equals(type)) { + return Shorts.checkedCast(type.getLong(block, position)); + } + if (TinyintType.TINYINT.equals(type)) { + return SignedBytes.checkedCast(type.getLong(block, position)); + } + if (RealType.REAL.equals(type)) { + return intBitsToFloat((int) type.getLong(block, position)); + } + if (DoubleType.DOUBLE.equals(type)) { + return type.getDouble(block, position); + } + if (type instanceof VarcharType) { + return new Text(type.getSlice(block, position).getBytes()); + } + if (type instanceof CharType) { + CharType charType = (CharType) type; + return new Text(padEnd(type.getSlice(block, position).toStringUtf8(), charType.getLength(), ' ')); + } + if (VarbinaryType.VARBINARY.equals(type)) { + return type.getSlice(block, position).getBytes(); + } + if (DateType.DATE.equals(type)) { + return Date.ofEpochDay(toIntExact(type.getLong(block, position))); + } + if (TimestampType.TIMESTAMP.equals(type)) { + return Timestamp.ofEpochMilli(type.getLong(block, position)); + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + return getHiveDecimal(decimalType, block, position); + } + if (isArrayType(type)) { + Type elementType = type.getTypeParameters().get(0); + + Block arrayBlock = block.getObject(position, Block.class); + + List list = new ArrayList<>(arrayBlock.getPositionCount()); + for (int i = 0; i < arrayBlock.getPositionCount(); i++) { + Object element = getField(elementType, arrayBlock, i); + list.add(element); + } + + return Collections.unmodifiableList(list); + } + if (isMapType(type)) { + Type keyType = type.getTypeParameters().get(0); + Type valueType = type.getTypeParameters().get(1); + + Block mapBlock = block.getObject(position, Block.class); + Map map = new HashMap<>(); + for (int i = 0; i < mapBlock.getPositionCount(); i += 2) { + Object key = getField(keyType, mapBlock, i); + Object value = getField(valueType, mapBlock, i + 1); + map.put(key, value); + } + + return Collections.unmodifiableMap(map); + } + if (isRowType(type)) { + Block rowBlock = block.getObject(position, Block.class); + + List fieldTypes = type.getTypeParameters(); + checkCondition(fieldTypes.size() == rowBlock.getPositionCount(), StandardErrorCode.GENERIC_INTERNAL_ERROR, "Expected row value field count does not match type field count"); + + List row = new ArrayList<>(rowBlock.getPositionCount()); + for (int i = 0; i < rowBlock.getPositionCount(); i++) { + Object element = getField(fieldTypes.get(i), rowBlock, i); + row.add(element); + } + + return Collections.unmodifiableList(row); + } + throw new PrestoException(NOT_SUPPORTED, "unsupported type: " + type); + } + + public static void checkTableIsWritable(Table table, boolean writesToNonManagedTablesEnabled, HiveACIDWriteType writeType) + { + if (!writesToNonManagedTablesEnabled && !table.getTableType().equals(MANAGED_TABLE.toString())) { + throw new PrestoException(NOT_SUPPORTED, "Cannot write to non-managed Hive table"); + } + + 
checkWritable( + table.getSchemaTableName(), + Optional.empty(), + MetastoreUtil.getProtectMode(table), + table.getParameters(), + table.getStorage(), + writeType); + } + + public static void checkPartitionIsWritable(String partitionName, Partition partition) + { + checkWritable( + partition.getSchemaTableName(), + Optional.of(partitionName), + MetastoreUtil.getProtectMode(partition), + partition.getParameters(), + partition.getStorage(), + HiveACIDWriteType.INSERT); + } + + private static void checkWritable( + SchemaTableName tableName, + Optional partitionName, + ProtectMode protectMode, + Map parameters, + Storage storage, + HiveACIDWriteType writeType) + { + String tablePartitionDescription = "Table '" + tableName + "'"; + if (partitionName.isPresent()) { + tablePartitionDescription += " partition '" + partitionName.get() + "'"; + } + + // verify online + MetastoreUtil.verifyOnline(tableName, partitionName, protectMode, parameters); + + // verify not read only + if (protectMode.readOnly) { + throw new HiveReadOnlyException(tableName, partitionName); + } + + // verify skew info + if (storage.isSkewed()) { + throw new PrestoException(NOT_SUPPORTED, format("Inserting into bucketed tables with skew is not supported. %s", tablePartitionDescription)); + } + + // verify transactional for update + if (writeType == HiveACIDWriteType.UPDATE) { + if (!isTransactionalTable(parameters)) { + throw new PrestoException(NOT_SUPPORTED, "Updates to Hive Non-transactional tables are not supported: " + tableName); + } + else if (AcidUtils.isInsertOnlyTable(parameters)) { + throw new PrestoException(NOT_SUPPORTED, "Updates to Hive InsertOnly-transactional tables are not supported: " + tableName); + } + } + // verify transactional for Vacuum + if (HiveACIDWriteType.isVacuum(writeType) && !isTransactionalTable(parameters)) { + throw new PrestoException(NOT_SUPPORTED, "Vacuum on Hive Non-transactional tables are not supported: " + tableName); + } + if (HiveACIDWriteType.VACUUM_UNIFY == writeType && + storage.getBucketProperty().isPresent()) { + throw new PrestoException(NOT_SUPPORTED, String.format("Vacuum merge on Bucketed Hive table %s not supported", tableName)); + } + } + + public static Path getTableDefaultLocation(HdfsContext context, SemiTransactionalHiveMetastore metastore, HdfsEnvironment hdfsEnvironment, String schemaName, String tableName) + { + Database database = metastore.getDatabase(schemaName) + .orElseThrow(() -> new SchemaNotFoundException(schemaName)); + + return getTableDefaultLocation(database, context, hdfsEnvironment, schemaName, tableName); + } + + public static Path getTableDefaultLocation(Database database, HdfsContext context, HdfsEnvironment hdfsEnvironment, String schemaName, String tableName) + { + Optional location = database.getLocation(); + if (!location.isPresent() || location.get().isEmpty()) { + throw new PrestoException(HiveErrorCode.HIVE_DATABASE_LOCATION_ERROR, format("Database '%s' location is not set", schemaName)); + } + + Path databasePath = new Path(location.get()); + if (!isS3FileSystem(context, hdfsEnvironment, databasePath)) { + if (!pathExists(context, hdfsEnvironment, databasePath)) { + throw new PrestoException(HiveErrorCode.HIVE_DATABASE_LOCATION_ERROR, format("Database '%s' location does not exist: %s", schemaName, databasePath)); + } + if (!isDirectory(context, hdfsEnvironment, databasePath)) { + throw new PrestoException(HiveErrorCode.HIVE_DATABASE_LOCATION_ERROR, format("Database '%s' location is not a directory: %s", schemaName, databasePath)); + } + } + 
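Earlier in this file, createPartitionValues() rejects dynamic partition values containing non-printable characters and maps nulls to the default partition name. The sketch below mirrors that rule in plain Java (the real code uses Guava's CharMatcher); the constant name and its value are assumptions based on Hive's usual default partition name.

    // Illustrative only: the printable-ASCII rule enforced on dynamic partition values.
    public class PartitionValueCheckSketch
    {
        static final String HIVE_DEFAULT_DYNAMIC_PARTITION = "__HIVE_DEFAULT_PARTITION__"; // assumed value

        static String toPartitionValue(Object value)
        {
            if (value == null) {
                return HIVE_DEFAULT_DYNAMIC_PARTITION;     // nulls map to the default partition name
            }
            String valueString = value.toString();
            for (char c : valueString.toCharArray()) {
                if (c < 0x20 || c > 0x7E) {
                    throw new IllegalArgumentException(
                            "Hive partition keys can only contain printable ASCII characters (0x20 - 0x7E): " + valueString);
                }
            }
            return valueString;
        }

        public static void main(String[] args)
        {
            System.out.println(toPartitionValue(20220101));   // ok
            System.out.println(toPartitionValue(null));       // __HIVE_DEFAULT_PARTITION__
        }
    }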
+ return new Path(databasePath, tableName); + } + + public static boolean pathExists(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + return hdfsEnvironment.getFileSystem(context, path).exists(path); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed checking path: " + path, e); + } + } + + public static FileStatus getFileStatus(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + return hdfsEnvironment.getFileSystem(context, path).getFileStatus(path); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed getting listStatus: " + path, e); + } + } + + public static FileStatus[] getChildren(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + return hdfsEnvironment.getFileSystem(context, path).listStatus(path); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed getting listStatus: " + path, e); + } + } + + public static boolean isS3FileSystem(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + FileSystem fileSystem = getRawFileSystem(hdfsEnvironment.getFileSystem(context, path)); + return fileSystem instanceof PrestoS3FileSystem || fileSystem.getClass().getName().equals(HiveS3Module.EMR_FS_CLASS_NAME); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed checking path: " + path, e); + } + } + + public static boolean isViewFileSystem(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + return getRawFileSystem(hdfsEnvironment.getFileSystem(context, path)) instanceof ViewFileSystem; + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed checking path: " + path, e); + } + } + + private static FileSystem getRawFileSystem(FileSystem fileSystem) + { + if (fileSystem instanceof FilterFileSystem) { + return getRawFileSystem(((FilterFileSystem) fileSystem).getRawFileSystem()); + } + return fileSystem; + } + + private static boolean isDirectory(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + return hdfsEnvironment.getFileSystem(context, path).isDirectory(path); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed checking path: " + path, e); + } + } + + public static boolean isHdfsEncrypted(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + FileSystem fileSystem = getRawFileSystem(hdfsEnvironment.getFileSystem(context, path)); + if (fileSystem instanceof DistributedFileSystem) { + return ((DistributedFileSystem) fileSystem).getEZForPath(path) != null; + } + return false; + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed checking encryption status for path: " + path, e); + } + } + + public static Path createTemporaryPath(ConnectorSession session, HdfsContext context, HdfsEnvironment hdfsEnvironment, Path targetPath, OpertionType type) + { + String temporaryPrefix; + String stagingDirectoryName; + + // use relative temporary directory on ViewFS + if (isViewFileSystem(context, hdfsEnvironment, targetPath)) { + temporaryPrefix = ".hive-staging"; + stagingDirectoryName = randomUUID().toString(); + } + else { + temporaryPrefix = ((type != OpertionType.INSERT) ? 
targetPath.getParent() : targetPath).toString(); + if ((type == OpertionType.INSERT) && !pathExists(temporaryPrefix, context, hdfsEnvironment)) { + temporaryPrefix = targetPath.getParent().toString(); + } + String queryId = context.getQueryId().isPresent() ? context.getQueryId().get() : ""; + stagingDirectoryName = ".staging-" + queryId + "-" + randomUUID().toString(); + } + + // create a temporary directory on the same filesystem + Path temporaryRoot = new Path(targetPath, temporaryPrefix); + Path temporaryPath = new Path(temporaryRoot, stagingDirectoryName); + + createDirectory(context, hdfsEnvironment, temporaryPath); + return temporaryPath; + } + + private static boolean pathExists(String location, HdfsContext context, HdfsEnvironment hdfsEnvironment) + { + Path path = new Path(location); + try { + if (hdfsEnvironment.getFileSystem(context, path).exists(path)) { + return true; + } + } + catch (IOException e) { + return false; + } + return false; + } + + public static void createDirectory(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + try { + if (!hdfsEnvironment.getFileSystem(context, path).mkdirs(path, ALL_PERMISSIONS)) { + throw new IOException("mkdirs returned false"); + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed to create directory: " + path, e); + } + + // explicitly set permission since the default umask overrides it on creation + try { + hdfsEnvironment.getFileSystem(context, path).setPermission(path, ALL_PERMISSIONS); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed to set permission on directory: " + path, e); + } + } + + public static boolean isWritableType(HiveType hiveType) + { + return isWritableType(hiveType.getTypeInfo()); + } + + private static boolean isWritableType(TypeInfo typeInfo) + { + switch (typeInfo.getCategory()) { + case PRIMITIVE: + PrimitiveCategory primitiveCategory = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory(); + return isWritablePrimitiveType(primitiveCategory); + case MAP: + MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo; + return isWritableType(mapTypeInfo.getMapKeyTypeInfo()) && isWritableType(mapTypeInfo.getMapValueTypeInfo()); + case LIST: + ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; + return isWritableType(listTypeInfo.getListElementTypeInfo()); + case STRUCT: + StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; + return structTypeInfo.getAllStructFieldTypeInfos().stream().allMatch(HiveWriteUtils::isWritableType); + } + return false; + } + + private static boolean isWritablePrimitiveType(PrimitiveCategory primitiveCategory) + { + switch (primitiveCategory) { + case BOOLEAN: + case LONG: + case INT: + case SHORT: + case BYTE: + case FLOAT: + case DOUBLE: + case STRING: + case DATE: + case TIMESTAMP: + case BINARY: + case DECIMAL: + case VARCHAR: + case CHAR: + return true; + } + return false; + } + + public static List getRowColumnInspectors(List types) + { + return types.stream() + .map(HiveWriteUtils::getRowColumnInspector) + .collect(toList()); + } + + public static ObjectInspector getRowColumnInspector(Type type) + { + if (type.equals(BooleanType.BOOLEAN)) { + return writableBooleanObjectInspector; + } + + if (type.equals(BigintType.BIGINT)) { + return writableLongObjectInspector; + } + + if (type.equals(IntegerType.INTEGER)) { + return writableIntObjectInspector; + } + + if (type.equals(SmallintType.SMALLINT)) { + return writableShortObjectInspector; + } + + if 
(type.equals(TinyintType.TINYINT)) { + return writableByteObjectInspector; + } + + if (type.equals(RealType.REAL)) { + return writableFloatObjectInspector; + } + + if (type.equals(DoubleType.DOUBLE)) { + return writableDoubleObjectInspector; + } + + if (type instanceof VarcharType) { + VarcharType varcharType = (VarcharType) type; + if (varcharType.isUnbounded()) { + // Unbounded VARCHAR is not supported by Hive. + // Values for such columns must be stored as STRING in Hive + return writableStringObjectInspector; + } + if (varcharType.getBoundedLength() <= HiveVarchar.MAX_VARCHAR_LENGTH) { + // VARCHAR columns with the length less than or equal to 65535 are supported natively by Hive + return getPrimitiveWritableObjectInspector(getVarcharTypeInfo(varcharType.getBoundedLength())); + } + } + + if (isCharType(type)) { + CharType charType = (CharType) type; + int charLength = charType.getLength(); + return getPrimitiveWritableObjectInspector(getCharTypeInfo(charLength)); + } + + if (type.equals(VarbinaryType.VARBINARY)) { + return writableBinaryObjectInspector; + } + + if (type.equals(DateType.DATE)) { + return writableDateObjectInspector; + } + + if (type.equals(TimestampType.TIMESTAMP)) { + return writableTimestampObjectInspector; + } + + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + return getPrimitiveWritableObjectInspector(new DecimalTypeInfo(decimalType.getPrecision(), decimalType.getScale())); + } + + if (isArrayType(type) || isMapType(type) || isRowType(type)) { + return getJavaObjectInspector(type); + } + + throw new IllegalArgumentException("unsupported type: " + type); + } + + public static HiveDecimal getHiveDecimal(DecimalType decimalType, Block block, int position) + { + BigInteger unscaledValue; + if (decimalType.isShort()) { + unscaledValue = BigInteger.valueOf(decimalType.getLong(block, position)); + } + else { + unscaledValue = Decimals.decodeUnscaledValue(decimalType.getSlice(block, position)); + } + return HiveDecimal.create(unscaledValue, decimalType.getScale()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriter.java new file mode 100644 index 00000000..247faeb3 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriter.java @@ -0,0 +1,161 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
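The createTemporaryPath() method above derives a per-query staging directory next to the write target (a relative ".hive-staging" prefix on ViewFS, otherwise a ".staging-<queryId>-<uuid>" directory). The following standalone sketch only reproduces the naming scheme and omits the HDFS existence and permission handling; the helper names are illustrative assumptions.

    import java.util.UUID;

    // Illustrative only: how the staging directory name is derived in createTemporaryPath().
    public class StagingPathSketch
    {
        static String stagingDirectoryName(String queryId, boolean onViewFileSystem)
        {
            if (onViewFileSystem) {
                // ViewFS: relative ".hive-staging" prefix with a random directory name
                return ".hive-staging/" + UUID.randomUUID();
            }
            return ".staging-" + queryId + "-" + UUID.randomUUID();
        }

        public static void main(String[] args)
        {
            System.out.println(stagingDirectoryName("20220414_000000_00000_abcde", false));
        }
    }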
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.Page; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.function.Consumer; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public class HiveWriter +{ + private final HiveFileWriter fileWriter; + private final Optional partitionName; + private final PartitionUpdate.UpdateMode updateMode; + private final String fileName; + private final String writePath; + private final String targetPath; + private final String filePath; + private final Consumer onCommit; + private final HiveWriterStats hiveWriterStats; + + private List extraPartitionFiles; + private List miscData; + private long rowCount; + private long inputSizeInBytes; + + public HiveWriter( + HiveFileWriter fileWriter, + Optional partitionName, + PartitionUpdate.UpdateMode updateMode, + String fileName, + String writePath, + String targetPath, + String filePath, + Consumer onCommit, + HiveWriterStats hiveWriterStats, + List extraPartitionFiles) + { + this.fileWriter = requireNonNull(fileWriter, "fileWriter is null"); + this.partitionName = requireNonNull(partitionName, "partitionName is null"); + this.updateMode = requireNonNull(updateMode, "updateMode is null"); + this.fileName = requireNonNull(fileName, "fileName is null"); + this.writePath = requireNonNull(writePath, "writePath is null"); + this.targetPath = requireNonNull(targetPath, "targetPath is null"); + this.filePath = requireNonNull(filePath, "filePath is null"); + this.onCommit = requireNonNull(onCommit, "onCommit is null"); + this.hiveWriterStats = hiveWriterStats; + this.extraPartitionFiles = (extraPartitionFiles != null) ? 
ImmutableList.copyOf(extraPartitionFiles) : Collections.emptyList(); + this.miscData = Collections.emptyList(); + } + + public String getFilePath() + { + return filePath; + } + + public long getWrittenBytes() + { + return fileWriter.getWrittenBytes(); + } + + public long getSystemMemoryUsage() + { + return fileWriter.getSystemMemoryUsage(); + } + + public long getRowCount() + { + return rowCount; + } + + public void append(Page dataPage) + { + // getRegionSizeInBytes for each row can be expensive; use getRetainedSizeInBytes for estimation + if (hiveWriterStats != null) { + hiveWriterStats.addInputPageSizesInBytes(dataPage.getRetainedSizeInBytes()); + } + fileWriter.appendRows(dataPage); + rowCount += dataPage.getPositionCount(); + inputSizeInBytes += dataPage.getSizeInBytes(); + } + + public void commit() + { + fileWriter.commit(); + + /* Fetch again to confirm if some new files added as part of the operations */ + Set set = new HashSet<>(extraPartitionFiles); + set.addAll(fileWriter.getExtraPartitionFiles()); + + extraPartitionFiles = ImmutableList.copyOf(set); + + miscData = fileWriter.getMiscData(); + + onCommit.accept(this); + } + + long getValidationCpuNanos() + { + return fileWriter.getValidationCpuNanos(); + } + + public Optional getVerificationTask() + { + return fileWriter.getVerificationTask(); + } + + public void rollback() + { + fileWriter.rollback(); + } + + public PartitionUpdate getPartitionUpdate() + { + return new PartitionUpdate( + partitionName.orElse(""), + updateMode, + writePath, + targetPath, + ImmutableList.builder().addAll(extraPartitionFiles).add(fileName).build(), + rowCount, + inputSizeInBytes, + fileWriter.getWrittenBytes(), + miscData); + } + + @VisibleForTesting + public HiveFileWriter getFileWriter() + { + return fileWriter; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("fileWriter", fileWriter) + .add("filePath", writePath + "/" + fileName) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterFactory.java new file mode 100644 index 00000000..b317099e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterFactory.java @@ -0,0 +1,1013 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
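HiveWriter above accumulates row and byte counts per appended page and, on commit, folds them into the PartitionUpdate for its file. The sketch below shows only that bookkeeping in plain Java; the method and field names mirror the class above, but the page abstraction is a stand-in, not io.prestosql.spi.Page.

    // Illustrative only: the per-page bookkeeping HiveWriter performs before building
    // a PartitionUpdate on commit.
    public class WriterBookkeepingSketch
    {
        private long rowCount;
        private long inputSizeInBytes;

        void append(int positionCount, long sizeInBytes)
        {
            rowCount += positionCount;
            inputSizeInBytes += sizeInBytes;
        }

        public static void main(String[] args)
        {
            WriterBookkeepingSketch writer = new WriterBookkeepingSketch();
            writer.append(1024, 64_000);
            writer.append(512, 32_000);
            // On commit(), these totals go into the PartitionUpdate for this partition/file.
            System.out.println(writer.rowCount + " rows, " + writer.inputSizeInBytes + " bytes");
        }
    }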
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import io.airlift.event.client.EventClient; +import io.airlift.log.Logger; +import io.airlift.units.DataSize; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.HiveSessionProperties.InsertExistingPartitionsBehavior; +import io.prestosql.plugin.hive.LocationService.WriteInfo; +import io.prestosql.plugin.hive.PartitionUpdate.UpdateMode; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadataProvider; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.orc.HdfsOrcDataSource; +import io.prestosql.plugin.hive.util.TempFileReader; +import io.prestosql.spi.NodeManager; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageSorter; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.session.PropertyMetadata; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.IOConstants; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.DefaultCodec; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hive.common.util.ReflectionUtil; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.security.Principal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Set; +import java.util.function.Consumer; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_READ_ONLY; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_TABLE_READ_ONLY; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; +import static 
io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_OPEN_ERROR; +import static io.prestosql.plugin.hive.HiveUtil.getColumnNames; +import static io.prestosql.plugin.hive.HiveUtil.getColumnTypes; +import static io.prestosql.plugin.hive.HiveWriteUtils.createPartitionValues; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY; +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.getHiveSchema; +import static io.prestosql.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat; +import static io.prestosql.plugin.hive.util.ConfigurationUtils.toJobConf; +import static io.prestosql.spi.StandardErrorCode.NOT_FOUND; +import static java.lang.Math.min; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.UUID.randomUUID; +import static java.util.function.Function.identity; +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; +import static java.util.stream.Collectors.toMap; +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.COMPRESSRESULT; + +public class HiveWriterFactory +{ + private static Logger log = Logger.get(HiveWriterFactory.class); + + private static final int MAX_BUCKET_COUNT = 100_000; + private static final int BUCKET_NUMBER_PADDING = Integer.toString(MAX_BUCKET_COUNT - 1).length(); + + private final Set fileWriterFactories; + private final String schemaName; + private final String tableName; + + private final List dataColumns; + + private final List partitionColumnNames; + private final List partitionColumnTypes; + + private final HiveStorageFormat tableStorageFormat; + private final HiveStorageFormat partitionStorageFormat; + private final Map additionalTableParameters; + protected final LocationHandle locationHandle; + protected final LocationService locationService; + private final String queryId; + + private final HivePageSinkMetadataProvider pageSinkMetadataProvider; + private final TypeManager typeManager; + private final HdfsEnvironment hdfsEnvironment; + private final PageSorter pageSorter; + private final JobConf conf; + + private final Table table; + private final DataSize sortBufferSize; + private final int maxOpenSortFiles; + private final boolean immutablePartitions; + private final InsertExistingPartitionsBehavior insertExistingPartitionsBehavior; + private final DateTimeZone parquetTimeZone; + + private final ConnectorSession session; + private final OptionalInt bucketCount; + private final List sortedBy; + + private final NodeManager nodeManager; + private final EventClient eventClient; + private final Map sessionProperties; + + private final HiveWriterStats hiveWriterStats; + private final HiveACIDWriteType acidWriteType; + + private final OrcFileWriterFactory orcFileWriterFactory; + + // Snapshot: instead of writing to a "file", each snapshot is stored in a sub file "file.0", "file.1", etc. + // These sub files are then merged to the final file when the operator finishes. + private boolean isSnapshotEnabled; + // The snapshotSuffixes list records the "resumeCount" for each sub file index. + // File suffix includes both the resumeCount and the sub file index. + // This ensures that different runs create files with different names, to avoid any potential collision. 
+ private final List snapshotSuffixes = new ArrayList<>(); + private long resumeCount; + + public HiveWriterFactory( + Set fileWriterFactories, + String schemaName, + String tableName, + boolean isCreateTable, + HiveACIDWriteType acidWriteType, + List inputColumns, + HiveStorageFormat tableStorageFormat, + HiveStorageFormat partitionStorageFormat, + Map additionalTableParameters, + OptionalInt bucketCount, + List sortedBy, + LocationHandle locationHandle, + LocationService locationService, + String queryId, + HivePageSinkMetadataProvider pageSinkMetadataProvider, + TypeManager typeManager, + HdfsEnvironment hdfsEnvironment, + PageSorter pageSorter, + DataSize sortBufferSize, + int maxOpenSortFiles, + boolean immutablePartitions, + DateTimeZone parquetTimeZone, + ConnectorSession session, + NodeManager nodeManager, + EventClient eventClient, + HiveSessionProperties hiveSessionProperties, + HiveWriterStats hiveWriterStats, + OrcFileWriterFactory orcFileWriterFactory) + { + this.fileWriterFactories = ImmutableSet.copyOf(requireNonNull(fileWriterFactories, "fileWriterFactories is null")); + this.schemaName = requireNonNull(schemaName, "schemaName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + + this.tableStorageFormat = requireNonNull(tableStorageFormat, "tableStorageFormat is null"); + this.partitionStorageFormat = requireNonNull(partitionStorageFormat, "partitionStorageFormat is null"); + this.additionalTableParameters = ImmutableMap.copyOf(requireNonNull(additionalTableParameters, "additionalTableParameters is null")); + this.locationHandle = requireNonNull(locationHandle, "locationHandle is null"); + this.locationService = requireNonNull(locationService, "locationService is null"); + this.queryId = requireNonNull(queryId, "queryId is null"); + + this.pageSinkMetadataProvider = requireNonNull(pageSinkMetadataProvider, "pageSinkMetadataProvider is null"); + + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.pageSorter = requireNonNull(pageSorter, "pageSorter is null"); + this.sortBufferSize = requireNonNull(sortBufferSize, "sortBufferSize is null"); + this.maxOpenSortFiles = maxOpenSortFiles; + this.immutablePartitions = immutablePartitions; + + if (acidWriteType == HiveACIDWriteType.INSERT_OVERWRITE) { + //In case of ACID txn tables, dont delete old data. Just create new base in same partition. 
+ if (pageSinkMetadataProvider.getTable().isPresent() && + AcidUtils.isTransactionalTable(pageSinkMetadataProvider.getTable().get().getParameters())) { + this.insertExistingPartitionsBehavior = InsertExistingPartitionsBehavior.APPEND; + } + else { + this.insertExistingPartitionsBehavior = InsertExistingPartitionsBehavior.OVERWRITE; + } + } + else if (acidWriteType == HiveACIDWriteType.UPDATE) { + // if the write type is update, then ignore the session property + this.insertExistingPartitionsBehavior = InsertExistingPartitionsBehavior.APPEND; + } + else { + this.insertExistingPartitionsBehavior = HiveSessionProperties.getInsertExistingPartitionsBehavior(session); + } + + if (immutablePartitions) { + checkArgument(insertExistingPartitionsBehavior != InsertExistingPartitionsBehavior.APPEND, "insertExistingPartitionsBehavior cannot be APPEND"); + } + this.parquetTimeZone = requireNonNull(parquetTimeZone, "parquetTimeZone is null"); + + this.acidWriteType = acidWriteType; + // divide input columns into partition and data columns + requireNonNull(inputColumns, "inputColumns is null"); + ImmutableList.Builder partitionColumnNames = ImmutableList.builder(); + ImmutableList.Builder partitionColumnTypes = ImmutableList.builder(); + ImmutableList.Builder dataColumns = ImmutableList.builder(); + for (HiveColumnHandle column : inputColumns) { + HiveType hiveType = column.getHiveType(); + if (column.isPartitionKey()) { + partitionColumnNames.add(column.getName()); + partitionColumnTypes.add(typeManager.getType(column.getTypeSignature())); + } + else { + dataColumns.add(new DataColumn(column.getName(), hiveType)); + } + } + this.partitionColumnNames = partitionColumnNames.build(); + this.partitionColumnTypes = partitionColumnTypes.build(); + this.dataColumns = dataColumns.build(); + + Path writePath; + if (isCreateTable) { + this.table = null; + WriteInfo writeInfo = locationService.getQueryWriteInfo(locationHandle); + checkArgument(writeInfo.getWriteMode() != DIRECT_TO_TARGET_EXISTING_DIRECTORY, "CREATE TABLE write mode cannot be DIRECT_TO_TARGET_EXISTING_DIRECTORY"); + writePath = writeInfo.getWritePath(); + } + else { + Optional
<Table>
table = pageSinkMetadataProvider.getTable(); + if (!table.isPresent()) { + throw new PrestoException(HIVE_INVALID_METADATA, format("Table %s.%s was dropped during insert", schemaName, tableName)); + } + this.table = table.get(); + writePath = locationService.getQueryWriteInfo(locationHandle).getWritePath(); + } + + this.bucketCount = requireNonNull(bucketCount, "bucketCount is null"); + if (bucketCount.isPresent()) { + checkArgument(bucketCount.getAsInt() < MAX_BUCKET_COUNT, "bucketCount must be smaller than " + MAX_BUCKET_COUNT); + } + + this.sortedBy = ImmutableList.copyOf(requireNonNull(sortedBy, "sortedBy is null")); + + this.session = requireNonNull(session, "session is null"); + this.nodeManager = requireNonNull(nodeManager, "nodeManager is null"); + this.eventClient = requireNonNull(eventClient, "eventClient is null"); + + requireNonNull(hiveSessionProperties, "hiveSessionProperties is null"); + this.sessionProperties = hiveSessionProperties.getSessionProperties().stream() + .collect(toImmutableMap(PropertyMetadata::getName, + entry -> session.getProperty(entry.getName(), entry.getJavaType()).toString())); + + Configuration conf = hdfsEnvironment.getConfiguration(new HdfsContext(session, schemaName, tableName), writePath); + this.conf = toJobConf(conf); + + // make sure the FileSystem is created with the correct Configuration object + try { + hdfsEnvironment.getFileSystem(session.getUser(), writePath, conf); + } + catch (IOException e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, "Failed getting FileSystem: " + writePath, e); + } + + this.hiveWriterStats = requireNonNull(hiveWriterStats, "hiveWriterStats is null"); + + this.orcFileWriterFactory = requireNonNull(orcFileWriterFactory, "orcFileWriterFactory is null"); + + this.isSnapshotEnabled = session.isSnapshotEnabled(); + } + + JobConf getConf() + { + return conf; + } + + List getPartitionValues(Page partitionColumns, int position) + { + return createPartitionValues(partitionColumnTypes, partitionColumns, position); + } + + Optional getPartitionName(Page partitionColumns, int position) + { + List partitionValues = createPartitionValues(partitionColumnTypes, partitionColumns, position); + Optional partitionName; + if (!partitionColumnNames.isEmpty()) { + partitionName = Optional.of(FileUtils.makePartName(partitionColumnNames, partitionValues)); + } + else { + partitionName = Optional.empty(); + } + return partitionName; + } + + public HiveWriter createWriter(List partitionValues, OptionalInt bucketNumber, Optional vacuumOptions) + { + return createWriter(partitionValues, bucketNumber, vacuumOptions, false); + } + + public HiveWriter createWriterForSnapshotMerge(List partitionValues, OptionalInt bucketNumber, Optional vacuumOptions) + { + return createWriter(partitionValues, bucketNumber, vacuumOptions, true); + } + + private HiveWriter createWriter(List partitionValues, OptionalInt bucketNumber, Optional vacuumOptions, boolean forMerge) + { + boolean isTxnTable = isTxnTable(); + if (bucketCount.isPresent()) { + checkArgument(bucketNumber.isPresent(), "Bucket not provided for bucketed table"); + checkArgument(bucketNumber.getAsInt() < bucketCount.getAsInt(), "Bucket number %s must be less than bucket count %s", bucketNumber, bucketCount); + } + else { + checkArgument(isTxnTable || !bucketNumber.isPresent(), "Bucket number provided by for table that is not bucketed"); + } + + String fileName; + if (bucketNumber.isPresent()) { + fileName = computeBucketedFileName(queryId, bucketNumber.getAsInt()); + } + else { + // Snapshot: 
don't use UUID. File name needs to be deterministic. + if (isSnapshotEnabled) { + fileName = String.format("%s_%d_%d_%d", queryId, session.getTaskId().getAsInt(), session.getPipelineId().getAsInt(), session.getDriverId().getAsInt()); + } + else { + fileName = queryId + "_" + randomUUID(); + } + } + + Optional partitionName; + if (!partitionColumnNames.isEmpty()) { + partitionName = Optional.of(FileUtils.makePartName(partitionColumnNames, partitionValues)); + } + else { + partitionName = Optional.empty(); + } + + // attempt to get the existing partition (if this is an existing partitioned table) + Optional partition = Optional.empty(); + if (!partitionValues.isEmpty() && table != null) { + partition = pageSinkMetadataProvider.getPartition(partitionValues); + } + + UpdateMode updateMode; + Properties schema; + WriteInfo writeInfo; + StorageFormat outputStorageFormat; + if (!partition.isPresent()) { + if (table == null) { + // Write to: a new partition in a new partitioned table, + // or a new unpartitioned table. + updateMode = UpdateMode.NEW; + schema = new Properties(); + schema.setProperty(IOConstants.COLUMNS, dataColumns.stream() + .map(DataColumn::getName) + .collect(joining(","))); + schema.setProperty(IOConstants.COLUMNS_TYPES, dataColumns.stream() + .map(DataColumn::getHiveType) + .map(HiveType::getHiveTypeName) + .map(HiveTypeName::toString) + .collect(joining(":"))); + setAdditionalSchemaProperties(schema); + if (!partitionName.isPresent()) { + // new unpartitioned table + writeInfo = locationService.getTableWriteInfo(locationHandle, false); + } + else { + // a new partition in a new partitioned table + writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get()); + + if (!writeInfo.getWriteMode().isWritePathSameAsTargetPath()) { + // When target path is different from write path, + // verify that the target directory for the partition does not already exist + if (HiveWriteUtils.pathExists(new HdfsContext(session, schemaName, tableName), hdfsEnvironment, writeInfo.getTargetPath())) { + throw new PrestoException(HIVE_PATH_ALREADY_EXISTS, format( + "Target directory for new partition '%s' of table '%s.%s' already exists: %s", + partitionName, + schemaName, + tableName, + writeInfo.getTargetPath())); + } + } + } + } + else { + // Write to: a new partition in an existing partitioned table, + // or an existing unpartitioned table + if (partitionName.isPresent()) { + // a new partition in an existing partitioned table + updateMode = UpdateMode.NEW; + writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get()); + } + else { + switch (insertExistingPartitionsBehavior) { + case APPEND: + checkState(!immutablePartitions); + updateMode = UpdateMode.APPEND; + writeInfo = locationService.getTableWriteInfo(locationHandle, false); + break; + case OVERWRITE: + updateMode = UpdateMode.OVERWRITE; + writeInfo = locationService.getTableWriteInfo(locationHandle, true); + break; + case ERROR: + throw new PrestoException(HIVE_TABLE_READ_ONLY, "Unpartitioned Hive tables are immutable"); + default: + throw new IllegalArgumentException("Unsupported insert existing table behavior: " + insertExistingPartitionsBehavior); + } + } + + schema = getHiveSchema(table); + } + + if (partitionName.isPresent()) { + // Write to a new partition + outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat); + } + else { + // Write to a new/existing unpartitioned table + outputStorageFormat = fromHiveStorageFormat(tableStorageFormat); + } + } 
+ else { + // Write to: an existing partition in an existing partitioned table + if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.APPEND) { + // Append to an existing partition + checkState(!immutablePartitions); + updateMode = UpdateMode.APPEND; + // Check the column types in partition schema match the column types in table schema + List tableColumns = table.getDataColumns(); + List existingPartitionColumns = partition.get().getColumns(); + for (int i = 0; i < min(existingPartitionColumns.size(), tableColumns.size()); i++) { + HiveType tableType = tableColumns.get(i).getType(); + HiveType partitionType = existingPartitionColumns.get(i).getType(); + if (!tableType.equals(partitionType)) { + throw new PrestoException(HIVE_PARTITION_SCHEMA_MISMATCH, format("" + + "You are trying to write into an existing partition in a table. " + + "The table schema has changed since the creation of the partition. " + + "Inserting rows into such partition is not supported. " + + "The column '%s' in table '%s' is declared as type '%s', " + + "but partition '%s' declared column '%s' as type '%s'.", + tableColumns.get(i).getName(), + tableName, + tableType, + partitionName, + existingPartitionColumns.get(i).getName(), + partitionType)); + } + } + + HiveWriteUtils.checkPartitionIsWritable(partitionName.get(), partition.get()); + + outputStorageFormat = partition.get().getStorage().getStorageFormat(); + schema = getHiveSchema(partition.get(), table); + + writeInfo = locationService.getPartitionWriteInfo(locationHandle, partition, partitionName.get()); + } + else if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.OVERWRITE) { + // Overwrite an existing partition + // + // The behavior of overwrite considered as if first dropping the partition and inserting a new partition, thus: + // * No partition writable check is required. + // * Table schema and storage format is used for the new partition (instead of existing partition schema and storage format). + updateMode = UpdateMode.OVERWRITE; + + outputStorageFormat = fromHiveStorageFormat(partitionStorageFormat); + schema = getHiveSchema(table); + + writeInfo = locationService.getPartitionWriteInfo(locationHandle, Optional.empty(), partitionName.get()); + checkWriteMode(writeInfo); + } + else if (insertExistingPartitionsBehavior == InsertExistingPartitionsBehavior.ERROR) { + throw new PrestoException(HIVE_PARTITION_READ_ONLY, "Cannot insert into an existing partition of Hive table: " + partitionName.get()); + } + else { + throw new IllegalArgumentException(format("Unsupported insert existing partitions behavior: %s", insertExistingPartitionsBehavior)); + } + } + + schema.putAll(additionalTableParameters); + if (acidWriteType != HiveACIDWriteType.DELETE) { + validateSchema(partitionName, schema); + } + + Path path; + Optional acidOptions; + String fileNameWithExtension; + + if (isTxnTable) { + WriteIdInfo writeIdInfo = locationHandle.getJsonSerializablewriteIdInfo().get(); + AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf) + .minimumWriteId(writeIdInfo.getMinWriteId()) + .maximumWriteId(writeIdInfo.getMaxWriteId()) + .statementId(writeIdInfo.getStatementId()) + .bucket(bucketNumber.isPresent() ? bucketNumber.getAsInt() : 0); + if (acidWriteType == HiveACIDWriteType.DELETE) { + //to support delete as insert + options.writingDeleteDelta(true); + } + else if (acidWriteType == HiveACIDWriteType.INSERT_OVERWRITE) { + //In case of ACID txn tables, dont delete old data. Just create new base in same partition. 
+ options.writingBase(true); + } + if (vacuumOptions.isPresent() && HiveACIDWriteType.isVacuum(acidWriteType)) { + Options vOptions = vacuumOptions.get(); + //Use the original bucket file number itself. + //Compacted delta directories will not have statementId + options.maximumWriteId(vOptions.getMaximumWriteId()) + .minimumWriteId(vOptions.getMinimumWriteId()) + .writingBase(vOptions.isWritingBase()) + .writingDeleteDelta(vOptions.isWritingDeleteDelta()) + .bucket(vOptions.getBucketId()) + .statementId(-1); + } + if (AcidUtils.isInsertOnlyTable(schema)) { + String subdir; + if (options.isWritingBase()) { + subdir = AcidUtils.baseDir(options.getMaximumWriteId()); + } + else if (HiveACIDWriteType.isVacuum(acidWriteType)) { + //Only for Minor compacted delta will not have statement Id. + subdir = AcidUtils.deltaSubdir(options.getMinimumWriteId(), options.getMaximumWriteId()); + } + else { + subdir = AcidUtils.deltaSubdir(options.getMinimumWriteId(), options.getMaximumWriteId(), options.getStatementId()); + } + Path parentDir = new Path(writeInfo.getWritePath(), subdir); + fileName = String.format("%06d", options.getBucketId()) + "_0" + getFileExtension(conf, outputStorageFormat); + path = new Path(parentDir, fileName); + Properties properties = new Properties(); + properties.setProperty("transactional_properties", "insert_only"); + options.tableProperties(properties); + } + else { + path = AcidUtils.createFilename(writeInfo.getWritePath(), options); + } + //In case of ACID entire delta directory should be renamed from staging directory. + fileNameWithExtension = path.getParent().getName(); + acidOptions = Optional.of(options); + } + else { + fileNameWithExtension = fileName + getFileExtension(conf, outputStorageFormat); + path = new Path(writeInfo.getWritePath(), fileNameWithExtension); + acidOptions = Optional.empty(); + } + + FileSystem fileSystem; + try { + fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, conf); + } + catch (IOException e) { + throw new PrestoException(HIVE_WRITER_OPEN_ERROR, e); + } + + if (isSnapshotEnabled) { + // Snapshot: use a recognizable name pattern, in case they need to be deleted/renamed + String oldFileName = path.getName(); + String newFileName = toSnapshotFileName(oldFileName, queryId); + path = new Path(path.getParent(), newFileName); + if (fileNameWithExtension.equals(oldFileName)) { + fileNameWithExtension = newFileName; + } + } + HiveFileWriter hiveFileWriter = null; + if (isSnapshotEnabled && !forMerge) { + // Add a suffix to file name for sub files + String oldFileName = path.getName(); + String newFileName = toSnapshotSubFile(oldFileName); + path = new Path(path.getParent(), newFileName); + if (fileNameWithExtension.equals(oldFileName)) { + fileNameWithExtension = newFileName; + } + // Always create a simple ORC writer for snapshot files. These will be merged in the end. 
+ logContainingFolderInfo(fileSystem, path, "Creating SnapshotTempFileWriter for %s", path); + try { + Path finalPath = path; + hiveFileWriter = new SnapshotTempFileWriter( + orcFileWriterFactory.createOrcDataSink(session, fileSystem, path), + dataColumns.stream() + .map(column -> column.getHiveType().getType(typeManager)) + .collect(Collectors.toList())); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e); + } + } + else { + conf.set("table.write.path", writeInfo.getWritePath().toString()); + for (HiveFileWriterFactory fileWriterFactory : fileWriterFactories) { + Optional fileWriter = fileWriterFactory.createFileWriter( + path, + dataColumns.stream() + .map(DataColumn::getName) + .collect(toList()), + outputStorageFormat, + schema, + conf, + session, + acidOptions, + Optional.of(acidWriteType)); + if (fileWriter.isPresent()) { + hiveFileWriter = fileWriter.get(); + break; + } + } + + if (isSnapshotEnabled) { + // TODO-cp-I2BZ0A: assuming all files to be of ORC type + checkState(hiveFileWriter instanceof OrcFileWriter, "Only support ORC format with snapshot"); + logContainingFolderInfo(fileSystem, path, "Creating file writer for final result: %s", path); + } + + if (hiveFileWriter == null) { + hiveFileWriter = new RecordFileWriter( + path, + dataColumns.stream() + .map(DataColumn::getName) + .collect(toList()), + outputStorageFormat, + schema, + partitionStorageFormat.getEstimatedWriterSystemMemoryUsage(), + conf, + typeManager, + parquetTimeZone, + session); + } + if (isTxnTable) { + hiveFileWriter.initWriter(true, path, fileSystem); + } + } + + Path finalPath = path; + String writerImplementation = hiveFileWriter.getClass().getName(); + + Consumer onCommit; + if (isSnapshotEnabled && !forMerge) { + // Only send "commit" event for the merged file + onCommit = hiveWriter -> {}; + } + else { + onCommit = hiveWriter -> { + Optional size; + try { + size = Optional.of(hdfsEnvironment.getFileSystem(session.getUser(), finalPath, conf).getFileStatus(finalPath).getLen()); + } + catch (IOException | RuntimeException e) { + // Do not fail the query if file system is not available + size = Optional.empty(); + } + + eventClient.post(new WriteCompletedEvent( + session.getQueryId(), + finalPath.toString(), + schemaName, + tableName, + partitionName.orElse(null), + outputStorageFormat.getOutputFormat(), + writerImplementation, + nodeManager.getCurrentNode().getVersion(), + nodeManager.getCurrentNode().getHost(), + session.getIdentity().getPrincipal().map(Principal::getName).orElse(null), + nodeManager.getEnvironment(), + sessionProperties, + size.orElse(null), + hiveWriter.getRowCount())); + }; + } + + if (!sortedBy.isEmpty() || (isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType))) { + List types = dataColumns.stream() + .map(column -> column.getHiveType().getType(typeManager)) + .collect(Collectors.toList()); + + Map columnIndexes = new HashMap<>(); + for (int i = 0; i < dataColumns.size(); i++) { + columnIndexes.put(dataColumns.get(i).getName(), i); + } + if (sortedBy.isEmpty() && + isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType)) { + //Add $rowId column as the last column in the page + types.add(HiveColumnHandle.updateRowIdHandle().getHiveType().getType(typeManager)); + columnIndexes.put(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME, dataColumns.size()); + } + + List sortFields = new ArrayList<>(); + List sortOrders = new ArrayList<>(); + List sortigColumns = this.sortedBy; + if (sortedBy.isEmpty() 
&& + isTxnTable() && HiveACIDWriteType.isUpdateOrDelete(acidWriteType)) { + sortigColumns = ImmutableList.of(new SortingColumn(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME, SortingColumn.Order.ASCENDING)); + } + for (SortingColumn column : sortigColumns) { + Integer index = columnIndexes.get(column.getColumnName()); + if (index == null) { + throw new PrestoException(HIVE_INVALID_METADATA, format("Sorting column '%s' does not exist in table '%s.%s'", column.getColumnName(), schemaName, tableName)); + } + sortFields.add(index); + sortOrders.add(column.getOrder().getSortOrder()); + } + + FileSystem sortFileSystem = fileSystem; + String child = ".tmp-sort." + path.getName(); + Path tempFilePrefix = new Path(path.getParent(), child); + + hiveFileWriter = new SortingFileWriter( + sortFileSystem, + tempFilePrefix, + hiveFileWriter, + sortBufferSize, + maxOpenSortFiles, + types, + sortFields, + sortOrders, + pageSorter, + (fs, p) -> orcFileWriterFactory.createOrcDataSink(session, fs, p)); + } + + return new HiveWriter( + hiveFileWriter, + partitionName, + updateMode, + fileNameWithExtension, + writeInfo.getWritePath().toString(), + writeInfo.getTargetPath().toString(), + path.toString(), + onCommit, + // Snapshot: only update stats when merging files + isSnapshotEnabled && !forMerge ? null : hiveWriterStats, + hiveFileWriter.getExtraPartitionFiles()); + } + + public boolean isTxnTable() + { + Map tableParameters = table != null ? this.table.getParameters() : additionalTableParameters; + return tableParameters != null && AcidUtils.isTransactionalTable(tableParameters); + } + + private void validateSchema(Optional partitionName, Properties schema) + { + // existing tables may have columns in a different order + List fileColumnNames = getColumnNames(schema); + List fileColumnHiveTypes = getColumnTypes(schema); + + // verify we can write all input columns to the file + Map inputColumnMap = dataColumns.stream() + .collect(toMap(DataColumn::getName, identity())); + Set missingColumns = Sets.difference(inputColumnMap.keySet(), new HashSet<>(fileColumnNames)); + if (!missingColumns.isEmpty()) { + throw new PrestoException(NOT_FOUND, format("Table %s.%s does not have columns %s", schema, tableName, missingColumns)); + } + if (fileColumnNames.size() != fileColumnHiveTypes.size()) { + throw new PrestoException(HIVE_INVALID_METADATA, format( + "Partition '%s' in table '%s.%s' has mismatched metadata for column names and types", + partitionName, + schemaName, + tableName)); + } + + // verify the file types match the input type + // todo adapt input types to the file types as Hive does + for (int fileIndex = 0; fileIndex < fileColumnNames.size(); fileIndex++) { + String columnName = fileColumnNames.get(fileIndex); + HiveType fileColumnHiveType = fileColumnHiveTypes.get(fileIndex); + HiveType inputHiveType = inputColumnMap.get(columnName).getHiveType(); + + if (!fileColumnHiveType.equals(inputHiveType)) { + // todo this should be moved to a helper + throw new PrestoException(HIVE_PARTITION_SCHEMA_MISMATCH, format( + "" + + "There is a mismatch between the table and partition schemas. 
" + + "The column '%s' in table '%s.%s' is declared as type '%s', " + + "but partition '%s' declared column '%s' as type '%s'.", + columnName, + schemaName, + tableName, + inputHiveType, + partitionName, + columnName, + fileColumnHiveType)); + } + } + } + + public static String computeBucketedFileName(String queryId, int bucket) + { + String paddedBucket = Strings.padStart(Integer.toString(bucket), BUCKET_NUMBER_PADDING, '0'); + return format("0%s_0_%s", paddedBucket, queryId); + } + + protected void checkWriteMode(WriteInfo writeInfo) + { + checkState(writeInfo.getWriteMode() != DIRECT_TO_TARGET_EXISTING_DIRECTORY, "Overwriting existing partition doesn't support DIRECT_TO_TARGET_EXISTING_DIRECTORY write mode"); + } + + public static String getFileExtension(JobConf conf, StorageFormat storageFormat) + { + // text format files must have the correct extension when compressed + if (!HiveConf.getBoolVar(conf, COMPRESSRESULT) || !HiveIgnoreKeyTextOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) { + return ""; + } + + String compressionCodecClass = conf.get("mapred.output.compression.codec"); + if (compressionCodecClass == null) { + return new DefaultCodec().getDefaultExtension(); + } + + try { + Class codecClass = conf.getClassByName(compressionCodecClass).asSubclass(CompressionCodec.class); + return ReflectionUtil.newInstance(codecClass, conf).getDefaultExtension(); + } + catch (ClassNotFoundException e) { + throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Compression codec not found: " + compressionCodecClass, e); + } + catch (RuntimeException e) { + throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Failed to load compression codec: " + compressionCodecClass, e); + } + } + + public static class DataColumn + { + private final String name; + private final HiveType hiveType; + + public DataColumn(String name, HiveType hiveType) + { + this.name = requireNonNull(name, "name is null"); + this.hiveType = requireNonNull(hiveType, "hiveType is null"); + } + + public String getName() + { + return name; + } + + public HiveType getHiveType() + { + return hiveType; + } + } + + protected void setAdditionalSchemaProperties(Properties schema) + { + // No additional properties to set for Hive tables + } + + static String toSnapshotFileName(String fileName, String queryId) + { + return fileName + "_snapshot_" + queryId; + } + + static boolean isSnapshotFile(String fileName, String queryId) + { + String identifier = "_snapshot_" + queryId; + return fileName.contains(identifier); + } + + static boolean isSnapshotSubFile(String fileName, String queryId) + { + return getSnapshotSubFileIndex(fileName, queryId) >= 0; + } + + static long getSnapshotSubFileIndex(String fileName, String queryId) + { + String identifier = "_snapshot_" + queryId; + int index = fileName.indexOf(identifier); + if (index < 0) { + // Not a snapshot file + return index; + } + index += identifier.length(); + if (index == fileName.length()) { + // Doesn't have a suffix + return -1; + } + String suffix = fileName.substring(fileName.indexOf('_', index) + 1); // Skip over '.' and '_' + return Long.valueOf(suffix); + } + + static String removeSnapshotFileName(String fileName, String queryId) + { + String identifier = "_snapshot_" + queryId; + int index = fileName.indexOf(identifier); + return index > 0 ? 
fileName.substring(0, index) : fileName; + } + + private String toSnapshotSubFile(String path) + { + return toSnapshotSubFile(path, resumeCount, snapshotSuffixes.size()); + } + + private String toSnapshotSubFile(String path, long resumeCount, int index) + { + return path + '.' + resumeCount + '_' + index; + } + + public void mergeSubFiles(List writers) + throws IOException + { + if (writers.isEmpty()) { + return; + } + + FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), new Path(writers.get(0).getFilePath()), conf); + + List types = dataColumns.stream() + .map(column -> column.getHiveType().getType(typeManager)) + .collect(toList()); + + for (HiveWriter writer : writers) { + String filePath = writer.getFilePath(); + + Path path = new Path(filePath); + logContainingFolderInfo(fileSystem, path, "Merging snapshot files to result file: %s", path); + + // The snapshotSuffixes list records the "resumeCount" for each suffix. + // It doesn't has an entry for the current set of files, so an entry is added first. + // The resumeCount helps distinguish files created by different runs. + snapshotSuffixes.add(resumeCount); + for (int i = 0; i < snapshotSuffixes.size(); i++) { + long resume = snapshotSuffixes.get(i); + Path file = new Path(toSnapshotSubFile(filePath, resume, i)); + if (fileSystem.exists(file)) { + // TODO-cp-I2BZ0A: assuming all files to be of ORC type. + // Using same parameters as used by SortingFileWriter + FileStatus fileStatus = fileSystem.getFileStatus(file); + try (TempFileReader reader = new TempFileReader(types, new HdfsOrcDataSource( + new OrcDataSourceId(file.toString()), + fileStatus.getLen(), + new DataSize(1, MEGABYTE), + new DataSize(8, MEGABYTE), + new DataSize(8, MEGABYTE), + false, + fileSystem.open(file), + new FileFormatDataSourceStats(), + fileStatus.getModificationTime()))) { + while (reader.hasNext()) { + writer.append(reader.next()); + } + } + // DO NOT delete the sub file, in case we need to resume. Delete them when the query finishes. + } + } + } + } + + private void logContainingFolderInfo(FileSystem fileSystem, Path path, String message, Object... params) + { + try { + if (log.isDebugEnabled()) { + log.debug(message, params); + Arrays.stream(fileSystem.listStatus(path.getParent())).forEach(file -> { + log.debug("%d\t%s", file.getLen(), file.getPath()); + }); + } + } + catch (IOException e) { + log.debug(e, "Failed to list folder content for %s: %s", path, e.getMessage()); + } + } + + public Object capture() + { + // hiveWriterStats is not captured. They are not updated for sub-files. + // Increment suffix so that each resume generates a new set of files + snapshotSuffixes.add(resumeCount); + return snapshotSuffixes; + } + + public void restore(Object obj, long resumeCount) + { + snapshotSuffixes.clear(); + snapshotSuffixes.addAll((List) obj); + this.resumeCount = resumeCount; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterStats.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterStats.java new file mode 100644 index 00000000..a0b44642 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWriterStats.java @@ -0,0 +1,35 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.airlift.stats.DistributionStat; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +public class HiveWriterStats +{ + private final DistributionStat inputPageSizeInBytes = new DistributionStat(); + + @Managed + @Nested + public DistributionStat getInputPageSizeInBytes() + { + return inputPageSizeInBytes; + } + + public void addInputPageSizesInBytes(long bytes) + { + inputPageSizeInBytes.add(bytes); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWrittenPartitions.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWrittenPartitions.java new file mode 100644 index 00000000..159a18db --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/HiveWrittenPartitions.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.connector.ConnectorOutputMetadata; + +import java.util.List; + +import static java.util.Objects.requireNonNull; + +public class HiveWrittenPartitions + implements ConnectorOutputMetadata +{ + private final List partitionNames; + + @JsonCreator + public HiveWrittenPartitions(@JsonProperty("partitionNames") List partitionNames) + { + this.partitionNames = ImmutableList.copyOf(requireNonNull(partitionNames, "partitionNames is null")); + } + + @JsonProperty + public List getInfo() + { + return partitionNames; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/InternalHiveSplit.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/InternalHiveSplit.java new file mode 100644 index 00000000..07f554db --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/InternalHiveSplit.java @@ -0,0 +1,305 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveSplit.BucketConversion; +import io.prestosql.spi.HostAddress; +import org.openjdk.jol.info.ClassLayout; + +import javax.annotation.concurrent.NotThreadSafe; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static io.airlift.slice.SizeOf.sizeOfObjectArray; +import static java.util.Objects.requireNonNull; + +@NotThreadSafe +public class InternalHiveSplit +{ + // Overhead of ImmutableList and ImmutableMap is not accounted because of its complexity. + private static final int INSTANCE_SIZE = ClassLayout.parseClass(InternalHiveSplit.class).instanceSize() + + ClassLayout.parseClass(String.class).instanceSize() + + ClassLayout.parseClass(Properties.class).instanceSize() + + ClassLayout.parseClass(String.class).instanceSize() + + ClassLayout.parseClass(OptionalInt.class).instanceSize(); + private static final int INTEGER_INSTANCE_SIZE = ClassLayout.parseClass(Integer.class).instanceSize(); + + private final String path; + private final long end; + private final long fileSize; + private final long lastModifiedTime; + private final Properties schema; + private final List partitionKeys; + private final List blocks; + private final String partitionName; + private final OptionalInt bucketNumber; + private final boolean splittable; + private final boolean forceLocalScheduling; + private final Map columnCoercions; + private final Optional bucketConversion; + private final boolean s3SelectPushdownEnabled; + private final Optional deleteDeltaLocations; + private final Optional startRowOffsetOfFile; + private final Map customSplitInfo; + + private long start; + private int currentBlockIndex; + + public InternalHiveSplit( + String partitionName, + String path, + long start, + long end, + long fileSize, + long lastModifiedTime, + Properties schema, + List partitionKeys, + List blocks, + OptionalInt bucketNumber, + boolean splittable, + boolean forceLocalScheduling, + Map columnCoercions, + Optional bucketConversion, + boolean s3SelectPushdownEnabled, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Map customSplitInfo) + { + checkArgument(start >= 0, "start must be positive"); + checkArgument(end >= 0, "length must be positive"); + checkArgument(fileSize >= 0, "fileSize must be positive"); + checkArgument(lastModifiedTime >= 0, "lastModifiedTime must be positive"); + requireNonNull(partitionName, "partitionName is null"); + requireNonNull(path, "path is null"); + requireNonNull(schema, "schema is null"); + requireNonNull(partitionKeys, "partitionKeys is null"); + requireNonNull(blocks, "blocks is null"); + requireNonNull(bucketNumber, "bucketNumber is null"); + requireNonNull(columnCoercions, "columnCoercions is null"); + requireNonNull(bucketConversion, "bucketConversion is null"); + requireNonNull(deleteDeltaLocations, "deleteDeltaLocations is null"); + + this.partitionName = partitionName; + this.path = path; + this.start = start; + this.end = end; + this.fileSize = 
fileSize; + this.lastModifiedTime = lastModifiedTime; + this.schema = schema; + this.partitionKeys = ImmutableList.copyOf(partitionKeys); + this.blocks = ImmutableList.copyOf(blocks); + this.bucketNumber = bucketNumber; + this.splittable = splittable; + this.forceLocalScheduling = forceLocalScheduling; + this.columnCoercions = ImmutableMap.copyOf(columnCoercions); + this.bucketConversion = bucketConversion; + this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; + this.deleteDeltaLocations = deleteDeltaLocations; + this.startRowOffsetOfFile = startRowOffsetOfFile; + this.customSplitInfo = ImmutableMap.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null")); + } + + public String getPath() + { + return path; + } + + public long getStart() + { + return start; + } + + public long getEnd() + { + return end; + } + + public long getFileSize() + { + return fileSize; + } + + public long getLastModifiedTime() + { + return lastModifiedTime; + } + + public boolean isS3SelectPushdownEnabled() + { + return s3SelectPushdownEnabled; + } + + public Properties getSchema() + { + return schema; + } + + public List getPartitionKeys() + { + return partitionKeys; + } + + public String getPartitionName() + { + return partitionName; + } + + public OptionalInt getBucketNumber() + { + return bucketNumber; + } + + public boolean isSplittable() + { + return splittable; + } + + public boolean isForceLocalScheduling() + { + return forceLocalScheduling; + } + + public Map getColumnCoercions() + { + return columnCoercions; + } + + public Optional getBucketConversion() + { + return bucketConversion; + } + + public InternalHiveBlock currentBlock() + { + checkState(!isDone(), "All blocks have been consumed"); + return blocks.get(currentBlockIndex); + } + + public boolean isDone() + { + return currentBlockIndex == blocks.size(); + } + + public void increaseStart(long value) + { + start += value; + if (start == currentBlock().getEnd()) { + currentBlockIndex++; + if (isDone()) { + return; + } + verify(start == currentBlock().getStart()); + } + } + + public Map getCustomSplitInfo() + { + return customSplitInfo; + } + + public int getEstimatedSizeInBytes() + { + int result = INSTANCE_SIZE; + result += path.length() * Character.BYTES; + result += sizeOfObjectArray(partitionKeys.size()); + for (HivePartitionKey partitionKey : partitionKeys) { + result += partitionKey.getEstimatedSizeInBytes(); + } + result += sizeOfObjectArray(blocks.size()); + for (InternalHiveBlock block : blocks) { + result += block.getEstimatedSizeInBytes(); + } + result += partitionName.length() * Character.BYTES; + result += sizeOfObjectArray(columnCoercions.size()); + for (HiveTypeName hiveTypeName : columnCoercions.values()) { + result += INTEGER_INSTANCE_SIZE + hiveTypeName.getEstimatedSizeInBytes(); + } + return result; + } + + public Optional getDeleteDeltaLocations() + { + return deleteDeltaLocations; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("path", path) + .add("start", start) + .add("end", end) + .add("fileSize", fileSize) + .toString(); + } + + public Optional getStartRowOffsetOfFile() + { + return startRowOffsetOfFile; + } + + public static class InternalHiveBlock + { + private static final int INSTANCE_SIZE = ClassLayout.parseClass(InternalHiveBlock.class).instanceSize(); + private static final int HOST_ADDRESS_INSTANCE_SIZE = ClassLayout.parseClass(HostAddress.class).instanceSize() + + ClassLayout.parseClass(String.class).instanceSize(); + + private final long start; + private final long 
end; + private final List addresses; + + public InternalHiveBlock(long start, long end, List addresses) + { + checkArgument(start <= end, "block end cannot be before block start"); + this.start = start; + this.end = end; + this.addresses = ImmutableList.copyOf(addresses); + } + + public long getStart() + { + return start; + } + + public long getEnd() + { + return end; + } + + public List getAddresses() + { + return addresses; + } + + public int getEstimatedSizeInBytes() + { + int result = INSTANCE_SIZE; + result += sizeOfObjectArray(addresses.size()); + for (HostAddress address : addresses) { + result += HOST_ADDRESS_INSTANCE_SIZE + address.getHostText().length() * Character.BYTES; + } + return result; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/IonSqlQueryBuilder.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/IonSqlQueryBuilder.java new file mode 100644 index 00000000..5c1f2c17 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/IonSqlQueryBuilder.java @@ -0,0 +1,267 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import io.airlift.slice.Slice; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.Range; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Decimals; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.VarcharType; +import org.joda.time.format.DateTimeFormatter; + +import java.util.ArrayList; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.Iterables.getOnlyElement; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.DAYS; +import static java.util.stream.Collectors.joining; +import static org.joda.time.chrono.ISOChronology.getInstanceUTC; +import static org.joda.time.format.ISODateTimeFormat.date; + +/** + * S3 Select uses Ion SQL++ query language. This class is used to construct a valid Ion SQL++ query + * to be evaluated with S3 Select on an S3 object. 
+ */ +public class IonSqlQueryBuilder +{ + private static final DateTimeFormatter FORMATTER = date().withChronology(getInstanceUTC()); + private static final String DATA_SOURCE = "S3Object s"; + private final TypeManager typeManager; + + public IonSqlQueryBuilder(TypeManager typeManager) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + public String buildSql(List columns, TupleDomain tupleDomain) + { + StringBuilder sql = new StringBuilder("SELECT "); + + if (columns.isEmpty()) { + sql.append("' '"); + } + else { + String columnNames = columns.stream() + .map(column -> format("s._%d", column.getHiveColumnIndex() + 1)) + .collect(joining(", ")); + sql.append(columnNames); + } + + sql.append(" FROM "); + sql.append(DATA_SOURCE); + + List clauses = toConjuncts(columns, tupleDomain); + if (!clauses.isEmpty()) { + sql.append(" WHERE ") + .append(Joiner.on(" AND ").join(clauses)); + } + + return sql.toString(); + } + + private List toConjuncts(List columns, TupleDomain tupleDomain) + { + ImmutableList.Builder builder = ImmutableList.builder(); + for (HiveColumnHandle column : columns) { + Type type = column.getHiveType().getType(typeManager); + if (tupleDomain.getDomains().isPresent() && isSupported(type)) { + Domain domain = tupleDomain.getDomains().get().get(column); + if (domain != null) { + builder.add(toPredicate(domain, type, column.getHiveColumnIndex())); + } + } + } + return builder.build(); + } + + private static boolean isSupported(Type type) + { + Type validType = requireNonNull(type, "type is null"); + return validType.equals(BIGINT) || + validType.equals(TINYINT) || + validType.equals(SMALLINT) || + validType.equals(INTEGER) || + validType instanceof DecimalType || + validType.equals(BOOLEAN) || + validType.equals(DATE) || + validType instanceof VarcharType; + } + + private String toPredicate(Domain domain, Type type, int position) + { + checkArgument(domain.getType().isOrderable(), "Domain type must be orderable"); + + if (domain.getValues().isNone()) { + if (domain.isNullAllowed()) { + return format("s._%d", position + 1) + " = '' "; + } + return "FALSE"; + } + + if (domain.getValues().isAll()) { + if (domain.isNullAllowed()) { + return "TRUE"; + } + return format("s._%d", position + 1) + " <> '' "; + } + + List disjuncts = new ArrayList<>(); + List singleValues = new ArrayList<>(); + for (Range range : domain.getValues().getRanges().getOrderedRanges()) { + checkState(!range.isAll()); + if (range.isSingleValue()) { + singleValues.add(range.getLow().getValue()); + continue; + } + List rangeConjuncts = new ArrayList<>(); + if (!range.getLow().isLowerUnbounded()) { + switch (range.getLow().getBound()) { + case ABOVE: + rangeConjuncts.add(toPredicate(">", range.getLow().getValue(), type, position)); + break; + case EXACTLY: + rangeConjuncts.add(toPredicate(">=", range.getLow().getValue(), type, position)); + break; + case BELOW: + throw new IllegalArgumentException("Low marker should never use BELOW bound"); + default: + throw new AssertionError("Unhandled bound: " + range.getLow().getBound()); + } + } + if (!range.getHigh().isUpperUnbounded()) { + switch (range.getHigh().getBound()) { + case ABOVE: + throw new IllegalArgumentException("High marker should never use ABOVE bound"); + case EXACTLY: + rangeConjuncts.add(toPredicate("<=", range.getHigh().getValue(), type, position)); + break; + case BELOW: + rangeConjuncts.add(toPredicate("<", range.getHigh().getValue(), type, position)); + break; + default: + throw new AssertionError("Unhandled bound: " 
+ range.getHigh().getBound()); + } + } + // If rangeConjuncts is null, then the range was ALL, which should already have been checked for + checkState(!rangeConjuncts.isEmpty()); + disjuncts.add("(" + Joiner.on(" AND ").join(rangeConjuncts) + ")"); + } + + // Add back all of the possible single values either as an equality or an IN predicate + if (singleValues.size() == 1) { + disjuncts.add(toPredicate("=", getOnlyElement(singleValues), type, position)); + } + else if (singleValues.size() > 1) { + List values = new ArrayList<>(); + for (Object value : singleValues) { + checkType(type); + values.add(valueToQuery(type, value)); + } + disjuncts.add(createColumn(type, position) + " IN (" + Joiner.on(",").join(values) + ")"); + } + + // Add nullability disjuncts + checkState(!disjuncts.isEmpty()); + if (domain.isNullAllowed()) { + disjuncts.add(format("s._%d", position + 1) + " = '' "); + } + + return "(" + Joiner.on(" OR ").join(disjuncts) + ")"; + } + + private String toPredicate(String operator, Object value, Type type, int position) + { + checkType(type); + + return format("%s %s %s", createColumn(type, position), operator, valueToQuery(type, value)); + } + + private static void checkType(Type type) + { + checkArgument(isSupported(type), "Type not supported: %s", type); + } + + private static String valueToQuery(Type type, Object value) + { + if (type.equals(BIGINT)) { + return String.valueOf(((Number) value).longValue()); + } + if (type.equals(INTEGER)) { + return String.valueOf(((Number) value).intValue()); + } + if (type.equals(SMALLINT)) { + return String.valueOf(((Number) value).shortValue()); + } + if (type.equals(TINYINT)) { + return String.valueOf(((Number) value).byteValue()); + } + if (type.equals(BOOLEAN)) { + return String.valueOf(value); + } + if (type.equals(DATE)) { + return "`" + FORMATTER.print(DAYS.toMillis((long) value)) + "`"; + } + if (type.equals(VarcharType.VARCHAR)) { + return "'" + ((Slice) value).toStringUtf8() + "'"; + } + if (type instanceof DecimalType) { + if (Decimals.isLongDecimal(type)) { + return Decimals.toString((Slice) value, ((DecimalType) type).getScale()); + } + return Decimals.toString((long) value, ((DecimalType) type).getScale()); + } + return "'" + ((Slice) value).toStringUtf8() + "'"; + } + + private String createColumn(Type type, int position) + { + String column = format("s._%d", position + 1); + + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + return formatPredicate(column, "INT"); + } + if (type.equals(BOOLEAN)) { + return formatPredicate(column, "BOOL"); + } + if (type.equals(DATE)) { + return formatPredicate(column, "TIMESTAMP"); + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + return formatPredicate(column, format("DECIMAL(%s,%s)", decimalType.getPrecision(), decimalType.getScale())); + } + return column; + } + + private String formatPredicate(String column, String type) + { + return format("case %s when '' then null else CAST(%s AS %s) end", column, column, type); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationHandle.java new file mode 100644 index 00000000..53364fab --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationHandle.java @@ -0,0 +1,159 @@ +/* + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import org.apache.hadoop.fs.Path; + +import java.util.Optional; + +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class LocationHandle +{ + private final Path targetPath; + private final Path writePath; + private final boolean isExistingTable; + private final WriteMode writeMode; + private final Optional writeIdInfo; + + public LocationHandle( + Path targetPath, + Path writePath, + boolean isExistingTable, + WriteMode writeMode, + Optional writeIdInfo) + { + if (writeMode.isWritePathSameAsTargetPath() && !targetPath.equals(writePath)) { + throw new IllegalArgumentException(format("targetPath is expected to be same as writePath for writeMode %s", writeMode)); + } + this.targetPath = requireNonNull(targetPath, "targetPath is null"); + this.writePath = requireNonNull(writePath, "writePath is null"); + this.isExistingTable = isExistingTable; + this.writeMode = requireNonNull(writeMode, "writeMode is null"); + this.writeIdInfo = writeIdInfo; + } + + @JsonCreator + public LocationHandle( + @JsonProperty("targetPath") String targetPath, + @JsonProperty("writePath") String writePath, + @JsonProperty("isExistingTable") boolean isExistingTable, + @JsonProperty("writeMode") WriteMode writeMode, + @JsonProperty("writeIdInfo") Optional writeIdInfo) + { + this( + new Path(requireNonNull(targetPath, "targetPath is null")), + new Path(requireNonNull(writePath, "writePath is null")), + isExistingTable, + writeMode, + writeIdInfo); + } + + // This method should only be called by LocationService + public Path getTargetPath() + { + return targetPath; + } + + // This method should only be called by LocationService + public Path getWritePath() + { + return writePath; + } + + // This method should only be called by LocationService + public WriteMode getWriteMode() + { + return writeMode; + } + + // This method should only be called by LocationService + boolean isExistingTable() + { + return isExistingTable; + } + + @JsonProperty("targetPath") + public String getJsonSerializableTargetPath() + { + return targetPath.toString(); + } + + @JsonProperty("writePath") + public String getJsonSerializableWritePath() + { + return writePath.toString(); + } + + @JsonProperty("isExistingTable") + public boolean getJsonSerializableIsExistingTable() + { + return isExistingTable; + } + + @JsonProperty("writeMode") + public WriteMode getJsonSerializableWriteMode() + { + return writeMode; + } + + @JsonProperty("writeIdInfo") + public Optional getJsonSerializablewriteIdInfo() + { + return writeIdInfo; + } + + public enum WriteMode + { + /** + * common mode for new table or existing table (both new and existing partition) and when staging directory is enabled + */ + STAGE_AND_MOVE_TO_TARGET_DIRECTORY(false), + /** + * for new table in S3 or when staging directory is disabled + */ + DIRECT_TO_TARGET_NEW_DIRECTORY(true), + /** + * for 
existing table in S3 (both new and existing partition) or when staging directory is disabled + */ + DIRECT_TO_TARGET_EXISTING_DIRECTORY(true), + /**/; + + // NOTE: Insert overwrite simulation (partition drops and partition additions in the same + // transaction get merged and become one or more partition alterations, and get submitted to + // metastore in close succession of each other) is not supported for S3. S3 uses the last + // mode for insert into existing table. This is hard to support because the directory + // containing the old data cannot be deleted until commit. Nor can the old data be moved + // (assuming Hive HDFS directory naming convention shall not be violated). As a result, + // subsequent insertion will have to write to directory belonging to existing partition. + // This undermines the benefit of having insert overwrite simulation. This also makes + // dropping of old partition at commit time hard because data added after the logical + // "drop" time was added to the directories to be dropped. + + private final boolean writePathSameAsTargetPath; + + WriteMode(boolean writePathSameAsTargetPath) + { + this.writePathSameAsTargetPath = writePathSameAsTargetPath; + } + + public boolean isWritePathSameAsTargetPath() + { + return writePathSameAsTargetPath; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationService.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationService.java new file mode 100644 index 00000000..a16f3cbb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/LocationService.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.ConnectorSession; +import org.apache.hadoop.fs.Path; + +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public interface LocationService +{ + LocationHandle forNewTable(SemiTransactionalHiveMetastore metastore, ConnectorSession session, String schemaName, String tableName, Optional writeIdInfo, Optional tablePath, HiveWriteUtils.OpertionType opertionType); + + LocationHandle forExistingTable(SemiTransactionalHiveMetastore metastore, ConnectorSession session, Table table, Optional writeIdInfo, HiveWriteUtils.OpertionType opertionType); + + /** + * targetPath and writePath will be root directory of all partition and table paths + * that may be returned by {@link #getTableWriteInfo(LocationHandle, boolean)} and {@link #getPartitionWriteInfo(LocationHandle, Optional, String)} method. 
+ */ + WriteInfo getQueryWriteInfo(LocationHandle locationHandle); + + WriteInfo getTableWriteInfo(LocationHandle locationHandle, boolean overwrite); + + /** + * If {@code partition} is present, returns {@code WriteInfo} for appending existing partition; + * otherwise, returns {@code WriteInfo} for writing new partition or overwriting existing partition. + */ + WriteInfo getPartitionWriteInfo(LocationHandle locationHandle, Optional partition, String partitionName); + + class WriteInfo + { + private final Path targetPath; + private final Path writePath; + private final LocationHandle.WriteMode writeMode; + + public WriteInfo(Path targetPath, Path writePath, LocationHandle.WriteMode writeMode) + { + this.targetPath = requireNonNull(targetPath, "targetPath is null"); + this.writePath = requireNonNull(writePath, "writePath is null"); + this.writeMode = requireNonNull(writeMode, "writeMode is null"); + } + + /** + * Target path for the partition, unpartitioned table, or the query. + */ + public Path getTargetPath() + { + return targetPath; + } + + /** + * Temporary path for writing to the partition, unpartitioned table or the query. + *
+ * It may be the same as {@code targetPath}. + */ + public Path getWritePath() + { + return writePath; + } + + public LocationHandle.WriteMode getWriteMode() + { + return writeMode; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NamenodeStats.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NamenodeStats.java new file mode 100644 index 00000000..b366e381 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NamenodeStats.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.airlift.stats.CounterStat; +import io.airlift.stats.TimeStat; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +public class NamenodeStats +{ + private final CallStats listLocatedStatus = new CallStats(); + private final CallStats remoteIteratorNext = new CallStats(); + + @Managed + @Nested + public CallStats getListLocatedStatus() + { + return listLocatedStatus; + } + + @Managed + @Nested + public CallStats getRemoteIteratorNext() + { + return remoteIteratorNext; + } + + public static class CallStats + { + private final TimeStat time = new TimeStat(TimeUnit.MILLISECONDS); + private final CounterStat totalFailures = new CounterStat(); + private final CounterStat ioExceptions = new CounterStat(); + + public TimeStat.BlockTimer time() + { + return time.time(); + } + + public void recordException(Exception exception) + { + if (exception instanceof IOException) { + ioExceptions.update(1); + } + totalFailures.update(1); + } + + @Managed + @Nested + public CounterStat getTotalFailures() + { + return totalFailures; + } + + @Managed + @Nested + public CounterStat getIoExceptions() + { + return ioExceptions; + } + + @Managed + @Nested + public TimeStat getTime() + { + return time; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NodeVersion.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NodeVersion.java new file mode 100644 index 00000000..9f80e811 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/NodeVersion.java @@ -0,0 +1,32 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import static java.util.Objects.requireNonNull; + +public final class NodeVersion +{ + private final String version; + + public NodeVersion(String version) + { + this.version = requireNonNull(version, "version is null"); + } + + @Override + public String toString() + { + return version; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriter.java new file mode 100644 index 00000000..fcbf4465 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriter.java @@ -0,0 +1,464 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.orc.OrcDataSink; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode; +import io.prestosql.orc.OrcWriter; +import io.prestosql.orc.OrcWriterOptions; +import io.prestosql.orc.OrcWriterStats; +import io.prestosql.orc.metadata.CompressionKind; +import io.prestosql.plugin.hive.orc.OrcAcidRowId; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.IntArrayBlockBuilder; +import io.prestosql.spi.block.LongArrayBlockBuilder; +import io.prestosql.spi.block.RowBlock; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.fs.FileAlreadyExistsException; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.BucketCodec; +import org.apache.hadoop.ipc.RemoteException; +import org.apache.orc.impl.AcidStats; +import org.apache.orc.impl.OrcAcidUtils; +import org.openjdk.jol.info.ClassLayout; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadMXBean; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.concurrent.Callable; +import java.util.function.Supplier; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_DATA_ERROR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED; +import static java.util.Objects.requireNonNull; + +public class OrcFileWriter + implements HiveFileWriter +{ + private static final int INSTANCE_SIZE = 
ClassLayout.parseClass(OrcFileWriter.class).instanceSize(); + private static final ThreadMXBean THREAD_MX_BEAN = ManagementFactory.getThreadMXBean(); + private static final String ACID_KEY_INDEX_NAME = "hive.acid.key.index"; + + private final OrcWriter orcWriter; + private final Callable rollbackAction; + private final int[] fileInputColumnIndexes; + private final List nullBlocks; + private final List dataNullBlocks; + private final Optional> validationInputFactory; + + private final Optional acidOptions; + private StringBuilder indexKey = new StringBuilder(); + private OrcAcidRowId lastKey; + private long writeId; + private long indexWriteId; + private int encodedBucketId; + private int indexEncodedBucketId; + private long rowId; + private long indexRowId; + private Optional acidWriteType; + private AcidStats acidStats = new AcidStats(); + private Path path; + + private long validationCpuNanos; + private Optional deleteDeltaFileWriter; + + public OrcFileWriter( + OrcDataSink orcDataSink, + Callable rollbackAction, + List columnNames, + List fileColumnTypes, + List dataFileColumnTypes, + CompressionKind compression, + OrcWriterOptions options, + boolean writeLegacyVersion, + int[] fileInputColumnIndexes, + Map metadata, + Optional> validationInputFactory, + OrcWriteValidationMode validationMode, + OrcWriterStats stats, + Optional acidOptions, + Optional acidWriteType, + Optional deleteDeltaFileWriter, + Path path) + { + requireNonNull(orcDataSink, "orcDataSink is null"); + + this.path = path; + orcWriter = new OrcWriter( + orcDataSink, + columnNames, + fileColumnTypes, + compression, + options, + writeLegacyVersion, + metadata, + validationInputFactory.isPresent(), + validationMode, + stats, + Optional.of(flushStripeCallback()), + Optional.of(closeCallback())); + this.deleteDeltaFileWriter = deleteDeltaFileWriter; + this.rollbackAction = requireNonNull(rollbackAction, "rollbackAction is null"); + + this.fileInputColumnIndexes = requireNonNull(fileInputColumnIndexes, "outputColumnInputIndexes is null"); + + ImmutableList.Builder nullBlocks = ImmutableList.builder(); + for (Type fileColumnType : fileColumnTypes) { + BlockBuilder blockBuilder = fileColumnType.createBlockBuilder(null, 1, 0); + blockBuilder.appendNull(); + nullBlocks.add(blockBuilder.build()); + } + this.nullBlocks = nullBlocks.build(); + ImmutableList.Builder dataNullBlocks = ImmutableList.builder(); + for (Type fileColumnType : dataFileColumnTypes) { + BlockBuilder blockBuilder = fileColumnType.createBlockBuilder(null, 1, 0); + blockBuilder.appendNull(); + dataNullBlocks.add(blockBuilder.build()); + } + this.dataNullBlocks = dataNullBlocks.build(); + this.validationInputFactory = validationInputFactory; + this.acidOptions = acidOptions; + this.lastKey = new OrcAcidRowId(-1, -1, -1); + this.rowId = -1; + if (acidOptions.isPresent()) { + writeId = acidOptions.get().getMaximumWriteId(); + encodedBucketId = BucketCodec.V1.encode(acidOptions.get()); + } + this.acidWriteType = acidWriteType; + } + + @Override + public void initWriter(boolean isAcid, Path path, FileSystem fileSystem) + { + if (isAcid && isFullAcid()) { + if (deleteDeltaFileWriter.isPresent()) { + AcidOutputFormat.Options deleteOptions = acidOptions.get().clone().writingDeleteDelta(true); + Path deletePath = AcidUtils.createFilename(path.getParent().getParent(), deleteOptions); + deleteDeltaFileWriter.get().initWriter(isAcid, deletePath, fileSystem); + } + try { + AcidUtils.OrcAcidVersion.writeVersionFile(path.getParent(), fileSystem); + } + catch (IOException e) { + 
if (e instanceof AlreadyBeingCreatedException + || (e instanceof RemoteException && ((RemoteException) e).unwrapRemoteException(AlreadyBeingCreatedException.class) != e) + || (e instanceof FileAlreadyExistsException)) { + //Ignore the exception as same file is being created by another task in parallel. + return; + } + throw new PrestoException(HIVE_WRITER_DATA_ERROR, e); + } + } + } + + private boolean isFullAcid() + { + if (!acidOptions.isPresent()) { + return false; + } + Properties tableProperties = acidOptions.get().getTableProperties(); + return tableProperties == null || !AcidUtils.isInsertOnlyTable(tableProperties); + } + + Callable flushStripeCallback() + { + return () -> { + if (!isFullAcid()) { + return null; + } + OrcAcidRowId currentKey = new OrcAcidRowId(indexWriteId, indexEncodedBucketId, indexRowId); + if (lastKey.compareTo(currentKey) < 0) { + indexKey.append(indexWriteId); + indexKey.append(","); + indexKey.append(indexEncodedBucketId); + indexKey.append(","); + indexKey.append(indexRowId); + indexKey.append(";"); + lastKey = currentKey; + } + return null; + }; + } + + Callable closeCallback() + { + return () -> { + if (!isFullAcid()) { + return null; + } + OrcAcidRowId currentKey = new OrcAcidRowId(indexWriteId, indexEncodedBucketId, indexRowId); + if (lastKey.compareTo(currentKey) < 0) { + flushStripeCallback().call(); + } + orcWriter.addUserMetadata(ACID_KEY_INDEX_NAME, indexKey.toString()); + orcWriter.addUserMetadata(OrcAcidUtils.ACID_STATS, acidStats.serialize()); + return null; + }; + } + + @Override + public long getWrittenBytes() + { + return orcWriter.getWrittenBytes() + orcWriter.getBufferedBytes(); + } + + @Override + public long getSystemMemoryUsage() + { + return INSTANCE_SIZE + orcWriter.getRetainedBytes(); + } + + @Override + public void appendRows(Page dataPage) + { + if (deleteDeltaFileWriter.isPresent()) { + //Forward to delete writer + deleteDeltaFileWriter.get().appendRows(dataPage); + } + Block[] dataBlocks = new Block[fileInputColumnIndexes.length]; + for (int i = 0; i < fileInputColumnIndexes.length; i++) { + int inputColumnIndex = fileInputColumnIndexes[i]; + if (inputColumnIndex < 0) { + dataBlocks[i] = new RunLengthEncodedBlock(dataNullBlocks.get(i), dataPage.getPositionCount()); + } + else { + dataBlocks[i] = dataPage.getBlock(inputColumnIndex); + } + } + Block[] blocks = null; + int i = 0; + int totalColumns; + if (isFullAcid()) { + Block rowIdBlock = null; + if (HiveACIDWriteType.isRowIdNeeded(acidWriteType.get())) { + Block block = dataPage.getBlock(dataPage.getChannelCount() - 1); + rowIdBlock = block.getLoadedBlock(); + } + totalColumns = 6; + blocks = new Block[totalColumns]; + //operation + blocks[i++] = insertOperationId(dataPage, rowIdBlock, acidWriteType.get().getOperationId()); + //originalTransactionId + blocks[i++] = insertOriginalTransaction(dataPage, rowIdBlock, writeId); + //bucketId + //Bucket Id is encoded to include some extra information from options. + blocks[i++] = insertBucketIdBlock(dataPage, rowIdBlock, encodedBucketId); + //rowId + //rowId is incremental within a delta file./ + blocks[i++] = insertRowIdBlock(dataPage, rowIdBlock); + //currentTransactionId + blocks[i++] = insertCurrentTransaction(dataPage, rowIdBlock, writeId); + boolean isDelete = acidWriteType.get() == HiveACIDWriteType.DELETE || + (acidWriteType.get() == HiveACIDWriteType.VACUUM && + acidOptions.map(o -> o.isWritingDeleteDelta()).orElse(false)); + blocks[i] = !isDelete ? 
RowBlock.fromFieldBlocks(dataPage.getPositionCount(), Optional.empty(), dataBlocks) + : new RunLengthEncodedBlock(nullBlocks.get(nullBlocks.size() - 1), dataPage.getPositionCount()); + // statistics required to read from hive-cli for historical reasons. + if (isDelete) { + acidStats.deletes += dataPage.getPositionCount(); + } + else { + acidStats.inserts += dataPage.getPositionCount(); + } + } + else { + blocks = dataBlocks; + } + + Page page = new Page(dataPage.getPositionCount(), blocks); + try { + orcWriter.write(page); + } + catch (IOException | UncheckedIOException e) { + throw new PrestoException(HIVE_WRITER_DATA_ERROR, e); + } + } + + private Block insertOperationId(Page dataPage, Block rowIdBlock, int value) + { + BlockBuilder builder = new IntArrayBlockBuilder(null, dataPage.getPositionCount()); + boolean keepOriginal = acidWriteType.map(HiveACIDWriteType::isVacuum).orElse(false); + int valueToWrite = value; + for (int j = 0; j < dataPage.getPositionCount(); j++) { + if (rowIdBlock != null && keepOriginal) { + RowBlock rowBlock = (RowBlock) rowIdBlock.getSingleValueBlock(j); + valueToWrite = rowBlock.getRawFieldBlocks()[4].getInt(0, 0); + } + builder.writeInt(valueToWrite); + } + return builder.build(); + } + + private Block insertOriginalTransaction(Page dataPage, Block rowIdBlock, long value) + { + BlockBuilder builder = new LongArrayBlockBuilder(null, dataPage.getPositionCount()); + boolean keepOriginal = acidWriteType.map(t -> (t == HiveACIDWriteType.DELETE || t == HiveACIDWriteType.VACUUM)).orElse(false); + long valueToWrite = value; + for (int j = 0; j < dataPage.getPositionCount(); j++) { + if (rowIdBlock != null && keepOriginal) { + RowBlock rowBlock = (RowBlock) rowIdBlock.getSingleValueBlock(j); + valueToWrite = rowBlock.getRawFieldBlocks()[0].getLong(0, 0); + } + builder.writeLong(valueToWrite); + } + return builder.build(); + } + + private Block insertCurrentTransaction(Page dataPage, Block rowIdBlock, long value) + { + BlockBuilder builder = new LongArrayBlockBuilder(null, dataPage.getPositionCount()); + boolean keepOriginal = acidWriteType.map(t -> (t == HiveACIDWriteType.VACUUM)).orElse(false); + long valueToWrite = value; + for (int j = 0; j < dataPage.getPositionCount(); j++) { + if (rowIdBlock != null && keepOriginal) { + RowBlock rowBlock = (RowBlock) rowIdBlock.getSingleValueBlock(j); + valueToWrite = rowBlock.getRawFieldBlocks()[3].getLong(0, 0); + } + builder.writeLong(valueToWrite); + } + indexWriteId = valueToWrite; + return builder.build(); + } + + private Block insertRowIdBlock(Page dataPage, Block rowIdBlock) + { + BlockBuilder builder = new LongArrayBlockBuilder(null, dataPage.getPositionCount()); + boolean keepOriginal = acidWriteType.map(t -> (t == HiveACIDWriteType.DELETE || t == HiveACIDWriteType.VACUUM)).orElse(false); + long valueToWrite = -1; + for (int j = 0; j < dataPage.getPositionCount(); j++) { + valueToWrite = (rowId + 1); + if (rowIdBlock != null && keepOriginal) { + RowBlock rowBlock = (RowBlock) rowIdBlock.getSingleValueBlock(j); + valueToWrite = rowBlock.getRawFieldBlocks()[2].getLong(0, 0); + } + else { + ++rowId; + } + builder.writeLong(valueToWrite); + } + indexRowId = valueToWrite; + return builder.build(); + } + + private Block insertBucketIdBlock(Page dataPage, Block rowIdBlock, int value) + { + //In case of VACUUM_UNIFY need to map bucketId to fileName. 
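+        // For DELETE/VACUUM writes the original encoded bucket id is read back from field 1 of the
+        // incoming rowId struct, so each row keeps the bucket it was originally written with;
+        // for plain inserts every row gets this writer's own encoded bucket id.
+        // The last value written is also remembered in indexEncodedBucketId for the ACID key index.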
+ boolean keepOriginal = acidWriteType.map(t -> (t == HiveACIDWriteType.DELETE || t == HiveACIDWriteType.VACUUM)).orElse(false); + BlockBuilder builder = new IntArrayBlockBuilder(null, dataPage.getPositionCount()); + int valueToWrite = value; + for (int j = 0; j < dataPage.getPositionCount(); j++) { + if (rowIdBlock != null && keepOriginal) { + RowBlock rowBlock = (RowBlock) rowIdBlock.getSingleValueBlock(j); + valueToWrite = rowBlock.getRawFieldBlocks()[1].getInt(0, 0); + } + builder.writeInt(valueToWrite); + } + indexEncodedBucketId = valueToWrite; + return builder.build(); + } + + @Override + public void commit() + { + try { + if (deleteDeltaFileWriter.isPresent()) { + deleteDeltaFileWriter.get().commit(); + } + orcWriter.close(); + } + catch (IOException | UncheckedIOException e) { + try { + rollbackAction.call(); + } + catch (Exception ignored) { + // ignore + } + throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e); + } + + if (validationInputFactory.isPresent()) { + try { + try (OrcDataSource input = validationInputFactory.get().get()) { + long startThreadCpuTime = THREAD_MX_BEAN.getCurrentThreadCpuTime(); + orcWriter.validate(input); + validationCpuNanos += THREAD_MX_BEAN.getCurrentThreadCpuTime() - startThreadCpuTime; + } + } + catch (IOException | UncheckedIOException e) { + throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e); + } + } + } + + @Override + public void rollback() + { + try { + try { + if (deleteDeltaFileWriter.isPresent()) { + deleteDeltaFileWriter.get().rollback(); + } + orcWriter.close(); + } + finally { + rollbackAction.call(); + } + } + catch (Exception e) { + throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error rolling back write to Hive", e); + } + } + + @Override + public long getValidationCpuNanos() + { + return validationCpuNanos; + } + + @Override + public ImmutableList getExtraPartitionFiles() + { + if (deleteDeltaFileWriter.isPresent()) { + OrcFileWriter deleteFileWriter = (OrcFileWriter) deleteDeltaFileWriter.get(); + Path deletePath = deleteFileWriter.path.getParent(); + return ImmutableList.of(deletePath.getName()); + } + return ImmutableList.of(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("writer", orcWriter) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterConfig.java new file mode 100644 index 00000000..915ef5c9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterConfig.java @@ -0,0 +1,113 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.airlift.configuration.Config; +import io.airlift.units.DataSize; +import io.prestosql.orc.OrcWriterOptions; + +@SuppressWarnings("unused") +public class OrcFileWriterConfig +{ + private OrcWriterOptions options = new OrcWriterOptions(); + + public OrcWriterOptions toOrcWriterOptions() + { + return options; + } + + public DataSize getStripeMinSize() + { + return options.getStripeMinSize(); + } + + @Config("hive.orc.writer.stripe-min-size") + public OrcFileWriterConfig setStripeMinSize(DataSize stripeMinSize) + { + options = options.withStripeMinSize(stripeMinSize); + return this; + } + + public DataSize getStripeMaxSize() + { + return options.getStripeMaxSize(); + } + + @Config("hive.orc.writer.stripe-max-size") + public OrcFileWriterConfig setStripeMaxSize(DataSize stripeMaxSize) + { + options = options.withStripeMaxSize(stripeMaxSize); + return this; + } + + public int getStripeMaxRowCount() + { + return options.getStripeMaxRowCount(); + } + + @Config("hive.orc.writer.stripe-max-rows") + public OrcFileWriterConfig setStripeMaxRowCount(int stripeMaxRowCount) + { + options = options.withStripeMaxRowCount(stripeMaxRowCount); + return this; + } + + public int getRowGroupMaxRowCount() + { + return options.getRowGroupMaxRowCount(); + } + + @Config("hive.orc.writer.row-group-max-rows") + public OrcFileWriterConfig setRowGroupMaxRowCount(int rowGroupMaxRowCount) + { + options = options.withRowGroupMaxRowCount(rowGroupMaxRowCount); + return this; + } + + public DataSize getDictionaryMaxMemory() + { + return options.getDictionaryMaxMemory(); + } + + @Config("hive.orc.writer.dictionary-max-memory") + public OrcFileWriterConfig setDictionaryMaxMemory(DataSize dictionaryMaxMemory) + { + options = options.withDictionaryMaxMemory(dictionaryMaxMemory); + return this; + } + + public DataSize getStringStatisticsLimit() + { + return options.getMaxStringStatisticsLimit(); + } + + @Config("hive.orc.writer.string-statistics-limit") + public OrcFileWriterConfig setStringStatisticsLimit(DataSize stringStatisticsLimit) + { + options = options.withMaxStringStatisticsLimit(stringStatisticsLimit); + return this; + } + + public DataSize getMaxCompressionBufferSize() + { + return options.getMaxCompressionBufferSize(); + } + + @Config("hive.orc.writer.max-compression-buffer-size") + public OrcFileWriterConfig setMaxCompressionBufferSize(DataSize maxCompressionBufferSize) + { + options = options.withMaxCompressionBufferSize(maxCompressionBufferSize); + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java new file mode 100644 index 00000000..abf57e14 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/OrcFileWriterFactory.java @@ -0,0 +1,260 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.orc.OrcDataSink; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.orc.OrcWriterOptions; +import io.prestosql.orc.OrcWriterStats; +import io.prestosql.orc.OutputStreamOrcDataSink; +import io.prestosql.orc.metadata.CompressionKind; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.orc.HdfsOrcDataSource; +import io.prestosql.plugin.hive.orc.OrcPageSourceFactory; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.orc.OrcConf; +import org.weakref.jmx.Flatten; +import org.weakref.jmx.Managed; + +import javax.inject.Inject; + +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.concurrent.Callable; +import java.util.function.Supplier; + +import static io.prestosql.plugin.hive.HiveUtil.getColumnNames; +import static io.prestosql.plugin.hive.HiveUtil.getColumnTypes; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class OrcFileWriterFactory + implements HiveFileWriterFactory +{ + private final HdfsEnvironment hdfsEnvironment; + private final TypeManager typeManager; + private final NodeVersion nodeVersion; + private final FileFormatDataSourceStats readStats; + private final OrcWriterStats stats = new OrcWriterStats(); + private final OrcWriterOptions orcWriterOptions; + private final boolean writeLegacyVersion; + + @Inject + public OrcFileWriterFactory( + HdfsEnvironment hdfsEnvironment, + TypeManager typeManager, + NodeVersion nodeVersion, + HiveConfig hiveConfig, + FileFormatDataSourceStats readStats, + OrcFileWriterConfig config) + { + this( + hdfsEnvironment, + typeManager, + nodeVersion, + hiveConfig.isOrcWriteLegacyVersion(), + readStats, + requireNonNull(config, "config is null").toOrcWriterOptions()); + } + + public OrcFileWriterFactory( + HdfsEnvironment hdfsEnvironment, + TypeManager typeManager, + NodeVersion nodeVersion, + boolean writeLegacyVersion, + FileFormatDataSourceStats readStats, + OrcWriterOptions orcWriterOptions) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.nodeVersion = requireNonNull(nodeVersion, "nodeVersion is null"); + this.writeLegacyVersion = writeLegacyVersion; + this.readStats = requireNonNull(readStats, "stats is null"); + this.orcWriterOptions = requireNonNull(orcWriterOptions, "orcWriterOptions is null"); + } + + @Managed + @Flatten + public OrcWriterStats getStats() + { + return stats; + } + + @Override + public Optional createFileWriter( + Path path, + List inputColumnNames, + StorageFormat storageFormat, + 
Properties schema, + JobConf configuration, + ConnectorSession session, + Optional acidOptions, + Optional acidWriteType) + { + if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) { + return Optional.empty(); + } + + CompressionKind compression = getCompression(schema, configuration); + + // existing tables and partitions may have columns in a different order than the writer is providing, so build + // an index to rearrange columns in the proper order + List fileColumnNames = getColumnNames(schema); + List fileColumnTypes = getColumnTypes(schema).stream() + .map(hiveType -> hiveType.getType(typeManager)) + .collect(toList()); + List dataFileColumnTypes = fileColumnTypes; + + int[] fileInputColumnIndexes = fileColumnNames.stream() + .mapToInt(inputColumnNames::indexOf) + .toArray(); + + Optional deleteDeltaWriter = Optional.empty(); + if (AcidUtils.isTablePropertyTransactional(schema) && !AcidUtils.isInsertOnlyTable(schema)) { + ImmutableList orcFileColumnNames = ImmutableList.of(OrcPageSourceFactory.ACID_COLUMN_OPERATION, + OrcPageSourceFactory.ACID_COLUMN_ORIGINAL_TRANSACTION, + OrcPageSourceFactory.ACID_COLUMN_BUCKET, + OrcPageSourceFactory.ACID_COLUMN_ROW_ID, + OrcPageSourceFactory.ACID_COLUMN_CURRENT_TRANSACTION, + OrcPageSourceFactory.ACID_COLUMN_ROW_STRUCT); + + ImmutableList.Builder fieldsBuilder = ImmutableList.builder(); + for (int i = 0; i < fileColumnNames.size(); i++) { + fieldsBuilder.add(new RowType.Field(Optional.of(fileColumnNames.get(i)), fileColumnTypes.get(i))); + } + ImmutableList orcFileColumnTypes = ImmutableList.of(INTEGER, + BIGINT, + INTEGER, + BIGINT, + BIGINT, + RowType.from(fieldsBuilder.build())); + fileColumnNames = orcFileColumnNames; + fileColumnTypes = orcFileColumnTypes; + if (acidWriteType.isPresent() && acidWriteType.get() == HiveACIDWriteType.UPDATE) { + AcidOutputFormat.Options deleteOptions = acidOptions.get().clone().writingDeleteDelta(true); + Path deleteDeltaPath = AcidUtils.createFilename(path.getParent().getParent(), deleteOptions); + deleteDeltaWriter = createFileWriter(deleteDeltaPath, inputColumnNames, storageFormat, schema, configuration, + session, Optional.of(deleteOptions), Optional.of(HiveACIDWriteType.DELETE)); + } + } + + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration); + OrcDataSink orcDataSink = createOrcDataSink(session, fileSystem, path); + + Optional> validationInputFactory = Optional.empty(); + if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) { + validationInputFactory = Optional.of(() -> { + try { + FileStatus fileStatus = fileSystem.getFileStatus(path); + return new HdfsOrcDataSource( + new OrcDataSourceId(path.toString()), + fileStatus.getLen(), + HiveSessionProperties.getOrcMaxMergeDistance(session), + HiveSessionProperties.getOrcMaxBufferSize(session), + HiveSessionProperties.getOrcStreamBufferSize(session), + false, + fileSystem.open(path), + readStats, + fileStatus.getModificationTime()); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e); + } + }); + } + + Callable rollbackAction = () -> { + fileSystem.delete(path, false); + return null; + }; + + return Optional.of(new OrcFileWriter( + orcDataSink, + rollbackAction, + fileColumnNames, + fileColumnTypes, + dataFileColumnTypes, + compression, + orcWriterOptions + .withStripeMinSize(HiveSessionProperties.getOrcOptimizedWriterMinStripeSize(session)) + 
.withStripeMaxSize(HiveSessionProperties.getOrcOptimizedWriterMaxStripeSize(session)) + .withStripeMaxRowCount(HiveSessionProperties.getOrcOptimizedWriterMaxStripeRows(session)) + .withDictionaryMaxMemory(HiveSessionProperties.getOrcOptimizedWriterMaxDictionaryMemory(session)) + .withMaxStringStatisticsLimit(HiveSessionProperties.getOrcStringStatisticsLimit(session)), + writeLegacyVersion, + fileInputColumnIndexes, + ImmutableMap.builder() + .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString()) + .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId()) + .put("hive.acid.version", String.valueOf(AcidUtils.OrcAcidVersion.ORC_ACID_VERSION)) + .build(), + validationInputFactory, + HiveSessionProperties.getOrcOptimizedWriterValidateMode(session), + stats, + acidOptions, + acidWriteType, + deleteDeltaWriter, + path)); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e); + } + } + + /** + * Allow subclass to replace data sink implementation. + */ + protected OrcDataSink createOrcDataSink(ConnectorSession session, FileSystem fileSystem, Path path) + throws IOException + { + return new OutputStreamOrcDataSink(fileSystem.create(path)); + } + + private static CompressionKind getCompression(Properties schema, JobConf configuration) + { + String compressionName = OrcConf.COMPRESS.getString(schema, configuration); + if (compressionName == null) { + return CompressionKind.ZLIB; + } + + CompressionKind compression; + try { + compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH)); + } + catch (IllegalArgumentException e) { + throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, "Unknown ORC compression type " + compressionName); + } + return compression; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ParquetFileWriterConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ParquetFileWriterConfig.java new file mode 100644 index 00000000..632384c6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ParquetFileWriterConfig.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.airlift.configuration.Config; +import io.airlift.units.DataSize; +import org.apache.parquet.hadoop.ParquetWriter; + +import static io.airlift.units.DataSize.Unit.BYTE; + +public class ParquetFileWriterConfig +{ + private DataSize blockSize = new DataSize(ParquetWriter.DEFAULT_BLOCK_SIZE, BYTE); + private DataSize pageSize = new DataSize(ParquetWriter.DEFAULT_PAGE_SIZE, BYTE); + + public DataSize getBlockSize() + { + return blockSize; + } + + @Config("hive.parquet.writer.block-size") + public ParquetFileWriterConfig setBlockSize(DataSize blockSize) + { + this.blockSize = blockSize; + return this; + } + + public DataSize getPageSize() + { + return pageSize; + } + + @Config("hive.parquet.writer.page-size") + public ParquetFileWriterConfig setPageSize(DataSize pageSize) + { + this.pageSize = pageSize; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionNotFoundException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionNotFoundException.java new file mode 100644 index 00000000..6bbc0b37 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionNotFoundException.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.NotFoundException; +import io.prestosql.spi.connector.SchemaTableName; + +import java.util.List; + +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class PartitionNotFoundException + extends NotFoundException +{ + private final SchemaTableName tableName; + private final List partitionValues; + + public PartitionNotFoundException(SchemaTableName tableName, List partitionValue) + { + this(tableName, partitionValue, format("Partition '%s' not found", tableName), null); + } + + public PartitionNotFoundException(SchemaTableName tableName, List partitionValues, String message) + { + this(tableName, partitionValues, message, null); + } + + public PartitionNotFoundException(SchemaTableName tableName, List partitionValue, Throwable cause) + { + this(tableName, partitionValue, format("Partition '%s' not found", tableName), cause); + } + + public PartitionNotFoundException(SchemaTableName tableName, List partitionValues, String message, Throwable cause) + { + super(message, cause); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.partitionValues = requireNonNull(partitionValues, "partitionValue is null"); + } + + public SchemaTableName getTableName() + { + return tableName; + } + + public List getPartitionValues() + { + return partitionValues; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionOfflineException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionOfflineException.java new file mode 100644 index 00000000..38902582 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionOfflineException.java @@ -0,0 +1,59 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; + +import static com.google.common.base.Strings.isNullOrEmpty; +import static java.util.Objects.requireNonNull; + +public class PartitionOfflineException + extends PrestoException +{ + private final SchemaTableName tableName; + private final String partition; + + public PartitionOfflineException(SchemaTableName tableName, String partitionName, boolean forPresto, String offlineMessage) + { + super(HiveErrorCode.HIVE_PARTITION_OFFLINE, formatMessage(tableName, partitionName, forPresto, offlineMessage)); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.partition = requireNonNull(partitionName, "partition is null"); + } + + public SchemaTableName getTableName() + { + return tableName; + } + + public String getPartition() + { + return partition; + } + + private static String formatMessage(SchemaTableName tableName, String partitionName, boolean forPresto, String offlineMessage) + { + StringBuilder resultBuilder = new StringBuilder() + .append("Table '").append(tableName).append("'") + .append(" partition '").append(partitionName).append("'") + .append(" is offline"); + if (forPresto) { + resultBuilder.append(" for Presto"); + } + if (!isNullOrEmpty(offlineMessage)) { + resultBuilder.append(": ").append(offlineMessage); + } + return resultBuilder.toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionStatistics.java new file mode 100644 index 00000000..6040a933 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionStatistics.java @@ -0,0 +1,120 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; + +import javax.annotation.concurrent.Immutable; + +import java.util.Map; +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class PartitionStatistics +{ + private static final PartitionStatistics EMPTY = new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of()); + + private final HiveBasicStatistics basicStatistics; + private final Map columnStatistics; + + public static PartitionStatistics empty() + { + return EMPTY; + } + + @JsonCreator + public PartitionStatistics( + @JsonProperty("basicStatistics") HiveBasicStatistics basicStatistics, + @JsonProperty("columnStatistics") Map columnStatistics) + { + this.basicStatistics = requireNonNull(basicStatistics, "basicStatistics is null"); + this.columnStatistics = ImmutableMap.copyOf(requireNonNull(columnStatistics, "columnStatistics can not be null")); + } + + @JsonProperty + public HiveBasicStatistics getBasicStatistics() + { + return basicStatistics; + } + + @JsonProperty + public Map getColumnStatistics() + { + return columnStatistics; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + PartitionStatistics that = (PartitionStatistics) o; + return Objects.equals(basicStatistics, that.basicStatistics) && + Objects.equals(columnStatistics, that.columnStatistics); + } + + @Override + public int hashCode() + { + return Objects.hash(basicStatistics, columnStatistics); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("basicStatistics", basicStatistics) + .add("columnStatistics", columnStatistics) + .toString(); + } + + public static Builder builder() + { + return new Builder(); + } + + public static class Builder + { + private HiveBasicStatistics basicStatistics = HiveBasicStatistics.createEmptyStatistics(); + private Map columnStatistics = ImmutableMap.of(); + + public Builder setBasicStatistics(HiveBasicStatistics basicStatistics) + { + this.basicStatistics = requireNonNull(basicStatistics, "basicStatistics is null"); + return this; + } + + public Builder setColumnStatistics(Map columnStatistics) + { + this.columnStatistics = ImmutableMap.copyOf(requireNonNull(columnStatistics, "columnStatistics is null")); + return this; + } + + public PartitionStatistics build() + { + return new PartitionStatistics(basicStatistics, columnStatistics); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionUpdate.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionUpdate.java new file mode 100644 index 00000000..b5644c48 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/PartitionUpdate.java @@ -0,0 +1,221 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Multimaps; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.fs.Path; + +import java.util.Collection; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class PartitionUpdate +{ + private final String name; + private final UpdateMode updateMode; + private final Path writePath; + private final Path targetPath; + private final List fileNames; + private final long rowCount; + private final long inMemoryDataSizeInBytes; + private final long onDiskDataSizeInBytes; + private final List miscData; + + @JsonCreator + public PartitionUpdate( + @JsonProperty("name") String name, + @JsonProperty("updateMode") UpdateMode updateMode, + @JsonProperty("writePath") String writePath, + @JsonProperty("targetPath") String targetPath, + @JsonProperty("fileNames") List fileNames, + @JsonProperty("rowCount") long rowCount, + @JsonProperty("inMemoryDataSizeInBytes") long inMemoryDataSizeInBytes, + @JsonProperty("onDiskDataSizeInBytes") long onDiskDataSizeInBytes, + @JsonProperty("miscData") List miscData) + { + this( + name, + updateMode, + new Path(requireNonNull(writePath, "writePath is null")), + new Path(requireNonNull(targetPath, "targetPath is null")), + fileNames, + rowCount, + inMemoryDataSizeInBytes, + onDiskDataSizeInBytes, + miscData); + } + + public PartitionUpdate( + String name, + UpdateMode updateMode, + Path writePath, + Path targetPath, + List fileNames, + long rowCount, + long inMemoryDataSizeInBytes, + long onDiskDataSizeInBytes, + List miscData) + { + this.name = requireNonNull(name, "name is null"); + this.updateMode = requireNonNull(updateMode, "updateMode is null"); + this.writePath = requireNonNull(writePath, "writePath is null"); + this.targetPath = requireNonNull(targetPath, "targetPath is null"); + this.fileNames = ImmutableList.copyOf(requireNonNull(fileNames, "fileNames is null")); + checkArgument(rowCount >= 0, "rowCount is negative: %s", rowCount); + this.rowCount = rowCount; + checkArgument(inMemoryDataSizeInBytes >= 0, "inMemoryDataSizeInBytes is negative: %s", inMemoryDataSizeInBytes); + this.inMemoryDataSizeInBytes = inMemoryDataSizeInBytes; + checkArgument(onDiskDataSizeInBytes >= 0, "onDiskDataSizeInBytes is negative: %s", onDiskDataSizeInBytes); + this.onDiskDataSizeInBytes = onDiskDataSizeInBytes; + this.miscData = requireNonNull(miscData, "miscData is null"); + } + + @JsonProperty + public String getName() + { + return name; + } + + @JsonProperty + public UpdateMode getUpdateMode() + { + return updateMode; + } + + public Path getWritePath() + { + return writePath; + } + + public Path getTargetPath() + { + return targetPath; + } + + @JsonProperty + public List getFileNames() + { + return fileNames; + } + + 
@JsonProperty("targetPath") + public String getJsonSerializableTargetPath() + { + return targetPath.toString(); + } + + @JsonProperty("writePath") + public String getJsonSerializableWritePath() + { + return writePath.toString(); + } + + @JsonProperty + public long getRowCount() + { + return rowCount; + } + + @JsonProperty + public long getInMemoryDataSizeInBytes() + { + return inMemoryDataSizeInBytes; + } + + @JsonProperty + public long getOnDiskDataSizeInBytes() + { + return onDiskDataSizeInBytes; + } + + @JsonProperty + public List getMiscData() + { + return miscData; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("name", name) + .add("updateMode", updateMode) + .add("writePath", writePath) + .add("targetPath", targetPath) + .add("fileNames", fileNames) + .add("rowCount", rowCount) + .add("inMemoryDataSizeInBytes", inMemoryDataSizeInBytes) + .add("onDiskDataSizeInBytes", onDiskDataSizeInBytes) + .add("miscData", miscData) + .toString(); + } + + public HiveBasicStatistics getStatistics() + { + return new HiveBasicStatistics(fileNames.size(), rowCount, inMemoryDataSizeInBytes, onDiskDataSizeInBytes); + } + + public static List mergePartitionUpdates(Iterable unMergedUpdates) + { + ImmutableList.Builder partitionUpdates = ImmutableList.builder(); + for (Collection partitionGroup : Multimaps.index(unMergedUpdates, PartitionUpdate::getName).asMap().values()) { + PartitionUpdate firstPartition = partitionGroup.iterator().next(); + + ImmutableList.Builder allFileNames = ImmutableList.builder(); + ImmutableList.Builder allMiscData = ImmutableList.builder(); + long totalRowCount = 0; + long totalInMemoryDataSizeInBytes = 0; + long totalOnDiskDataSizeInBytes = 0; + for (PartitionUpdate partition : partitionGroup) { + // verify partitions have the same new flag, write path and target path + // this shouldn't happen but could if another user added a partition during the write + if (!partition.getWritePath().equals(firstPartition.getWritePath()) || + !partition.getTargetPath().equals(firstPartition.getTargetPath())) { + throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, format("Partition %s was added or modified during INSERT", firstPartition.getName())); + } + allFileNames.addAll(partition.getFileNames()); + allMiscData.addAll(partition.getMiscData()); + totalRowCount += partition.getRowCount(); + totalInMemoryDataSizeInBytes += partition.getInMemoryDataSizeInBytes(); + totalOnDiskDataSizeInBytes += partition.getOnDiskDataSizeInBytes(); + } + + partitionUpdates.add(new PartitionUpdate(firstPartition.getName(), + firstPartition.getUpdateMode(), + firstPartition.getWritePath(), + firstPartition.getTargetPath(), + allFileNames.build(), + totalRowCount, + totalInMemoryDataSizeInBytes, + totalOnDiskDataSizeInBytes, + allMiscData.build())); + } + return partitionUpdates.build(); + } + + public enum UpdateMode + { + NEW, + APPEND, + OVERWRITE, + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriter.java new file mode 100644 index 00000000..a737f1ce --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriter.java @@ -0,0 +1,188 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.io.CountingOutputStream; +import io.airlift.slice.OutputStreamSliceOutput; +import io.prestosql.rcfile.AircompressorCodecFactory; +import io.prestosql.rcfile.HadoopCodecFactory; +import io.prestosql.rcfile.RcFileDataSource; +import io.prestosql.rcfile.RcFileEncoding; +import io.prestosql.rcfile.RcFileWriter; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.spi.type.Type; +import org.openjdk.jol.info.ClassLayout; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.UncheckedIOException; +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadMXBean; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.function.Supplier; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public class RcFileFileWriter + implements HiveFileWriter +{ + private static final int INSTANCE_SIZE = ClassLayout.parseClass(RcFileFileWriter.class).instanceSize(); + private static final ThreadMXBean THREAD_MX_BEAN = ManagementFactory.getThreadMXBean(); + + private final CountingOutputStream outputStream; + private final RcFileWriter rcFileWriter; + private final Callable rollbackAction; + private final int[] fileInputColumnIndexes; + private final List nullBlocks; + private final Optional> validationInputFactory; + + private long validationCpuNanos; + + public RcFileFileWriter( + OutputStream outputStream, + Callable rollbackAction, + RcFileEncoding rcFileEncoding, + List fileColumnTypes, + Optional codecName, + int[] fileInputColumnIndexes, + Map metadata, + Optional> validationInputFactory) + throws IOException + { + this.outputStream = new CountingOutputStream(outputStream); + rcFileWriter = new RcFileWriter( + new OutputStreamSliceOutput(this.outputStream), + fileColumnTypes, + rcFileEncoding, + codecName, + new AircompressorCodecFactory(new HadoopCodecFactory(getClass().getClassLoader())), + metadata, + validationInputFactory.isPresent()); + this.rollbackAction = requireNonNull(rollbackAction, "rollbackAction is null"); + + this.fileInputColumnIndexes = requireNonNull(fileInputColumnIndexes, "outputColumnInputIndexes is null"); + + ImmutableList.Builder nullBlocks = ImmutableList.builder(); + for (Type fileColumnType : fileColumnTypes) { + BlockBuilder blockBuilder = fileColumnType.createBlockBuilder(null, 1, 0); + blockBuilder.appendNull(); + nullBlocks.add(blockBuilder.build()); + } + this.nullBlocks = nullBlocks.build(); + this.validationInputFactory = validationInputFactory; + } + + @Override + public long getWrittenBytes() + { + return outputStream.getCount(); + } + + @Override + public long getSystemMemoryUsage() + { + return INSTANCE_SIZE + rcFileWriter.getRetainedSizeInBytes(); + } + + @Override + 
public void appendRows(Page dataPage) + { + Block[] blocks = new Block[fileInputColumnIndexes.length]; + for (int i = 0; i < fileInputColumnIndexes.length; i++) { + int inputColumnIndex = fileInputColumnIndexes[i]; + if (inputColumnIndex < 0) { + blocks[i] = new RunLengthEncodedBlock(nullBlocks.get(i), dataPage.getPositionCount()); + } + else { + blocks[i] = dataPage.getBlock(inputColumnIndex); + } + } + Page page = new Page(dataPage.getPositionCount(), blocks); + try { + rcFileWriter.write(page); + } + catch (IOException | UncheckedIOException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_DATA_ERROR, e); + } + } + + @Override + public void commit() + { + try { + rcFileWriter.close(); + } + catch (IOException | UncheckedIOException e) { + try { + rollbackAction.call(); + } + catch (Exception ignored) { + // ignore + } + throw new PrestoException(HiveErrorCode.HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e); + } + + if (validationInputFactory.isPresent()) { + try { + try (RcFileDataSource input = validationInputFactory.get().get()) { + long startThreadCpuTime = THREAD_MX_BEAN.getCurrentThreadCpuTime(); + rcFileWriter.validate(input); + validationCpuNanos += THREAD_MX_BEAN.getCurrentThreadCpuTime() - startThreadCpuTime; + } + } + catch (IOException | UncheckedIOException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e); + } + } + } + + @Override + public void rollback() + { + try { + try { + rcFileWriter.close(); + } + finally { + rollbackAction.call(); + } + } + catch (Exception e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_CLOSE_ERROR, "Error rolling back write to Hive", e); + } + } + + @Override + public long getValidationCpuNanos() + { + return validationCpuNanos; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("writer", rcFileWriter) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriterFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriterFactory.java new file mode 100644 index 00000000..6265b8c7 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RcFileFileWriterFactory.java @@ -0,0 +1,165 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource; +import io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory; +import io.prestosql.rcfile.RcFileDataSource; +import io.prestosql.rcfile.RcFileEncoding; +import io.prestosql.rcfile.binary.BinaryRcFileEncoding; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; +import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; +import org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.concurrent.Callable; +import java.util.function.Supplier; + +import static io.prestosql.plugin.hive.HiveUtil.getColumnNames; +import static io.prestosql.plugin.hive.HiveUtil.getColumnTypes; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class RcFileFileWriterFactory + implements HiveFileWriterFactory +{ + private final DateTimeZone timeZone; + private final HdfsEnvironment hdfsEnvironment; + private final TypeManager typeManager; + private final NodeVersion nodeVersion; + private final FileFormatDataSourceStats stats; + + @Inject + public RcFileFileWriterFactory( + HdfsEnvironment hdfsEnvironment, + TypeManager typeManager, + NodeVersion nodeVersion, + HiveConfig hiveConfig, + FileFormatDataSourceStats stats) + { + this(hdfsEnvironment, typeManager, nodeVersion, requireNonNull(hiveConfig, "hiveConfig is null").getRcfileDateTimeZone(), stats); + } + + public RcFileFileWriterFactory( + HdfsEnvironment hdfsEnvironment, + TypeManager typeManager, + NodeVersion nodeVersion, + DateTimeZone timeZone, + FileFormatDataSourceStats stats) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.nodeVersion = requireNonNull(nodeVersion, "nodeVersion is null"); + this.timeZone = requireNonNull(timeZone, "timeZone is null"); + this.stats = requireNonNull(stats, "stats is null"); + } + + @Override + public Optional createFileWriter( + Path path, + List inputColumnNames, + StorageFormat storageFormat, + Properties schema, + JobConf configuration, + ConnectorSession session, Optional acidOptions, Optional acidWriteType) + { + if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) { + return Optional.empty(); + } + + RcFileEncoding rcFileEncoding; + if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) { + rcFileEncoding = new BinaryRcFileEncoding(timeZone); + } + else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) { + rcFileEncoding = RcFilePageSourceFactory.createTextVectorEncoding(schema); + } + else { + return Optional.empty(); + } + + Optional codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC)); + + 
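Note: RcFileFileWriter.appendRows above copies input blocks into file-column order and substitutes a run-length-encoded null block wherever fileInputColumnIndexes holds a negative value. A self-contained sketch of that index mapping, with hypothetical column names; only the index arithmetic is taken from the patch:

    // Sketch of the fileInputColumnIndexes mapping used by the writers in this patch.
    import com.google.common.collect.ImmutableList;
    import java.util.Arrays;
    import java.util.List;

    public final class ColumnIndexMappingSketch
    {
        public static void main(String[] args)
        {
            List<String> fileColumnNames = ImmutableList.of("id", "name", "ds");  // order in the table schema
            List<String> inputColumnNames = ImmutableList.of("name", "id");       // order supplied by the query

            // For each file column, find its position in the input page; -1 means "not provided".
            int[] fileInputColumnIndexes = fileColumnNames.stream()
                    .mapToInt(inputColumnNames::indexOf)
                    .toArray();

            System.out.println(Arrays.toString(fileInputColumnIndexes));  // prints [1, 0, -1]
            // A negative index tells appendRows to emit a run-length-encoded null block
            // for that column instead of copying a block from the input page.
        }
    }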
// existing tables and partitions may have columns in a different order than the writer is providing, so build + // an index to rearrange columns in the proper order + List fileColumnNames = getColumnNames(schema); + List fileColumnTypes = getColumnTypes(schema).stream() + .map(hiveType -> hiveType.getType(typeManager)) + .collect(toList()); + + int[] fileInputColumnIndexes = fileColumnNames.stream() + .mapToInt(inputColumnNames::indexOf) + .toArray(); + + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration); + OutputStream outputStream = fileSystem.create(path); + + Optional> validationInputFactory = Optional.empty(); + if (HiveSessionProperties.isRcfileOptimizedWriterValidate(session)) { + validationInputFactory = Optional.of(() -> { + try { + return new HdfsRcFileDataSource( + path.toString(), + fileSystem.open(path), + fileSystem.getFileStatus(path).getLen(), + stats); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e); + } + }); + } + + Callable rollbackAction = () -> { + fileSystem.delete(path, false); + return null; + }; + + return Optional.of(new RcFileFileWriter( + outputStream, + rollbackAction, + rcFileEncoding, + fileColumnTypes, + codecName, + fileInputColumnIndexes, + ImmutableMap.builder() + .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString()) + .put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId()) + .build(), + validationInputFactory)); + } + catch (Exception e) { + throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RecordFileWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RecordFileWriter.java new file mode 100644 index 00000000..16e41cfc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RecordFileWriter.java @@ -0,0 +1,223 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.parquet.ParquetRecordWriter; +import io.prestosql.plugin.hive.util.FieldSetterFactory; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.Serializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.mapred.JobConf; +import org.joda.time.DateTimeZone; +import org.openjdk.jol.info.ClassLayout; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.Properties; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_DATA_ERROR; +import static io.prestosql.plugin.hive.HiveUtil.getColumnNames; +import static io.prestosql.plugin.hive.HiveUtil.getColumnTypes; +import static io.prestosql.plugin.hive.HiveWriteUtils.createRecordWriter; +import static io.prestosql.plugin.hive.HiveWriteUtils.getRowColumnInspectors; +import static io.prestosql.plugin.hive.HiveWriteUtils.initializeSerializer; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector; + +public class RecordFileWriter + implements HiveFileWriter +{ + private static final int INSTANCE_SIZE = ClassLayout.parseClass(RecordFileWriter.class).instanceSize(); + + private final Path path; + private final JobConf conf; + private final int fieldCount; + private final Serializer serializer; + private final RecordWriter recordWriter; + private final SettableStructObjectInspector tableInspector; + private final List structFields; + private final Object row; + private final FieldSetterFactory.FieldSetter[] setters; + private final long estimatedWriterSystemMemoryUsage; + + private boolean committed; + + public RecordFileWriter( + Path path, + List inputColumnNames, + StorageFormat storageFormat, + Properties schema, + DataSize estimatedWriterSystemMemoryUsage, + JobConf conf, + TypeManager typeManager, + DateTimeZone parquetTimeZone, + ConnectorSession session) + { + this.path = requireNonNull(path, "path is null"); + this.conf = requireNonNull(conf, "conf is null"); + + // existing tables may have columns in a different order + List fileColumnNames = getColumnNames(schema); + List fileColumnTypes = getColumnTypes(schema).stream() + .map(hiveType -> hiveType.getType(typeManager)) + .collect(toList()); + + fieldCount = fileColumnNames.size(); + + String serDe = storageFormat.getSerDe(); + serializer = initializeSerializer(conf, schema, serDe); + recordWriter = createRecordWriter(path, conf, schema, storageFormat.getOutputFormat(), session); + + List objectInspectors = 
getRowColumnInspectors(fileColumnTypes); + tableInspector = getStandardStructObjectInspector(fileColumnNames, objectInspectors); + + // reorder (and possibly reduce) struct fields to match input + structFields = ImmutableList.copyOf(inputColumnNames.stream() + .map(tableInspector::getStructFieldRef) + .collect(toList())); + + row = tableInspector.create(); + + DateTimeZone timeZone = (recordWriter instanceof ParquetRecordWriter) ? parquetTimeZone : DateTimeZone.UTC; + FieldSetterFactory fieldSetterFactory = new FieldSetterFactory(timeZone); + + setters = new FieldSetterFactory.FieldSetter[structFields.size()]; + for (int i = 0; i < setters.length; i++) { + setters[i] = fieldSetterFactory.create(tableInspector, row, structFields.get(i), fileColumnTypes.get(structFields.get(i).getFieldID())); + } + + this.estimatedWriterSystemMemoryUsage = estimatedWriterSystemMemoryUsage.toBytes(); + } + + @Override + public long getWrittenBytes() + { + if (recordWriter instanceof ExtendedRecordWriter) { + return ((ExtendedRecordWriter) recordWriter).getWrittenBytes(); + } + + if (committed) { + try { + return path.getFileSystem(conf).getFileStatus(path).getLen(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + // there is no good way to get this when RecordWriter is not yet committed + return 0; + } + + @Override + public long getSystemMemoryUsage() + { + return INSTANCE_SIZE + estimatedWriterSystemMemoryUsage; + } + + @Override + public void appendRows(Page dataPage) + { + for (int position = 0; position < dataPage.getPositionCount(); position++) { + appendRow(dataPage, position); + } + } + + public void appendRow(Page dataPage, int position) + { + for (int field = 0; field < fieldCount; field++) { + Block block = dataPage.getBlock(field); + if (block.isNull(position)) { + tableInspector.setStructFieldData(row, structFields.get(field), null); + } + else { + setters[field].setField(block, position); + } + } + + try { + recordWriter.write(serializer.serialize(row, tableInspector)); + } + catch (SerDeException | IOException e) { + throw new PrestoException(HIVE_WRITER_DATA_ERROR, e); + } + } + + @Override + public void commit() + { + try { + recordWriter.close(false); + committed = true; + } + catch (IOException e) { + throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e); + } + } + + @Override + public void rollback() + { + try { + try { + recordWriter.close(true); + } + finally { + // perform explicit deletion here as implementations of RecordWriter.close() often ignore the abort flag. 
+ path.getFileSystem(conf).delete(path, false); + } + } + catch (IOException e) { + throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error rolling back write to Hive", e); + } + } + + @Override + public long getValidationCpuNanos() + { + // RecordFileWriter delegates to Hive RecordWriter and there is no validation + return 0; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("path", path) + .toString(); + } + + public interface ExtendedRecordWriter + extends RecordWriter + { + long getWrittenBytes(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RoleAlreadyExistsException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RoleAlreadyExistsException.java new file mode 100644 index 00000000..486ac09e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/RoleAlreadyExistsException.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.PrestoException; + +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static java.lang.String.format; + +public class RoleAlreadyExistsException + extends PrestoException +{ + public RoleAlreadyExistsException(String roleName) + { + super(ALREADY_EXISTS, format("Role already exists: '%s'", roleName), null); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectCsvRecordReader.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectCsvRecordReader.java new file mode 100644 index 00000000..dc761e97 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectCsvRecordReader.java @@ -0,0 +1,112 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.amazonaws.services.s3.model.CSVInput; +import com.amazonaws.services.s3.model.CSVOutput; +import com.amazonaws.services.s3.model.CompressionType; +import com.amazonaws.services.s3.model.ExpressionType; +import com.amazonaws.services.s3.model.InputSerialization; +import com.amazonaws.services.s3.model.OutputSerialization; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import io.prestosql.plugin.hive.s3.PrestoS3ClientFactory; +import io.prestosql.plugin.hive.s3.PrestoS3FileSystem; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.compress.BZip2Codec; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.GzipCodec; + +import java.net.URI; +import java.util.Properties; + +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static org.apache.hadoop.hive.serde.serdeConstants.ESCAPE_CHAR; +import static org.apache.hadoop.hive.serde.serdeConstants.QUOTE_CHAR; + +public class S3SelectCsvRecordReader + extends S3SelectLineRecordReader +{ + /* + * Sentinel unicode comment character (http://www.unicode.org/faq/private_use.html#nonchar_codes). + * It is expected that \uFDD0 sentinel comment character is not the first character in any row of user's CSV S3 object. + * The rows starting with \uFDD0 will be skipped by S3Select and will not be a part of the result set or aggregations. + * To process CSV objects that may contain \uFDD0 as first row character please disable S3SelectPushdown. + * TODO: Remove this proxy logic when S3Select API supports disabling of row level comments. + */ + + private static final String COMMENTS_CHAR_STR = "\uFDD0"; + + S3SelectCsvRecordReader( + Configuration configuration, + HiveConfig hiveConfig, + Path path, + long start, + long length, + Properties schema, + String ionSqlQuery, + PrestoS3ClientFactory s3ClientFactory) + { + super(configuration, hiveConfig, path, start, length, schema, ionSqlQuery, s3ClientFactory); + } + + @Override + public SelectObjectContentRequest buildSelectObjectRequest(Properties schema, String query, Path path) + { + SelectObjectContentRequest selectObjectRequest = new SelectObjectContentRequest(); + URI uri = path.toUri(); + selectObjectRequest.setBucketName(PrestoS3FileSystem.getBucketName(uri)); + selectObjectRequest.setKey(PrestoS3FileSystem.keyFromPath(path)); + selectObjectRequest.setExpression(query); + selectObjectRequest.setExpressionType(ExpressionType.SQL); + + String fieldDelimiter = getFieldDelimiter(schema); + String quoteChar = schema.getProperty(QUOTE_CHAR, null); + String escapeChar = schema.getProperty(ESCAPE_CHAR, null); + + CSVInput selectObjectCSVInputSerialization = new CSVInput(); + selectObjectCSVInputSerialization.setRecordDelimiter(lineDelimiter); + selectObjectCSVInputSerialization.setFieldDelimiter(fieldDelimiter); + selectObjectCSVInputSerialization.setComments(COMMENTS_CHAR_STR); + selectObjectCSVInputSerialization.setQuoteCharacter(quoteChar); + selectObjectCSVInputSerialization.setQuoteEscapeCharacter(escapeChar); + InputSerialization selectObjectInputSerialization = new InputSerialization(); + + CompressionCodec codec = compressionCodecFactory.getCodec(path); + if (codec instanceof GzipCodec) { + selectObjectInputSerialization.setCompressionType(CompressionType.GZIP); + } + else if (codec instanceof BZip2Codec) { + 
selectObjectInputSerialization.setCompressionType(CompressionType.BZIP2); + } + else if (codec != null) { + throw new PrestoException(NOT_SUPPORTED, "Compression extension not supported for S3 Select: " + path); + } + + selectObjectInputSerialization.setCsv(selectObjectCSVInputSerialization); + selectObjectRequest.setInputSerialization(selectObjectInputSerialization); + + OutputSerialization selectObjectOutputSerialization = new OutputSerialization(); + CSVOutput selectObjectCSVOutputSerialization = new CSVOutput(); + selectObjectCSVOutputSerialization.setRecordDelimiter(lineDelimiter); + selectObjectCSVOutputSerialization.setFieldDelimiter(fieldDelimiter); + selectObjectCSVOutputSerialization.setQuoteCharacter(quoteChar); + selectObjectCSVOutputSerialization.setQuoteEscapeCharacter(escapeChar); + selectObjectOutputSerialization.setCsv(selectObjectCSVOutputSerialization); + selectObjectRequest.setOutputSerialization(selectObjectOutputSerialization); + + return selectObjectRequest; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectLineRecordReader.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectLineRecordReader.java new file mode 100644 index 00000000..a59b0298 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectLineRecordReader.java @@ -0,0 +1,227 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.io.Closer; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.s3.HiveS3Config; +import io.prestosql.plugin.hive.s3.PrestoS3ClientFactory; +import io.prestosql.plugin.hive.s3.PrestoS3FileSystem; +import io.prestosql.plugin.hive.s3.PrestoS3SelectClient; +import io.prestosql.plugin.hive.util.RetryDriver; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.util.LineReader; + +import javax.annotation.concurrent.ThreadSafe; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.Properties; + +import static com.google.common.base.Throwables.throwIfInstanceOf; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static java.lang.String.format; +import static java.net.HttpURLConnection.HTTP_BAD_REQUEST; +import static java.net.HttpURLConnection.HTTP_FORBIDDEN; +import static java.net.HttpURLConnection.HTTP_NOT_FOUND; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.hadoop.hive.serde.serdeConstants.FIELD_DELIM; +import static org.apache.hadoop.hive.serde.serdeConstants.LINE_DELIM; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT; + +@ThreadSafe +public abstract class S3SelectLineRecordReader + implements RecordReader +{ + private InputStream selectObjectContent; + private long processedRecords; + private long recordsFromS3; + private long position; + private LineReader reader; + private boolean isFirstLine; + private static final Duration BACKOFF_MIN_SLEEP = new Duration(1, SECONDS); + private final PrestoS3SelectClient selectClient; + private final long start; + private final long end; + private final int maxAttempts; + private final Duration maxBackoffTime; + private final Duration maxRetryTime; + private final Closer closer = Closer.create(); + private final SelectObjectContentRequest selectObjectContentRequest; + protected final CompressionCodecFactory compressionCodecFactory; + protected final String lineDelimiter; + + S3SelectLineRecordReader( + Configuration configuration, + HiveConfig hiveConfig, + Path path, + long start, + long length, + Properties schema, + String ionSqlQuery, + PrestoS3ClientFactory s3ClientFactory) + { + requireNonNull(configuration, "configuration is null"); + requireNonNull(hiveConfig, "hiveConfig is null"); + requireNonNull(schema, "schema is null"); + requireNonNull(path, "path is null"); + requireNonNull(ionSqlQuery, "ionSqlQuery is null"); + requireNonNull(s3ClientFactory, "s3ClientFactory is null"); + this.lineDelimiter = (schema).getProperty(LINE_DELIM, "\n"); + this.processedRecords = 0; + this.recordsFromS3 = 0; + this.start = start; + this.position = this.start; + this.end = this.start + length; + this.isFirstLine = true; + + this.compressionCodecFactory = new CompressionCodecFactory(configuration); + this.selectObjectContentRequest = buildSelectObjectRequest(schema, ionSqlQuery, path); + + HiveS3Config defaults = new HiveS3Config(); + this.maxAttempts = 
configuration.getInt(PrestoS3FileSystem.S3_MAX_CLIENT_RETRIES, defaults.getS3MaxClientRetries()) + 1; + this.maxBackoffTime = Duration.valueOf(configuration.get(PrestoS3FileSystem.S3_MAX_BACKOFF_TIME, defaults.getS3MaxBackoffTime().toString())); + this.maxRetryTime = Duration.valueOf(configuration.get(PrestoS3FileSystem.S3_MAX_RETRY_TIME, defaults.getS3MaxRetryTime().toString())); + + this.selectClient = new PrestoS3SelectClient(configuration, hiveConfig, s3ClientFactory); + closer.register(selectClient); + } + + public abstract SelectObjectContentRequest buildSelectObjectRequest(Properties schema, String query, Path path); + + private int readLine(Text value) + throws IOException + { + try { + return RetryDriver.retry() + .maxAttempts(maxAttempts) + .exponentialBackoff(BACKOFF_MIN_SLEEP, maxBackoffTime, maxRetryTime, 2.0) + .stopOn(InterruptedException.class, UnrecoverableS3OperationException.class) + .run("readRecordsContentStream", () -> { + if (isFirstLine) { + recordsFromS3 = 0; + selectObjectContent = selectClient.getRecordsContent(selectObjectContentRequest); + closer.register(selectObjectContent); + reader = new LineReader(selectObjectContent, lineDelimiter.getBytes(StandardCharsets.UTF_8)); + closer.register(reader); + isFirstLine = false; + } + try { + return reader.readLine(value); + } + catch (RuntimeException e) { + isFirstLine = true; + recordsFromS3 = 0; + if (e instanceof AmazonS3Exception) { + switch (((AmazonS3Exception) e).getStatusCode()) { + case HTTP_FORBIDDEN: + case HTTP_NOT_FOUND: + case HTTP_BAD_REQUEST: + throw new UnrecoverableS3OperationException(selectClient.getBucketName(), selectClient.getKeyName(), e); + } + } + throw e; + } + }); + } + catch (Exception e) { + throwIfInstanceOf(e, IOException.class); + throwIfUnchecked(e); + throw new RuntimeException(e); + } + } + + @Override + public synchronized boolean next(LongWritable key, Text value) + throws IOException + { + while (true) { + int bytes = readLine(value); + if (bytes <= 0) { + if (!selectClient.isRequestComplete()) { + throw new IOException("S3 Select request was incomplete as End Event was not received"); + } + return false; + } + recordsFromS3++; + if (recordsFromS3 > processedRecords) { + position += bytes; + processedRecords++; + key.set(processedRecords); + return true; + } + } + } + + @Override + public LongWritable createKey() + { + return new LongWritable(); + } + + @Override + public Text createValue() + { + return new Text(); + } + + @Override + public long getPos() + { + return position; + } + + @Override + public void close() + throws IOException + { + closer.close(); + } + + @Override + public float getProgress() + { + return ((float) (position - start)) / (end - start); + } + + String getFieldDelimiter(Properties schema) + { + return schema.getProperty(FIELD_DELIM, schema.getProperty(SERIALIZATION_FORMAT)); + } + + /** + * This exception is for stopping retries for S3 Select calls that shouldn't be retried. + * For example, "Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403 ..." 
+ */ + @VisibleForTesting + static class UnrecoverableS3OperationException + extends RuntimeException + { + public UnrecoverableS3OperationException(String bucket, String key, Throwable cause) + { + // append bucket and key to the message + super(format("%s (Bucket: %s, Key: %s)", cause, bucket, key)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectPushdown.java new file mode 100644 index 00000000..2d4feebc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectPushdown.java @@ -0,0 +1,153 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableSet; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.ConnectorSession; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.io.compress.BZip2Codec; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.TextInputFormat; + +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; + +import static io.prestosql.plugin.hive.HiveSessionProperties.isS3SelectPushdownEnabled; +import static io.prestosql.plugin.hive.HiveUtil.getCompressionCodec; +import static io.prestosql.plugin.hive.HiveUtil.getDeserializerClassName; +import static io.prestosql.plugin.hive.HiveUtil.getInputFormatName; +import static org.apache.hadoop.hive.serde.serdeConstants.BIGINT_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.BOOLEAN_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.DATE_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.DECIMAL_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.INT_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.SMALLINT_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.STRING_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.TINYINT_TYPE_NAME; + +/** + * S3SelectPushdown uses Amazon S3 Select to push down queries to Amazon S3. This allows Presto to retrieve only a + * subset of data rather than retrieving the full S3 object thus improving Presto query performance. 
+ */ +public class S3SelectPushdown +{ + private static final Logger LOG = Logger.get(S3SelectPushdown.class); + private static final Set SUPPORTED_S3_PREFIXES = ImmutableSet.of("s3://", "s3a://", "s3n://"); + private static final Set SUPPORTED_SERDES = ImmutableSet.of(LazySimpleSerDe.class.getName()); + private static final Set SUPPORTED_INPUT_FORMATS = ImmutableSet.of(TextInputFormat.class.getName()); + + /* + * Double and Real Types lose precision. Thus, they are not pushed down to S3. Please use Decimal Type if push down is desired. + * + * Pushing down timestamp to s3select is problematic due to following reasons: + * 1) Presto bug: TIMESTAMP behaviour does not match sql standard (https://github.com/prestodb/presto/issues/7122) + * 2) Presto uses the timezone from client to convert the timestamp if no timezone is provided, however, s3select is a different service and this could lead to unexpected results. + * 3) ION SQL compare timestamps using precision, timestamps with different precisions are not equal even actually they present the same instant of time. This could lead to unexpected results. + */ + private static final Set SUPPORTED_COLUMN_TYPES = ImmutableSet.of( + BOOLEAN_TYPE_NAME, + INT_TYPE_NAME, + TINYINT_TYPE_NAME, + SMALLINT_TYPE_NAME, + BIGINT_TYPE_NAME, + STRING_TYPE_NAME, + DECIMAL_TYPE_NAME, + DATE_TYPE_NAME); + + private S3SelectPushdown() {} + + private static boolean isSerdeSupported(Properties schema) + { + String serdeName = getDeserializerClassName(schema); + return SUPPORTED_SERDES.contains(serdeName); + } + + private static boolean isInputFormatSupported(Properties schema) + { + String inputFormat = getInputFormatName(schema); + return SUPPORTED_INPUT_FORMATS.contains(inputFormat); + } + + public static boolean isCompressionCodecSupported(InputFormat inputFormat, Path path) + { + if (inputFormat instanceof TextInputFormat) { + return getCompressionCodec((TextInputFormat) inputFormat, path) + .map(codec -> (codec instanceof GzipCodec) || (codec instanceof BZip2Codec)) + .orElse(true); + } + + return false; + } + + private static boolean areColumnTypesSupported(List columns) + { + if (columns == null || columns.isEmpty()) { + return false; + } + + for (Column column : columns) { + String type = column.getType().getHiveTypeName().toString(); + if (column.getType().getTypeInfo() instanceof DecimalTypeInfo) { + // skip precision and scale when check decimal type + type = DECIMAL_TYPE_NAME; + } + if (!SUPPORTED_COLUMN_TYPES.contains(type)) { + return false; + } + } + + return true; + } + + private static boolean isS3Storage(String path) + { + return SUPPORTED_S3_PREFIXES.stream().anyMatch(path::startsWith); + } + + static boolean shouldEnablePushdownForTable(ConnectorSession session, Table table, String path, Optional optionalPartition) + { + if (!isS3SelectPushdownEnabled(session)) { + return false; + } + + if (path == null) { + return false; + } + + // Hive table partitions could be on different storages, + // as a result, we have to check each individual optionalPartition + Properties schema = optionalPartition + .map(partition -> MetastoreUtil.getHiveSchema(partition, table)) + .orElseGet(() -> MetastoreUtil.getHiveSchema(table)); + return shouldEnablePushdownForTable(table, path, schema); + } + + private static boolean shouldEnablePushdownForTable(Table table, String path, Properties schema) + { + return isS3Storage(path) && + isSerdeSupported(schema) && + isInputFormatSupported(schema) && + areColumnTypesSupported(table.getDataColumns()); + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursor.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursor.java new file mode 100644 index 00000000..4e030a6b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursor.java @@ -0,0 +1,231 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.RecordReader; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.isNullOrEmpty; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMNS; +import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMN_TYPES; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_DDL; + +class S3SelectRecordCursor + extends GenericHiveRecordCursor +{ + private static final String THRIFT_STRUCT = "struct"; + private static final String START_STRUCT = "{"; + private static final String END_STRUCT = "}"; + private static final String FIELD_SEPARATOR = ","; + + public S3SelectRecordCursor( + Configuration configuration, + Path path, + RecordReader recordReader, + long totalBytes, + Properties splitSchema, + List columns, + TypeManager typeManager) + { + super(configuration, path, recordReader, totalBytes, updateSplitSchema(splitSchema, columns), columns, typeManager); + } + + // since s3select only returns the required column, not the whole columns + // we need to update the split schema to include only the required columns + // otherwise, Serde could not deserialize output from s3select to row data correctly + static Properties updateSplitSchema(Properties splitSchema, List columns) + { + requireNonNull(splitSchema, "splitSchema is null"); + requireNonNull(columns, "columns is null"); + // clone split properties for update so as not to affect the original one + Properties updatedSchema = new Properties(); + updatedSchema.putAll(splitSchema); + updatedSchema.setProperty(LIST_COLUMNS, buildColumns(columns)); + updatedSchema.setProperty(LIST_COLUMN_TYPES, buildColumnTypes(columns)); + ThriftTable thriftTable = parseThriftDdl(splitSchema.getProperty(SERIALIZATION_DDL)); + updatedSchema.setProperty(SERIALIZATION_DDL, + thriftTableToDdl(pruneThriftTable(thriftTable, columns))); + return updatedSchema; + } + + private static String buildColumns(List columns) + { + if (columns == null || 
columns.isEmpty()) { + return ""; + } + return columns.stream() + .map(HiveColumnHandle::getName) + .collect(Collectors.joining(",")); + } + + private static String buildColumnTypes(List columns) + { + if (columns == null || columns.isEmpty()) { + return ""; + } + return columns.stream() + .map(column -> column.getHiveType().getTypeInfo().getTypeName()) + .collect(Collectors.joining(",")); + } + + /** + * Parse Thrift description of a table schema. Examples: + *
+ * <ul>
+ * <li>struct article { varchar article varchar author date date_pub int quantity}</li>
+ * <li>struct article { varchar article, varchar author, date date_pub, int quantity }</li>
+ * <li>struct article { varchar article, varchar author, date date_pub, int quantity}</li>
+ * </ul>
+ */ + private static ThriftTable parseThriftDdl(String ddl) + { + if (isNullOrEmpty(ddl)) { + return null; + } + String[] parts = ddl.trim().split("\\s+"); + checkArgument(parts.length >= 5, "Invalid Thrift DDL " + ddl); + checkArgument(THRIFT_STRUCT.equals(parts[0]), "Thrift DDL should start with " + THRIFT_STRUCT); + ThriftTable thriftTable = new ThriftTable(); + thriftTable.setTableName(parts[1]); + checkArgument(START_STRUCT.equals(parts[2]), "Invalid Thrift DDL " + ddl); + checkArgument(parts[parts.length - 1].endsWith(END_STRUCT), "Invalid Thrift DDL " + ddl); + String lastColumnNameWithEndStruct = parts[parts.length - 1]; + parts[parts.length - 1] = lastColumnNameWithEndStruct.substring(0, lastColumnNameWithEndStruct.length() - 1); + List fields = new ArrayList<>(); + for (int i = 3; i < parts.length - 1; i += 2) { + ThriftField thriftField = new ThriftField(); + thriftField.setType(parts[i]); + String columnNameWithFieldSeparator = parts[i + 1]; + if (columnNameWithFieldSeparator.endsWith(FIELD_SEPARATOR)) { + parts[i + 1] = columnNameWithFieldSeparator.substring(0, columnNameWithFieldSeparator.length() - 1); + } + thriftField.setName(parts[i + 1]); + fields.add(thriftField); + } + thriftTable.setFields(fields); + + return thriftTable; + } + + private static ThriftTable pruneThriftTable(ThriftTable thriftTable, List columns) + { + if (thriftTable == null) { + return null; + } + List fields = thriftTable.getFields(); + if (fields == null || fields.isEmpty()) { + return thriftTable; + } + Set columnNames = columns.stream() + .map(HiveColumnHandle::getName) + .collect(toImmutableSet()); + List filteredFields = fields.stream() + .filter(field -> columnNames.contains(field.getName())) + .collect(toList()); + thriftTable.setFields(filteredFields); + + return thriftTable; + } + + private static String thriftTableToDdl(ThriftTable thriftTable) + { + if (thriftTable == null) { + return ""; + } + List fields = thriftTable.getFields(); + if (fields == null || fields.isEmpty()) { + return ""; + } + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append(THRIFT_STRUCT) + .append(" ") + .append(thriftTable.getTableName()) + .append(" ") + .append(START_STRUCT); + stringBuilder.append(fields.stream() + .map(field -> " " + field.getType() + " " + field.getName()) + .collect(Collectors.joining(","))); + stringBuilder.append(END_STRUCT); + + return stringBuilder.toString(); + } + + private static class ThriftField + { + private String type; + private String name; + + private String getType() + { + return type; + } + + private void setType(String type) + { + checkArgument(!isNullOrEmpty(type), "type is null or empty string"); + this.type = type; + } + + private String getName() + { + return name; + } + + private void setName(String name) + { + requireNonNull(name, "name is null"); + this.name = name; + } + } + + private static class ThriftTable + { + private String tableName; + private List fields; + + private String getTableName() + { + return tableName; + } + + private void setTableName(String tableName) + { + checkArgument(!isNullOrEmpty(tableName), "tableName is null or empty string"); + this.tableName = tableName; + } + + private List getFields() + { + return fields; + } + + private void setFields(List fields) + { + requireNonNull(fields, "fields is null"); + this.fields = fields; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursorProvider.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursorProvider.java new file mode 100644 index 00000000..ce55a2f6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/S3SelectRecordCursorProvider.java @@ -0,0 +1,95 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.s3.PrestoS3ClientFactory; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; + +import javax.inject.Inject; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; + +import static io.prestosql.plugin.hive.HiveUtil.getDeserializerClassName; +import static java.util.Objects.requireNonNull; + +public class S3SelectRecordCursorProvider + implements HiveRecordCursorProvider +{ + private static final Set CSV_SERDES = ImmutableSet.of(LazySimpleSerDe.class.getName()); + private final HdfsEnvironment hdfsEnvironment; + private final HiveConfig hiveConfig; + private final PrestoS3ClientFactory s3ClientFactory; + + @Inject + public S3SelectRecordCursorProvider( + HdfsEnvironment hdfsEnvironment, + HiveConfig hiveConfig, + PrestoS3ClientFactory s3ClientFactory) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.hiveConfig = requireNonNull(hiveConfig, "hiveConfig is null"); + this.s3ClientFactory = requireNonNull(s3ClientFactory, "s3ClientFactory is null"); + } + + @Override + public Optional createRecordCursor( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + TypeManager typeManager, + boolean s3SelectPushdownEnabled, + Map customSplitInfo) + { + if (!s3SelectPushdownEnabled) { + return Optional.empty(); + } + + try { + this.hdfsEnvironment.getFileSystem(session.getUser(), path, configuration); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed getting FileSystem: " + path, e); + } + + String serdeName = getDeserializerClassName(schema); + if (CSV_SERDES.contains(serdeName)) { + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager); + String ionSqlQuery = queryBuilder.buildSql(columns, effectivePredicate); + S3SelectLineRecordReader recordReader = new S3SelectCsvRecordReader(configuration, hiveConfig, path, start, length, schema, ionSqlQuery, s3ClientFactory); + return Optional.of(new S3SelectRecordCursor<>(configuration, path, 
recordReader, length, schema, columns, typeManager)); + } + + // unsupported serdes + return Optional.empty(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SnapshotTempFileWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SnapshotTempFileWriter.java new file mode 100644 index 00000000..49e4cc73 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SnapshotTempFileWriter.java @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.airlift.log.Logger; +import io.prestosql.orc.OrcDataSink; +import io.prestosql.plugin.hive.util.TempFileWriter; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.type.Type; +import org.openjdk.jol.info.ClassLayout; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; + +public class SnapshotTempFileWriter + implements HiveFileWriter +{ + private static final Logger log = Logger.get(SnapshotTempFileWriter.class); + private static final int INSTANCE_SIZE = ClassLayout.parseClass(SnapshotTempFileWriter.class).instanceSize(); + + private final TempFileWriter writer; + + public SnapshotTempFileWriter( + OrcDataSink dataSink, + List types) + { + writer = new TempFileWriter(types, dataSink); + } + + @Override + public long getWrittenBytes() + { + return writer.getWrittenBytes(); + } + + @Override + public long getSystemMemoryUsage() + { + return INSTANCE_SIZE + writer.getRetainedBytes(); + } + + @Override + public void appendRows(Page page) + { + writer.writePage(page); + } + + @Override + public void commit() + { + try { + writer.close(); + } + catch (IOException | UncheckedIOException e) { + // DO NOT delete the file. A newly schedule task may be recreating this file. + throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e); + } + } + + @Override + public void rollback() + { + try { + writer.close(); + } + catch (Exception e) { + // DO NOT delete the file. A newly schedule task may be recreating this file. + // Don't need to throw the exception either. This is part of the cancel-to-resume task. 
+ log.debug(e, "Error rolling back write to Hive"); + } + } + + @Override + public long getValidationCpuNanos() + { + return 0; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("writer", writer) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SortingFileWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SortingFileWriter.java new file mode 100644 index 00000000..737677e9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SortingFileWriter.java @@ -0,0 +1,332 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.io.Closer; +import io.airlift.log.Logger; +import io.airlift.units.DataSize; +import io.prestosql.orc.OrcDataSink; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.plugin.hive.orc.HdfsOrcDataSource; +import io.prestosql.plugin.hive.util.MergingPageIterator; +import io.prestosql.plugin.hive.util.SortBuffer; +import io.prestosql.plugin.hive.util.TempFileReader; +import io.prestosql.plugin.hive.util.TempFileWriter; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageSorter; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.openjdk.jol.info.ClassLayout; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.PriorityQueue; +import java.util.Queue; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Consumer; +import java.util.stream.IntStream; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_CLOSE_ERROR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_WRITER_DATA_ERROR; +import static java.lang.Math.min; +import static java.util.Comparator.comparing; +import static java.util.Objects.requireNonNull; + +public class SortingFileWriter + implements HiveFileWriter +{ + private static final Logger log = Logger.get(SortingFileWriter.class); + + private static final int INSTANCE_SIZE = ClassLayout.parseClass(SortingFileWriter.class).instanceSize(); + + private final FileSystem fileSystem; + private final Path tempFilePrefix; + private final int maxOpenTempFiles; + private final List types; + private final List sortFields; + 
private final List sortOrders; + private final HiveFileWriter outputWriter; + private final SortBuffer sortBuffer; + private final TempFileSinkFactory tempFileSinkFactory; + private final Queue tempFiles = new PriorityQueue<>(comparing(TempFile::getSize)); + private final AtomicLong nextFileId = new AtomicLong(); + + public SortingFileWriter( + FileSystem fileSystem, + Path tempFilePrefix, + HiveFileWriter outputWriter, + DataSize maxMemory, + int maxOpenTempFiles, + List types, + List sortFields, + List sortOrders, + PageSorter pageSorter, + TempFileSinkFactory tempFileSinkFactory) + { + checkArgument(maxOpenTempFiles >= 2, "maxOpenTempFiles must be at least two"); + this.fileSystem = requireNonNull(fileSystem, "fileSystem is null"); + this.tempFilePrefix = requireNonNull(tempFilePrefix, "tempFilePrefix is null"); + this.maxOpenTempFiles = maxOpenTempFiles; + this.types = ImmutableList.copyOf(requireNonNull(types, "types is null")); + this.sortFields = ImmutableList.copyOf(requireNonNull(sortFields, "sortFields is null")); + this.sortOrders = ImmutableList.copyOf(requireNonNull(sortOrders, "sortOrders is null")); + this.outputWriter = requireNonNull(outputWriter, "outputWriter is null"); + this.sortBuffer = new SortBuffer(maxMemory, types, sortFields, sortOrders, pageSorter); + this.tempFileSinkFactory = tempFileSinkFactory; + } + + @Override + public long getWrittenBytes() + { + return outputWriter.getWrittenBytes(); + } + + @Override + public long getSystemMemoryUsage() + { + return INSTANCE_SIZE + sortBuffer.getRetainedBytes(); + } + + @Override + public void appendRows(Page page) + { + if (!sortBuffer.canAdd(page)) { + flushToTempFile(); + } + sortBuffer.add(page); + } + + @Override + public void commit() + { + if (!sortBuffer.isEmpty()) { + // skip temporary files entirely if the total output size is small + if (tempFiles.isEmpty()) { + sortBuffer.flushTo(outputWriter::appendRows); + outputWriter.commit(); + return; + } + + flushToTempFile(); + } + + try { + writeSorted(); + outputWriter.commit(); + } + catch (UncheckedIOException e) { + throw new PrestoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e); + } + cleanupTempFiles(); + } + + @Override + public void rollback() + { + cleanupTempFiles(); + + outputWriter.rollback(); + } + + private void cleanupTempFiles() + { + for (TempFile file : tempFiles) { + cleanupFile(file.getPath()); + } + } + + @Override + public long getValidationCpuNanos() + { + return outputWriter.getValidationCpuNanos(); + } + + public Path getTempFilePrefix() + { + return tempFilePrefix; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("tempFilePrefix", tempFilePrefix) + .add("outputWriter", outputWriter) + .toString(); + } + + @Override + public Optional getVerificationTask() + { + return outputWriter.getVerificationTask(); + } + + @Override + public ImmutableList getExtraPartitionFiles() + { + return outputWriter.getExtraPartitionFiles(); + } + + private void flushToTempFile() + { + writeTempFile(writer -> sortBuffer.flushTo(writer::writePage)); + } + + // TODO: change connector SPI to make this resumable and have memory tracking + private void writeSorted() + { + combineFiles(); + + mergeFiles(tempFiles, outputWriter::appendRows); + } + + private void combineFiles() + { + while (tempFiles.size() > maxOpenTempFiles) { + int count = min(maxOpenTempFiles, tempFiles.size() - (maxOpenTempFiles - 1)); + + List smallestFiles = IntStream.range(0, count) + .mapToObj(i -> tempFiles.poll()) + 
.collect(toImmutableList()); + + writeTempFile(writer -> mergeFiles(smallestFiles, writer::writePage)); + } + } + + private void mergeFiles(Iterable files, Consumer consumer) + { + try (Closer closer = Closer.create()) { + Collection> iterators = new ArrayList<>(); + + for (TempFile tempFile : files) { + Path file = tempFile.getPath(); + FileStatus fileStatus = fileSystem.getFileStatus(file); + OrcDataSource dataSource = new HdfsOrcDataSource( + new OrcDataSourceId(file.toString()), + fileStatus.getLen(), + new DataSize(1, MEGABYTE), + new DataSize(8, MEGABYTE), + new DataSize(8, MEGABYTE), + false, + fileSystem.open(file), + new FileFormatDataSourceStats(), + fileStatus.getModificationTime()); + TempFileReader reader = new TempFileReader(types, dataSource); + // Closing the reader also closes the data source + closer.register(reader); + iterators.add(reader); + } + + new MergingPageIterator(iterators, types, sortFields, sortOrders) + .forEachRemaining(consumer); + + for (TempFile tempFile : files) { + Path file = tempFile.getPath(); + fileSystem.delete(file, false); + if (fileSystem.exists(file)) { + throw new IOException("Failed to delete temporary file: " + file); + } + } + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private void writeTempFile(Consumer consumer) + { + Path tempFile = getTempFileName(); + + try (TempFileWriter writer = new TempFileWriter(types, tempFileSinkFactory.createSink(fileSystem, tempFile))) { + consumer.accept(writer); + writer.close(); + tempFiles.add(new TempFile(tempFile, writer.getWrittenBytes())); + } + catch (IOException | UncheckedIOException e) { + cleanupFile(tempFile); + throw new PrestoException(HIVE_WRITER_DATA_ERROR, "Failed to write temporary file: " + tempFile, e); + } + } + + private void cleanupFile(Path file) + { + try { + fileSystem.delete(file, false); + if (fileSystem.exists(file)) { + throw new IOException("Delete failed"); + } + } + catch (IOException e) { + log.warn(e, "Failed to delete temporary file: " + file); + } + } + + private Path getTempFileName() + { + return new Path(tempFilePrefix + "." + nextFileId.getAndIncrement()); + } + + private static class TempFile + { + private final Path path; + private final long size; + + public TempFile(Path path, long size) + { + checkArgument(size >= 0, "size is negative"); + this.path = requireNonNull(path, "path is null"); + this.size = size; + } + + public Path getPath() + { + return path; + } + + public long getSize() + { + return size; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("path", path) + .add("size", size) + .toString(); + } + } + + public interface TempFileSinkFactory + { + OrcDataSink createSink(FileSystem fileSystem, Path path) + throws IOException; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SyncPartitionMetadataProcedure.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SyncPartitionMetadataProcedure.java new file mode 100644 index 00000000..a099aa75 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/SyncPartitionMetadataProcedure.java @@ -0,0 +1,254 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Sets; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.classloader.ThreadContextClassLoader; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.procedure.Procedure; +import io.prestosql.spi.procedure.Procedure.Argument; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import javax.inject.Inject; +import javax.inject.Provider; + +import java.io.IOException; +import java.lang.invoke.MethodHandle; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Supplier; +import java.util.stream.Stream; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; +import static io.prestosql.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME; +import static io.prestosql.plugin.hive.HivePartitionManager.extractPartitionValues; +import static io.prestosql.spi.StandardErrorCode.INVALID_PROCEDURE_ARGUMENT; +import static io.prestosql.spi.block.MethodHandleUtil.methodHandle; +import static io.prestosql.spi.type.StandardTypes.VARCHAR; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; + +public class SyncPartitionMetadataProcedure + implements Provider +{ + public enum SyncMode + { + ADD, DROP, FULL + } + + private static final MethodHandle SYNC_PARTITION_METADATA = methodHandle( + SyncPartitionMetadataProcedure.class, + "syncPartitionMetadata", + ConnectorSession.class, + String.class, + String.class, + String.class); + + private final Supplier hiveMetadataFactory; + private final HdfsEnvironment hdfsEnvironment; + + @Inject + public SyncPartitionMetadataProcedure( + Supplier hiveMetadataFactory, + HdfsEnvironment hdfsEnvironment) + { + this.hiveMetadataFactory = requireNonNull(hiveMetadataFactory, "hiveMetadataFactory is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + } + + @Override + public Procedure get() + { + return new Procedure( + "system", + "sync_partition_metadata", + ImmutableList.of( + new Argument("schema_name", VARCHAR), + new Argument("table_name", VARCHAR), + new Argument("mode", VARCHAR)), + SYNC_PARTITION_METADATA.bindTo(this)); + } + + public void syncPartitionMetadata(ConnectorSession session, String schemaName, String tableName, String mode) + { + try (ThreadContextClassLoader ignored = new 
ThreadContextClassLoader(getClass().getClassLoader())) { + doSyncPartitionMetadata(session, schemaName, tableName, mode); + } + } + + private void doSyncPartitionMetadata(ConnectorSession session, String schemaName, String tableName, String mode) + { + SyncMode syncMode = toSyncMode(mode); + HdfsContext hdfsContext = new HdfsContext(session, schemaName, tableName); + HiveIdentity identity = new HiveIdentity(session); + SemiTransactionalHiveMetastore metastore = ((HiveMetadata) hiveMetadataFactory.get()).getMetastore(); + SchemaTableName schemaTableName = new SchemaTableName(schemaName, tableName); + + Table table = metastore.getTable(identity, schemaName, tableName) + .orElseThrow(() -> new TableNotFoundException(schemaTableName)); + if (table.getPartitionColumns().isEmpty()) { + throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "Table is not partitioned: " + schemaTableName); + } + Path tableLocation = new Path(table.getStorage().getLocation()); + + Set partitionsToAdd; + Set partitionsToDrop; + + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(hdfsContext, tableLocation); + List partitionsInMetastore = metastore.getPartitionNames(identity, schemaName, tableName) + .orElseThrow(() -> new TableNotFoundException(schemaTableName)); + List partitionsInFileSystem = listDirectory(fileSystem, fileSystem.getFileStatus(tableLocation), table.getPartitionColumns(), table.getPartitionColumns().size()).stream() + .map(fileStatus -> fileStatus.getPath().toUri()) + .map(uri -> tableLocation.toUri().relativize(uri).getPath()) + .collect(toImmutableList()); + + // partitions in file system but not in metastore + partitionsToAdd = difference(partitionsInFileSystem, partitionsInMetastore); + // partitions in metastore but not in file system + partitionsToDrop = difference(partitionsInMetastore, partitionsInFileSystem); + } + catch (IOException e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, e); + } + + syncPartitions(partitionsToAdd, partitionsToDrop, syncMode, metastore, session, table); + } + + private static List listDirectory(FileSystem fileSystem, FileStatus current, List partitionColumns, int depth) + { + if (depth == 0) { + return ImmutableList.of(current); + } + + try { + return Stream.of(fileSystem.listStatus(current.getPath())) + .filter(fileStatus -> isValidPartitionPath(fileSystem, fileStatus, partitionColumns.get(partitionColumns.size() - depth))) + .flatMap(directory -> listDirectory(fileSystem, directory, partitionColumns, depth - 1).stream()) + .collect(toImmutableList()); + } + catch (IOException e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, e); + } + } + + private static boolean isValidPartitionPath(FileSystem fileSystem, FileStatus file, Column column) + { + try { + Path path = file.getPath(); + String prefix = column.getName() + '='; + return fileSystem.isDirectory(path) && path.getName().startsWith(prefix); + } + catch (IOException e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, e); + } + } + + // calculate relative complement of set b with respect to set a + private static Set difference(List a, List b) + { + return Sets.difference(new HashSet<>(a), new HashSet<>(b)); + } + + private static void syncPartitions( + Set partitionsToAdd, + Set partitionsToDrop, + SyncMode syncMode, + SemiTransactionalHiveMetastore metastore, + ConnectorSession session, + Table table) + { + if (syncMode == SyncMode.ADD || syncMode == SyncMode.FULL) { + addPartitions(metastore, session, table, partitionsToAdd); + } + if (syncMode == SyncMode.DROP || syncMode == 
SyncMode.FULL) { + dropPartitions(metastore, session, table, partitionsToDrop); + } + metastore.commit(); + } + + private static void addPartitions( + SemiTransactionalHiveMetastore metastore, + ConnectorSession session, + Table table, + Set partitions) + { + for (String name : partitions) { + metastore.addPartition( + session, + table.getDatabaseName(), + table.getTableName(), + buildPartitionObject(session, table, name), + new Path(table.getStorage().getLocation(), name), + PartitionStatistics.empty(), + HiveACIDWriteType.NONE); + } + } + + private static void dropPartitions( + SemiTransactionalHiveMetastore metastore, + ConnectorSession session, + Table table, + Set partitions) + { + for (String name : partitions) { + metastore.dropPartition( + session, + table.getDatabaseName(), + table.getTableName(), + extractPartitionValues(name)); + } + } + + private static Partition buildPartitionObject(ConnectorSession session, Table table, String partitionName) + { + return Partition.builder() + .setDatabaseName(table.getDatabaseName()) + .setTableName(table.getTableName()) + .setColumns(table.getDataColumns()) + .setValues(extractPartitionValues(partitionName)) + .setParameters(ImmutableMap.of(PRESTO_QUERY_ID_NAME, session.getQueryId())) + .withStorage(storage -> storage + .setStorageFormat(table.getStorage().getStorageFormat()) + .setLocation(new Path(table.getStorage().getLocation(), partitionName).toString()) + .setBucketProperty(table.getStorage().getBucketProperty()) + .setSerdeParameters(table.getStorage().getSerdeParameters())) + .build(); + } + + private static SyncMode toSyncMode(String mode) + { + try { + return SyncMode.valueOf(mode.toUpperCase(ENGLISH)); + } + catch (IllegalArgumentException e) { + throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "Invalid partition metadata sync mode: " + mode); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TableOfflineException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TableOfflineException.java new file mode 100644 index 00000000..a192ab67 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TableOfflineException.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; + +import static com.google.common.base.Strings.isNullOrEmpty; +import static java.util.Objects.requireNonNull; + +public class TableOfflineException + extends PrestoException +{ + private final SchemaTableName tableName; + + public TableOfflineException(SchemaTableName tableName, boolean forPresto, String offlineMessage) + { + super(HiveErrorCode.HIVE_TABLE_OFFLINE, formatMessage(tableName, forPresto, offlineMessage)); + this.tableName = requireNonNull(tableName, "tableName is null"); + } + + public SchemaTableName getTableName() + { + return tableName; + } + + private static String formatMessage(SchemaTableName tableName, boolean forPresto, String offlineMessage) + { + StringBuilder resultBuilder = new StringBuilder() + .append("Table '").append(tableName).append("'") + .append(" is offline"); + if (forPresto) { + resultBuilder.append(" for Presto"); + } + if (!isNullOrEmpty(offlineMessage)) { + resultBuilder.append(": ").append(offlineMessage); + } + return resultBuilder.toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TransactionalMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TransactionalMetadata.java new file mode 100644 index 00000000..62a58405 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TransactionalMetadata.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.connector.ConnectorMetadata; + +public interface TransactionalMetadata + extends ConnectorMetadata +{ + void commit(); + + void rollback(); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TypeTranslator.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TypeTranslator.java new file mode 100644 index 00000000..ec8311d5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/TypeTranslator.java @@ -0,0 +1,22 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +public interface TypeTranslator +{ + TypeInfo translate(Type type); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumCleaner.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumCleaner.java new file mode 100644 index 00000000..c863f60a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumCleaner.java @@ -0,0 +1,208 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableSet; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.ValidReaderWriteIdList; +import org.apache.hadoop.hive.common.ValidWriteIdList; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.apache.hadoop.hive.ql.io.AcidUtils; + +import java.io.IOException; +import java.util.BitSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import static io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; + +public class VacuumCleaner +{ + private final VacuumTableInfoForCleaner vacuumTableInfo; + private static final Logger log = Logger.get(VacuumCleaner.class); + private final ScheduledExecutorService executorService; + private final long cleanupInterval; + private final HdfsEnvironment hdfsEnvironment; + private final HdfsEnvironment.HdfsContext hdfsContext; + private final Configuration configuration; + private final SemiTransactionalHiveMetastore metastore; + + private ScheduledFuture cleanupTask; + private Set lockIds; + + public VacuumCleaner(VacuumTableInfoForCleaner vacuumTableInfo, + SemiTransactionalHiveMetastore metastore, + HdfsEnvironment hdfsEnvironment, + HdfsEnvironment.HdfsContext hdfsContext) + { + this.vacuumTableInfo = vacuumTableInfo; + this.hdfsEnvironment = hdfsEnvironment; + this.hdfsContext = hdfsContext; + this.metastore = metastore; + + this.executorService = this.metastore.getVacuumExecutorService(); + this.cleanupInterval = this.metastore.getVacuumCleanupInterval(); + this.configuration = hdfsEnvironment.getConfiguration(hdfsContext, this.vacuumTableInfo.getDirectoryPath()); + } + + private void log(String message) + { + String logPrefix = String.format("%s.%s", vacuumTableInfo.getDbName(), vacuumTableInfo.getTableName()) + + ((vacuumTableInfo.getPartitionName().length() > 0) ? ("." 
+ vacuumTableInfo.getPartitionName()) : ""); + log.debug(logPrefix + " : " + message); + } + + public void submitVacuumCleanupTask() + { + log("Submitting task to Vacuum Cleaner thread pool"); + cleanupTask = executorService.scheduleAtFixedRate( + new CleanerTask(), + 0, + cleanupInterval, + TimeUnit.MILLISECONDS); + } + + void stopScheduledCleanupTask() + { + log("Vacuum cleanup task Finished"); + cleanupTask.cancel(true); + } + + private class CleanerTask + implements Runnable + { + private int maxCleanerAttempts = 5; + private int currentAttempt = 1; + private boolean stop; + + @Override + public void run() + { + log("Starting Vacuum cleaner task. Attempt: " + currentAttempt); + + try { + if (!readyToClean()) { + log("Waiting for readers to finish"); + currentAttempt++; + if (currentAttempt <= maxCleanerAttempts) { + return; + } + else { + log("Vacuum Cleaner task reached to the maximum number of attempts."); + } + } + else { + // All readers which had started before vacuum operation + // have been finished. Now we are ready to clean up. + String fullTableName = vacuumTableInfo.getDbName() + "." + vacuumTableInfo.getTableName(); + long highestWriteId = vacuumTableInfo.getMaxId(); + final ValidWriteIdList validWriteIdList = (highestWriteId > 0) + ? new ValidReaderWriteIdList(fullTableName, new long[0], new BitSet(), highestWriteId) + : new ValidReaderWriteIdList(); + hdfsEnvironment.doAs(hdfsContext.getIdentity().getUser(), () -> { + removeFiles(validWriteIdList); + }); + } + stop = true; + } + catch (Exception e) { + log.info("Exception in Vacuum cleanup: " + e.toString()); + stop = true; + } + finally { + if (stop) { + stopScheduledCleanupTask(); + } + } + } + + private void removeFiles(ValidWriteIdList writeIdList) + { + FileSystem fileSystem = null; + List filesToDelete; + try { + AcidUtils.Directory dir = AcidUtils.getAcidState( + vacuumTableInfo.getDirectoryPath(), + configuration, + writeIdList); + filesToDelete = dir.getObsolete().stream() + .map(fs -> fs.getPath()) + .collect(Collectors.toList()); + if (filesToDelete.size() < 1) { + log("No files to delete"); + return; + } + fileSystem = filesToDelete.get(0).getFileSystem(configuration); + } + catch (IOException e) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, "Failure while getting file system: ", e); + } + for (Path filePath : filesToDelete) { + log(String.format("Removing directory on path : %s", filePath.toString())); + try { + fileSystem.delete(filePath, true); + } + catch (IOException e) { + // Exception in cleaning one directory should not stop clean up of other directories. + // Therefore, ignoring this exception. + log(String.format("Directory %s deletion failed: %s", filePath, e.getMessage())); + } + } + } + + // Checks if there is any reader started before vacuum operation. + private boolean readyToClean() + { + // Get list of locks taken on given table and partition + ShowLocksResponse response = metastore.showLocks(vacuumTableInfo); + // Only wait for release of those locks which had been + // taken before the time when we had visited this for first time. 
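+            // Locks observed only in later checks belong to readers that started after the vacuum, so they are never added to lockIds.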
+ if (lockIds == null) { + lockIds = lockResponseToSet(response); + if (lockIds.size() < 1) { + log("No readers at present"); + return true; + } + } + log(String.format("Number of readers = %d", lockIds.size())); + Set currentLockIds = lockResponseToSet(response); + for (Long lockId : lockIds) { + if (currentLockIds.contains(lockId)) { + return false; + } + else { + lockIds.remove(lockId); + } + } + return true; + } + + private Set lockResponseToSet(ShowLocksResponse response) + { + if (response.getLocks() == null) { + return ImmutableSet.of(); + } + return response.getLocks().stream().map(e -> e.getLockid()).collect(Collectors.toSet()); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumEligibleTableCollector.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumEligibleTableCollector.java new file mode 100644 index 00000000..9305fca9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumEligibleTableCollector.java @@ -0,0 +1,306 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.ConnectorVacuumTableInfo; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.ConnectorIdentity; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.common.ValidReaderWriteIdList; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.shims.HadoopShims; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +public class VacuumEligibleTableCollector +{ + private static VacuumEligibleTableCollector instance; + private final ScheduledExecutorService executorService; + private final Logger log = Logger.get(VacuumEligibleTableCollector.class); + + private SemiTransactionalHiveMetastore metastore; + private HdfsEnvironment hdfsEnvironment; + private int vacuumDeltaNumThreshold; + private double vacuumDeltaPercentThreshold; + private List vacuumTableList = Collections.synchronizedList(new ArrayList<>()); + private Map inProgressVacuums = new ConcurrentHashMap<>(); + + private 
VacuumTableCollectorTask task = new VacuumTableCollectorTask(); + + private VacuumEligibleTableCollector(SemiTransactionalHiveMetastore metastore, + HdfsEnvironment hdfsEnvironment, + int vacuumDeltaNumThreshold, + double vacuumDeltaPercentThreshold, + ScheduledExecutorService executorService) + { + this.metastore = metastore; + this.hdfsEnvironment = hdfsEnvironment; + this.vacuumDeltaNumThreshold = vacuumDeltaNumThreshold; + this.vacuumDeltaPercentThreshold = vacuumDeltaPercentThreshold; + this.executorService = executorService; + } + + public static synchronized void createInstance(SemiTransactionalHiveMetastore metastore, + HdfsEnvironment hdfsEnvironment, int vacuumDeltaNumThreshold, double vacuumDeltaPercentThreshold, + ScheduledExecutorService executorService, long vacuumCollectorInterval) + { + if (instance == null) { + instance = new VacuumEligibleTableCollector(metastore, hdfsEnvironment, vacuumDeltaNumThreshold, vacuumDeltaPercentThreshold, executorService); + //Initialize the file systems + HdfsEnvironment.HdfsContext context = new HdfsEnvironment.HdfsContext(new ConnectorIdentity("openLooKeng", Optional.empty(), Optional.empty())); + try { + hdfsEnvironment.getFileSystem(context, new Path("/")); + } + catch (IOException e) { + } + // Also start preparing vacuumTableList + instance.executorService.scheduleAtFixedRate(instance.task, + 0, vacuumCollectorInterval, TimeUnit.MILLISECONDS); + } + } + + public static void finishVacuum(String schemaTable) + { + if (instance.inProgressVacuums.containsKey(schemaTable)) { + instance.inProgressVacuums.remove(schemaTable); + } + } + + static List getVacuumTableList(SemiTransactionalHiveMetastore metastore, + HdfsEnvironment hdfsEnvironment, int vacuumDeltaNumThreshold, + double vacuumDeltaPercentThreshold, ScheduledExecutorService executorService, long vacuumCollectorInterval) + { + createInstance(metastore, hdfsEnvironment, vacuumDeltaNumThreshold, vacuumDeltaPercentThreshold, executorService, vacuumCollectorInterval); + synchronized (instance) { + instance.metastore = metastore; + ImmutableList newList = ImmutableList.copyOf(instance.vacuumTableList); + instance.vacuumTableList.clear(); + return newList; + } + } + + private synchronized void addToVacuumTableList(List tablesForVacuum) + { + for (ConnectorVacuumTableInfo tableInfo : tablesForVacuum) { + if (!inProgressVacuums.containsKey(tableInfo.getSchemaTableName()) + && !vacuumTableList.contains(tableInfo)) { + inProgressVacuums.put(tableInfo.getSchemaTableName(), tableInfo); + vacuumTableList.add(tableInfo); + } + } + } + + private class VacuumTableCollectorTask + implements Runnable + { + private Table getTable(String schemaName, String tableName, SemiTransactionalHiveMetastore metastore) + { + HiveIdentity identity = new HiveIdentity(new ConnectorIdentity("openLooKeng", Optional.empty(), Optional.empty())); + Optional
table = metastore.getTable(identity, schemaName, tableName); + if (!table.isPresent() || table.get().getTableType().equals(TableType.VIRTUAL_VIEW.name())) { + throw new TableNotFoundException(new SchemaTableName(schemaName, tableName)); + } + return table.get(); + } + + @Override + public void run() + { + try { + collectTablesForVacuum(); + } + catch (Exception e) { + log.info("Error while collecting tables for auto-vacuum" + e.toString()); + } + } + + private void collectTablesForVacuum() + { + SemiTransactionalHiveMetastore taskMetastore = metastore; + List databases = taskMetastore.getAllDatabases(); + for (String database : databases) { + //Let each database get analyzed asynchronously. + executorService.submit(() -> { + try { + scanDatabase(database, taskMetastore); + } + catch (Exception e) { + log.info("Error while scanning database for vacuum" + e.toString()); + } + }); + } + } + + private void scanDatabase(String database, SemiTransactionalHiveMetastore taskMetastore) + { + Optional> tables = taskMetastore.getAllTables(database); + if (tables.isPresent()) { + List tablesForVacuum = new ArrayList<>(); + for (String table : tables.get()) { + if (inProgressVacuums.containsKey(appendTableWithSchema(database, table))) { + log.debug("Auto-vacuum is in progress for table: " + appendTableWithSchema(database, table)); + continue; + } + Table tableInfo = getTable(database, table, taskMetastore); + if (isTransactional(tableInfo)) { + ConnectorIdentity connectorIdentity = new ConnectorIdentity("openLooKeng", Optional.empty(), Optional.empty()); + HiveIdentity identity = new HiveIdentity(connectorIdentity); + Optional> partitions = taskMetastore.getPartitionNames(identity, database, table); + String tablePath = getLocation(tableInfo); + HdfsEnvironment.HdfsContext hdfsContext = new HdfsEnvironment.HdfsContext(connectorIdentity); + hdfsEnvironment.doAs("openLooKeng", () -> { + try { + // For Hive partitioned table + if (partitions.isPresent() && partitions.get().size() > 0) { + for (String partitionName : partitions.get()) { + String partitionPath = tablePath + "/" + partitionName; + boolean updated = determineVacuumType(partitionPath, database, table, tablesForVacuum, tableInfo.getParameters(), hdfsContext); + // If auto-vacuum condition satisfies for 1 partition, + // stop checking for other partitions. Since auto-vacuum runs + // on entire table. + if (updated) { + break; + } + } + } + else { + determineVacuumType(tablePath, database, table, tablesForVacuum, tableInfo.getParameters(), hdfsContext); + } + } + catch (Exception e) { + log.info("Exception while determining vacuum type for table: " + database + "." 
+ table + ": " + e.toString()); + } + }); + } + } + addToVacuumTableList(tablesForVacuum); + } + } + + private String getLocation(Table tableInfo) + { + return tableInfo.getStorage().getLocation(); + } + + private boolean isTransactional(Table tableInfo) + { + if (!tableInfo.getParameters().containsKey("transactional")) { + return false; + } + return tableInfo.getParameters().get("transactional").equalsIgnoreCase("true"); + } + + private boolean determineVacuumType(String path, String schema, String table, List tablesForVacuum, Map parameters, HdfsEnvironment.HdfsContext hdfsContext) + throws IOException + { + log.debug("Determining vacuum type for path: " + path); + Path tablePath = new Path(path); + AcidUtils.Directory dir = getDirectory(hdfsContext, tablePath); + FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, tablePath); + + boolean noBase = false; + Path base = dir.getBaseDirectory(); + long baseSize = 0; + if (base != null) { + baseSize += sumDirSize(fs, base); + } + List originals = dir.getOriginalFiles(); + for (HadoopShims.HdfsFileStatusWithId origStat : originals) { + baseSize += origStat.getFileStatus().getLen(); + } + long deltaSize = 0; + List deltas = dir.getCurrentDirectories(); + for (AcidUtils.ParsedDelta delta : deltas) { + deltaSize += sumDirSize(fs, delta.getPath()); + } + + logStats(schema, table, baseSize, deltaSize, dir.getCurrentDirectories().size()); + + if (baseSize == 0 && deltaSize > 0) { + noBase = true; + } + else { + boolean bigEnough = (float) deltaSize / (float) baseSize > vacuumDeltaPercentThreshold; + if (bigEnough) { + ConnectorVacuumTableInfo vacuumTable = new ConnectorVacuumTableInfo(appendTableWithSchema(schema, table), true); + tablesForVacuum.add(vacuumTable); + return true; + } + } + if (dir.getCurrentDirectories().size() > vacuumDeltaNumThreshold) { + boolean isFull = false; + //If insert-only table or first time vacuum, then it should be full vacuum. + if (AcidUtils.isInsertOnlyTable(parameters) || noBase) { + isFull = true; + } + ConnectorVacuumTableInfo vacuumTable = new ConnectorVacuumTableInfo(appendTableWithSchema(schema, table), isFull); + tablesForVacuum.add(vacuumTable); + return true; + } + return false; + } + + private void logStats(String schema, String table, long baseSize, long deltaSize, int numOfDeltaDir) + { + log.debug(String.format("Auto-vacuum stats for table '%s': baseSize='%d', delatSize='%d', numOfDeltaDir='%d'", + appendTableWithSchema(schema, table), + baseSize, + deltaSize, + numOfDeltaDir)); + } + + private String appendTableWithSchema(String schema, String table) + { + return schema + "." 
+ table; + } + + public AcidUtils.Directory getDirectory(HdfsEnvironment.HdfsContext hdfsContext, Path tablePath) + throws IOException + { + Configuration conf = hdfsEnvironment.getConfiguration(hdfsContext, tablePath); + ValidReaderWriteIdList validWriteIds = new ValidReaderWriteIdList(); + return AcidUtils.getAcidState(tablePath, conf, validWriteIds); + } + + private long sumDirSize(FileSystem fs, Path dir) + throws IOException + { + long size = 0; + FileStatus[] buckets = fs.listStatus(dir, FileUtils.HIDDEN_FILES_PATH_FILTER); + for (int i = 0; i < buckets.length; i++) { + size += buckets[i].getLen(); + } + return size; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumTableInfoForCleaner.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumTableInfoForCleaner.java new file mode 100644 index 00000000..ee737bc2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/VacuumTableInfoForCleaner.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive; + +import org.apache.hadoop.fs.Path; + +public class VacuumTableInfoForCleaner +{ + private final String dbName; + private final String tableName; + private final String partitionName; + private final long maxId; + private final Path directoryPath; + + public VacuumTableInfoForCleaner(String dbName, String tableName, String partitionName, long maxId, Path directoryPath) + { + this.dbName = dbName; + this.tableName = tableName; + this.partitionName = partitionName; + this.maxId = maxId; + this.directoryPath = directoryPath; + } + + public String getDbName() + { + return dbName; + } + + public String getTableName() + { + return tableName; + } + + public String getPartitionName() + { + return partitionName; + } + + public long getMaxId() + { + return maxId; + } + + public Path getDirectoryPath() + { + return directoryPath; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ViewAlreadyExistsException.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ViewAlreadyExistsException.java new file mode 100644 index 00000000..05a64523 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/ViewAlreadyExistsException.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; + +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static java.lang.String.format; + +public class ViewAlreadyExistsException + extends PrestoException +{ + private final SchemaTableName viewName; + + public ViewAlreadyExistsException(SchemaTableName viewName) + { + this(viewName, format("View already exists: '%s'", viewName)); + } + + public ViewAlreadyExistsException(SchemaTableName viewName, String message) + { + super(ALREADY_EXISTS, message); + this.viewName = viewName; + } + + public SchemaTableName getViewName() + { + return viewName; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteCompletedEvent.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteCompletedEvent.java new file mode 100644 index 00000000..31988bca --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteCompletedEvent.java @@ -0,0 +1,172 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.airlift.event.client.EventField; +import io.airlift.event.client.EventField.EventFieldMapping; +import io.airlift.event.client.EventType; + +import javax.annotation.Nullable; +import javax.annotation.concurrent.Immutable; + +import java.time.Instant; +import java.util.Map; + +import static java.util.Objects.requireNonNull; + +@Immutable +@EventType("WriteCompletedEvent") +public class WriteCompletedEvent +{ + private final String queryId; + private final String path; + private final String schemaName; + private final String tableName; + private final String partitionName; + private final String storageFormat; + private final String writerImplementation; + private final String prestoVersion; + private final String host; + private final String principal; + private final String environment; + private final Map sessionProperties; + private final Long bytes; + private final long rows; + private final Instant timestamp = Instant.now(); + + public WriteCompletedEvent( + String queryId, + String path, + String schemaName, + String tableName, + @Nullable String partitionName, + String storageFormat, + String writerImplementation, + String prestoVersion, + String serverAddress, + @Nullable String principal, + String environment, + Map sessionProperties, + @Nullable Long bytes, + long rows) + { + this.queryId = requireNonNull(queryId, "queryId is null"); + this.path = requireNonNull(path, "path is null"); + this.schemaName = requireNonNull(schemaName, "schemaName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.partitionName = partitionName; + this.storageFormat = requireNonNull(storageFormat, "storageFormat is null"); + this.writerImplementation = requireNonNull(writerImplementation, "writerImplementation is null"); + this.prestoVersion = requireNonNull(prestoVersion, "prestoVersion is null"); + this.host = requireNonNull(serverAddress, "serverAddress is null"); + this.principal = principal; + this.environment = requireNonNull(environment, "environment is null"); + this.sessionProperties = requireNonNull(sessionProperties, "sessionProperties is null"); + this.bytes = bytes; + this.rows = rows; + } + + @EventField + public String getQueryId() + { + return queryId; + } + + @EventField + public String getPath() + { + return path; + } + + @EventField + public String getSchemaName() + { + return schemaName; + } + + @EventField + public String getTableName() + { + return tableName; + } + + @Nullable + @EventField + public String getPartitionName() + { + return partitionName; + } + + @EventField + public String getStorageFormat() + { + return storageFormat; + } + + @EventField + public String getWriterImplementation() + { + return writerImplementation; + } + + @EventField + public String getPrestoVersion() + { + return prestoVersion; + } + + @EventField(fieldMapping = EventFieldMapping.HOST) + public String getHost() + { + return host; + } + + @Nullable + @EventField + public String getPrincipal() + { + return principal; + } + + @EventField + public String getEnvironment() + { + return environment; + } + + @EventField + public Map getSessionProperties() + { + return sessionProperties; + } + + @Nullable + @EventField + public Long getBytes() + { + return bytes; + } + + @EventField + public long getRows() + { + return rows; + } + + @EventField(fieldMapping = EventFieldMapping.TIMESTAMP) + public Instant getTimestamp() + { + return timestamp; + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteIdInfo.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteIdInfo.java new file mode 100644 index 00000000..e293a052 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/WriteIdInfo.java @@ -0,0 +1,90 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; + +public class WriteIdInfo +{ + private final long minWriteId; + private final long maxWriteId; + private final int statementId; + + @JsonCreator + public WriteIdInfo( + @JsonProperty("minWriteId") long minWriteId, + @JsonProperty("maxWriteId") long maxWriteId, + @JsonProperty("statementId") int statementId) + { + this.minWriteId = minWriteId; + this.maxWriteId = maxWriteId; + this.statementId = statementId; + } + + @JsonProperty + public long getMinWriteId() + { + return minWriteId; + } + + @JsonProperty + public long getMaxWriteId() + { + return maxWriteId; + } + + @JsonProperty + public int getStatementId() + { + return statementId; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + WriteIdInfo that = (WriteIdInfo) o; + return minWriteId == that.minWriteId && + maxWriteId == that.maxWriteId && + statementId == that.statementId; + } + + @Override + public int hashCode() + { + return Objects.hash(minWriteId, maxWriteId, statementId); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("minWriteId", minWriteId) + .add("maxWriteId", maxWriteId) + .add("statementId", statementId) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/AuthenticationModules.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/AuthenticationModules.java new file mode 100644 index 00000000..64d0b64c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/AuthenticationModules.java @@ -0,0 +1,145 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.authentication; + +import com.google.inject.Binder; +import com.google.inject.Key; +import com.google.inject.Module; +import com.google.inject.Provides; +import com.google.inject.Singleton; +import io.prestosql.plugin.hive.ForHdfs; +import io.prestosql.plugin.hive.ForHiveMetastore; +import io.prestosql.plugin.hive.HdfsConfigurationInitializer; + +import javax.inject.Inject; + +import static com.google.inject.Scopes.SINGLETON; +import static io.airlift.configuration.ConfigBinder.configBinder; + +public final class AuthenticationModules +{ + private AuthenticationModules() {} + + public static Module noHiveMetastoreAuthenticationModule() + { + return binder -> binder + .bind(HiveMetastoreAuthentication.class) + .to(NoHiveMetastoreAuthentication.class) + .in(SINGLETON); + } + + public static Module kerberosHiveMetastoreAuthenticationModule() + { + return new Module() + { + @Override + public void configure(Binder binder) + { + binder.bind(HiveMetastoreAuthentication.class) + .to(KerberosHiveMetastoreAuthentication.class) + .in(SINGLETON); + configBinder(binder).bindConfig(MetastoreKerberosConfig.class); + } + + @Inject + @Provides + @Singleton + @ForHiveMetastore + HadoopAuthentication createHadoopAuthentication(MetastoreKerberosConfig config, HdfsConfigurationInitializer updater) + { + String principal = config.getHiveMetastoreClientPrincipal(); + String keytabLocation = config.getHiveMetastoreClientKeytab(); + return createCachingKerberosHadoopAuthentication(principal, keytabLocation, updater); + } + }; + } + + public static Module noHdfsAuthenticationModule() + { + return binder -> binder + .bind(HdfsAuthentication.class) + .to(NoHdfsAuthentication.class) + .in(SINGLETON); + } + + public static Module simpleImpersonatingHdfsAuthenticationModule() + { + return binder -> { + binder.bind(Key.get(HadoopAuthentication.class, ForHdfs.class)) + .to(SimpleHadoopAuthentication.class); + binder.bind(HdfsAuthentication.class) + .to(ImpersonatingHdfsAuthentication.class) + .in(SINGLETON); + }; + } + + public static Module kerberosHdfsAuthenticationModule() + { + return new Module() + { + @Override + public void configure(Binder binder) + { + binder.bind(HdfsAuthentication.class) + .to(DirectHdfsAuthentication.class) + .in(SINGLETON); + configBinder(binder).bindConfig(HdfsKerberosConfig.class); + } + + @Inject + @Provides + @Singleton + @ForHdfs + HadoopAuthentication createHadoopAuthentication(HdfsKerberosConfig config, HdfsConfigurationInitializer updater) + { + String principal = config.getHdfsPrestoPrincipal(); + String keytabLocation = config.getHdfsPrestoKeytab(); + return createCachingKerberosHadoopAuthentication(principal, keytabLocation, updater); + } + }; + } + + public static Module kerberosImpersonatingHdfsAuthenticationModule() + { + return new Module() + { + @Override + public void configure(Binder binder) + { + binder.bind(HdfsAuthentication.class) + .to(ImpersonatingHdfsAuthentication.class) + .in(SINGLETON); + configBinder(binder).bindConfig(HdfsKerberosConfig.class); + } + + @Inject + @Provides + @Singleton + @ForHdfs + HadoopAuthentication createHadoopAuthentication(HdfsKerberosConfig config, HdfsConfigurationInitializer updater) + { + String principal = config.getHdfsPrestoPrincipal(); + String keytabLocation = config.getHdfsPrestoKeytab(); + return createCachingKerberosHadoopAuthentication(principal, keytabLocation, updater); + } + }; + } + + private static HadoopAuthentication createCachingKerberosHadoopAuthentication(String principal, String 
keytabLocation, HdfsConfigurationInitializer updater) + { + KerberosAuthentication kerberosAuthentication = new KerberosAuthentication(principal, keytabLocation); + KerberosHadoopAuthentication kerberosHadoopAuthentication = KerberosHadoopAuthentication.createKerberosHadoopAuthentication(kerberosAuthentication, updater); + return new CachingKerberosHadoopAuthentication(kerberosHadoopAuthentication); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/CachingKerberosHadoopAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/CachingKerberosHadoopAuthentication.java new file mode 100644 index 00000000..14d40c0f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/CachingKerberosHadoopAuthentication.java @@ -0,0 +1,73 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import org.apache.hadoop.security.UserGroupInformation; + +import javax.annotation.concurrent.GuardedBy; +import javax.security.auth.Subject; +import javax.security.auth.kerberos.KerberosTicket; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.security.UserGroupInformationShim.getSubject; + +public class CachingKerberosHadoopAuthentication + implements HadoopAuthentication +{ + private final KerberosHadoopAuthentication delegate; + + private final Object lock = new Object(); + @GuardedBy("lock") + private UserGroupInformation userGroupInformation; + @GuardedBy("lock") + private long nextRefreshTime = Long.MIN_VALUE; + + public CachingKerberosHadoopAuthentication(KerberosHadoopAuthentication delegate) + { + this.delegate = requireNonNull(delegate, "hadoopAuthentication is null"); + } + + @Override + public UserGroupInformation getUserGroupInformation() + { + synchronized (lock) { + if (refreshIsNeeded()) { + refreshUgi(); + } + return userGroupInformation; + } + } + + @GuardedBy("lock") + private void refreshUgi() + { + userGroupInformation = delegate.getUserGroupInformation(); + nextRefreshTime = calculateNextRefreshTime(userGroupInformation); + } + + @GuardedBy("lock") + private boolean refreshIsNeeded() + { + return nextRefreshTime < System.currentTimeMillis() || userGroupInformation == null; + } + + private static long calculateNextRefreshTime(UserGroupInformation userGroupInformation) + { + Subject subject = getSubject(userGroupInformation); + checkArgument(subject != null, "subject must be present in kerberos based UGI"); + KerberosTicket tgtTicket = KerberosTicketUtils.getTicketGrantingTicket(subject); + return KerberosTicketUtils.getRefreshTime(tgtTicket); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/DirectHdfsAuthentication.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/DirectHdfsAuthentication.java new file mode 100644 index 00000000..f05bb31e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/DirectHdfsAuthentication.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import io.prestosql.plugin.hive.ForHdfs; + +import javax.inject.Inject; + +import static java.util.Objects.requireNonNull; + +public class DirectHdfsAuthentication + implements HdfsAuthentication +{ + private final HadoopAuthentication hadoopAuthentication; + + @Inject + public DirectHdfsAuthentication(@ForHdfs HadoopAuthentication hadoopAuthentication) + { + this.hadoopAuthentication = requireNonNull(hadoopAuthentication); + } + + @Override + public R doAs(String user, GenericExceptionAction action) + throws E + { + return UserGroupInformationUtils.executeActionInDoAs(hadoopAuthentication.getUserGroupInformation(), action); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/GenericExceptionAction.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/GenericExceptionAction.java new file mode 100644 index 00000000..fddf1a8e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/GenericExceptionAction.java @@ -0,0 +1,20 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +public interface GenericExceptionAction +{ + R run() + throws E; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HadoopAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HadoopAuthentication.java new file mode 100644 index 00000000..accad3a9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HadoopAuthentication.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import org.apache.hadoop.security.UserGroupInformation; + +public interface HadoopAuthentication +{ + UserGroupInformation getUserGroupInformation(); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsAuthentication.java new file mode 100644 index 00000000..17f5131e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsAuthentication.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +public interface HdfsAuthentication +{ + R doAs(String user, GenericExceptionAction action) + throws E; + + default void doAs(String user, Runnable action) + { + doAs(user, () -> { + action.run(); + return null; + }); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsKerberosConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsKerberosConfig.java new file mode 100644 index 00000000..7c3600df --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HdfsKerberosConfig.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.authentication; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; + +import javax.validation.constraints.NotNull; + +public class HdfsKerberosConfig +{ + private String hdfsPrestoPrincipal; + private String hdfsPrestoKeytab; + + @NotNull + public String getHdfsPrestoPrincipal() + { + return hdfsPrestoPrincipal; + } + + @Config("hive.hdfs.presto.principal") + @ConfigDescription("Presto principal used to access HDFS") + public HdfsKerberosConfig setHdfsPrestoPrincipal(String hdfsPrestoPrincipal) + { + this.hdfsPrestoPrincipal = hdfsPrestoPrincipal; + return this; + } + + @NotNull + public String getHdfsPrestoKeytab() + { + return hdfsPrestoKeytab; + } + + @Config("hive.hdfs.presto.keytab") + @ConfigDescription("Presto keytab used to access HDFS") + public HdfsKerberosConfig setHdfsPrestoKeytab(String hdfsPrestoKeytab) + { + this.hdfsPrestoKeytab = hdfsPrestoKeytab; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveAuthenticationModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveAuthenticationModule.java new file mode 100644 index 00000000..c98697f3 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveAuthenticationModule.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
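HdfsKerberosConfig above simply exposes the two catalog properties hive.hdfs.presto.principal and hive.hdfs.presto.keytab through airlift @Config setters. The sketch below shows the programmatic equivalent; the principal and keytab path are placeholder values, not values taken from this patch.

    import io.prestosql.plugin.hive.authentication.HdfsKerberosConfig;

    public final class HdfsKerberosConfigSketch
    {
        private HdfsKerberosConfigSketch() {}

        public static void main(String[] args)
        {
            // Programmatic equivalent of setting the catalog properties
            //   hive.hdfs.presto.principal=presto-server/_HOST@EXAMPLE.COM
            //   hive.hdfs.presto.keytab=/etc/security/keytabs/presto.keytab
            HdfsKerberosConfig config = new HdfsKerberosConfig()
                    .setHdfsPrestoPrincipal("presto-server/_HOST@EXAMPLE.COM")
                    .setHdfsPrestoKeytab("/etc/security/keytabs/presto.keytab");

            System.out.println(config.getHdfsPrestoPrincipal());
            System.out.println(config.getHdfsPrestoKeytab());
        }
    }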
+ */ +package io.prestosql.plugin.hive.authentication; + +import com.google.inject.Binder; +import com.google.inject.Module; +import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.prestosql.plugin.hive.HiveConfig; + +import java.util.function.Predicate; + +import static io.airlift.configuration.ConditionalModule.installModuleIf; +import static io.prestosql.plugin.hive.authentication.AuthenticationModules.kerberosHdfsAuthenticationModule; +import static io.prestosql.plugin.hive.authentication.AuthenticationModules.kerberosHiveMetastoreAuthenticationModule; +import static io.prestosql.plugin.hive.authentication.AuthenticationModules.kerberosImpersonatingHdfsAuthenticationModule; +import static io.prestosql.plugin.hive.authentication.AuthenticationModules.noHdfsAuthenticationModule; +import static io.prestosql.plugin.hive.authentication.AuthenticationModules.noHiveMetastoreAuthenticationModule; +import static io.prestosql.plugin.hive.authentication.AuthenticationModules.simpleImpersonatingHdfsAuthenticationModule; + +public class HiveAuthenticationModule + extends AbstractConfigurationAwareModule +{ + @Override + protected void setup(Binder binder) + { + bindAuthenticationModule( + config -> config.getHiveMetastoreAuthenticationType() == HiveConfig.HiveMetastoreAuthenticationType.NONE, + noHiveMetastoreAuthenticationModule()); + + bindAuthenticationModule( + config -> config.getHiveMetastoreAuthenticationType() == HiveConfig.HiveMetastoreAuthenticationType.KERBEROS, + kerberosHiveMetastoreAuthenticationModule()); + + bindAuthenticationModule( + config -> noHdfsAuth(config) && !config.isHdfsImpersonationEnabled(), + noHdfsAuthenticationModule()); + + bindAuthenticationModule( + config -> noHdfsAuth(config) && config.isHdfsImpersonationEnabled(), + simpleImpersonatingHdfsAuthenticationModule()); + + bindAuthenticationModule( + config -> kerberosHdfsAuth(config) && !config.isHdfsImpersonationEnabled(), + kerberosHdfsAuthenticationModule()); + + bindAuthenticationModule( + config -> kerberosHdfsAuth(config) && config.isHdfsImpersonationEnabled(), + kerberosImpersonatingHdfsAuthenticationModule()); + } + + private void bindAuthenticationModule(Predicate predicate, Module module) + { + install(installModuleIf(HiveConfig.class, predicate, module)); + } + + private static boolean noHdfsAuth(HiveConfig config) + { + return config.getHdfsAuthenticationType() == HiveConfig.HdfsAuthenticationType.NONE; + } + + private static boolean kerberosHdfsAuth(HiveConfig config) + { + return config.getHdfsAuthenticationType() == HiveConfig.HdfsAuthenticationType.KERBEROS; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveIdentity.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveIdentity.java new file mode 100644 index 00000000..b5ea598c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveIdentity.java @@ -0,0 +1,85 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.security.ConnectorIdentity; + +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public final class HiveIdentity +{ + private static final HiveIdentity NONE_IDENTITY = new HiveIdentity(); + + private final Optional username; + + private HiveIdentity() + { + this.username = Optional.empty(); + } + + public HiveIdentity(ConnectorSession session) + { + this(requireNonNull(session, "session is null").getIdentity()); + } + + public HiveIdentity(ConnectorIdentity identity) + { + requireNonNull(identity, "identity is null"); + this.username = Optional.of(requireNonNull(identity.getUser(), "identity.getUser() is null")); + } + + // this should be called only by CachingHiveMetastore + public static HiveIdentity none() + { + return NONE_IDENTITY; + } + + public Optional getUsername() + { + return username; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("username", username) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + HiveIdentity other = (HiveIdentity) o; + return Objects.equals(username, other.username); + } + + @Override + public int hashCode() + { + return Objects.hash(username); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveMetastoreAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveMetastoreAuthentication.java new file mode 100644 index 00000000..6c8d6bfd --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/HiveMetastoreAuthentication.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.authentication; + +import org.apache.thrift.transport.TTransport; + +public interface HiveMetastoreAuthentication +{ + TTransport authenticate(TTransport rawTransport, String hiveMetastoreHost); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/ImpersonatingHdfsAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/ImpersonatingHdfsAuthentication.java new file mode 100644 index 00000000..28544c58 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/ImpersonatingHdfsAuthentication.java @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import io.prestosql.plugin.hive.ForHdfs; +import org.apache.hadoop.security.UserGroupInformation; + +import javax.inject.Inject; + +import static java.util.Objects.requireNonNull; + +public class ImpersonatingHdfsAuthentication + implements HdfsAuthentication +{ + private final HadoopAuthentication hadoopAuthentication; + + @Inject + public ImpersonatingHdfsAuthentication(@ForHdfs HadoopAuthentication hadoopAuthentication) + { + this.hadoopAuthentication = requireNonNull(hadoopAuthentication); + } + + @Override + public R doAs(String user, GenericExceptionAction action) + throws E + { + return UserGroupInformationUtils.executeActionInDoAs(createProxyUser(user), action); + } + + private UserGroupInformation createProxyUser(String user) + { + return UserGroupInformation.createProxyUser(user, hadoopAuthentication.getUserGroupInformation()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosAuthentication.java new file mode 100644 index 00000000..b9bc3c0d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosAuthentication.java @@ -0,0 +1,112 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.authentication; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.log.Logger; + +import javax.security.auth.Subject; +import javax.security.auth.kerberos.KerberosPrincipal; +import javax.security.auth.login.AppConfigurationEntry; +import javax.security.auth.login.Configuration; +import javax.security.auth.login.LoginContext; +import javax.security.auth.login.LoginException; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.InetAddress; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.nio.file.Files.exists; +import static java.nio.file.Files.isReadable; +import static java.util.Collections.emptySet; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.security.SecurityUtil.getServerPrincipal; + +public class KerberosAuthentication +{ + private static final Logger log = Logger.get(KerberosAuthentication.class); + private static final String KERBEROS_LOGIN_MODULE = "com.sun.security.auth.module.Krb5LoginModule"; + + private final KerberosPrincipal principal; + private final Configuration configuration; + + public KerberosAuthentication(String principal, String keytabLocation) + { + requireNonNull(principal, "principal is null"); + requireNonNull(keytabLocation, "keytabLocation is null"); + Path keytabPath = Paths.get(keytabLocation); + checkArgument(exists(keytabPath), "keytab does not exist: " + keytabLocation); + checkArgument(isReadable(keytabPath), "keytab is not readable: " + keytabLocation); + this.principal = createKerberosPrincipal(principal); + this.configuration = createConfiguration(this.principal.getName(), keytabLocation); + } + + public Subject getSubject() + { + Subject subject = new Subject(false, ImmutableSet.of(principal), emptySet(), emptySet()); + try { + LoginContext loginContext = new LoginContext("", subject, null, configuration); + loginContext.login(); + return loginContext.getSubject(); + } + catch (LoginException e) { + throw new RuntimeException(e); + } + } + + private static KerberosPrincipal createKerberosPrincipal(String principal) + { + try { + return new KerberosPrincipal(getServerPrincipal(principal, InetAddress.getLocalHost().getCanonicalHostName())); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static Configuration createConfiguration(String principal, String keytabLocation) + { + ImmutableMap.Builder optionsBuilder = ImmutableMap.builder() + .put("useKeyTab", "true") + .put("storeKey", "true") + .put("doNotPrompt", "true") + .put("isInitiator", "true") + .put("principal", principal) + .put("keyTab", keytabLocation); + + if (log.isDebugEnabled()) { + optionsBuilder.put("debug", "true"); + } + + Map options = optionsBuilder.build(); + + return new Configuration() + { + @Override + public AppConfigurationEntry[] getAppConfigurationEntry(String name) + { + return new AppConfigurationEntry[] { + new AppConfigurationEntry( + KERBEROS_LOGIN_MODULE, + AppConfigurationEntry.LoginModuleControlFlag.REQUIRED, + options)}; + } + }; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHadoopAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHadoopAuthentication.java new file mode 
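KerberosAuthentication above performs a programmatic JAAS login: it builds a Krb5LoginModule configuration (useKeyTab, storeKey, doNotPrompt, isInitiator) and returns the authenticated Subject from getSubject(). A minimal usage sketch follows; the principal and keytab path are placeholders, and the constructor rejects a keytab that does not exist or is not readable.

    import io.prestosql.plugin.hive.authentication.KerberosAuthentication;

    import javax.security.auth.Subject;

    public final class KerberosLoginSketch
    {
        private KerberosLoginSketch() {}

        public static void main(String[] args)
        {
            // Placeholder principal and keytab; _HOST is expanded to the local canonical hostname
            // by SecurityUtil.getServerPrincipal inside the constructor.
            KerberosAuthentication authentication = new KerberosAuthentication(
                    "presto-server/_HOST@EXAMPLE.COM",
                    "/etc/security/keytabs/presto.keytab");

            // Runs the JAAS login with Krb5LoginModule and returns the Subject holding the TGT.
            Subject subject = authentication.getSubject();
            System.out.println(subject.getPrincipals());
        }
    }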
100644 index 00000000..a3a183d1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHadoopAuthentication.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import io.prestosql.plugin.hive.HdfsConfigurationInitializer; +import io.prestosql.plugin.hive.util.ConfigurationUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.security.UserGroupInformation; + +import javax.security.auth.Subject; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.security.UserGroupInformationShim.createUserGroupInformationForSubject; + +public class KerberosHadoopAuthentication + implements HadoopAuthentication +{ + private final KerberosAuthentication kerberosAuthentication; + + public static KerberosHadoopAuthentication createKerberosHadoopAuthentication(KerberosAuthentication kerberosAuthentication, HdfsConfigurationInitializer initializer) + { + Configuration configuration = ConfigurationUtils.getInitialConfiguration(); + initializer.initializeConfiguration(configuration); + + // In order to enable KERBEROS authentication method for HDFS + // UserGroupInformation.authenticationMethod static field must be set to KERBEROS + // It is further used in many places in DfsClient + configuration.set("hadoop.security.authentication", "kerberos"); + + UserGroupInformation.setConfiguration(configuration); + + return new KerberosHadoopAuthentication(kerberosAuthentication); + } + + private KerberosHadoopAuthentication(KerberosAuthentication kerberosAuthentication) + { + this.kerberosAuthentication = requireNonNull(kerberosAuthentication, "kerberosAuthentication is null"); + } + + @Override + public UserGroupInformation getUserGroupInformation() + { + Subject subject = kerberosAuthentication.getSubject(); + return createUserGroupInformationForSubject(subject); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHiveMetastoreAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHiveMetastoreAuthentication.java new file mode 100644 index 00000000..8cb1291c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosHiveMetastoreAuthentication.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.ForHiveMetastore; +import org.apache.hadoop.hive.thrift.client.TUGIAssumingTransport; +import org.apache.hadoop.security.SaslRpcServer; +import org.apache.thrift.transport.TSaslClientTransport; +import org.apache.thrift.transport.TTransport; + +import javax.inject.Inject; +import javax.security.sasl.Sasl; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Map; + +import static com.google.common.base.Preconditions.checkState; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.security.SaslRpcServer.AuthMethod.KERBEROS; +import static org.apache.hadoop.security.SecurityUtil.getServerPrincipal; + +public class KerberosHiveMetastoreAuthentication + implements HiveMetastoreAuthentication +{ + private final String hiveMetastoreServicePrincipal; + private final HadoopAuthentication authentication; + + @Inject + public KerberosHiveMetastoreAuthentication( + MetastoreKerberosConfig config, + @ForHiveMetastore HadoopAuthentication authentication) + { + this(config.getHiveMetastoreServicePrincipal(), authentication); + } + + public KerberosHiveMetastoreAuthentication(String hiveMetastoreServicePrincipal, HadoopAuthentication authentication) + { + this.hiveMetastoreServicePrincipal = requireNonNull(hiveMetastoreServicePrincipal, "hiveMetastoreServicePrincipal is null"); + this.authentication = requireNonNull(authentication, "authentication is null"); + } + + @Override + public TTransport authenticate(TTransport rawTransport, String hiveMetastoreHost) + { + try { + String serverPrincipal = getServerPrincipal(hiveMetastoreServicePrincipal, hiveMetastoreHost); + String[] names = SaslRpcServer.splitKerberosName(serverPrincipal); + checkState(names.length == 3, + "Kerberos principal name does NOT have the expected hostname part: %s", serverPrincipal); + + Map saslProps = ImmutableMap.of( + Sasl.QOP, "auth-conf,auth", + Sasl.SERVER_AUTH, "true"); + + TTransport saslTransport = new TSaslClientTransport( + KERBEROS.getMechanismName(), + null, + names[0], + names[1], + saslProps, + null, + rawTransport); + + return new TUGIAssumingTransport(saslTransport, authentication.getUserGroupInformation()); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosTicketUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosTicketUtils.java new file mode 100644 index 00000000..18cf6afa --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/KerberosTicketUtils.java @@ -0,0 +1,75 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.authentication; + +import javax.security.auth.Subject; +import javax.security.auth.kerberos.KerberosPrincipal; +import javax.security.auth.kerberos.KerberosTicket; + +import java.util.Set; + +final class KerberosTicketUtils +{ + private static final float TICKET_RENEW_WINDOW = 0.80f; + + private KerberosTicketUtils() + { + } + + static KerberosTicket getTicketGrantingTicket(Subject subject) + { + Set tickets = subject.getPrivateCredentials(KerberosTicket.class); + for (KerberosTicket ticket : tickets) { + if (isOriginalTicketGrantingTicket(ticket)) { + return ticket; + } + } + throw new IllegalArgumentException("kerberos ticket not found in " + subject); + } + + static long getRefreshTime(KerberosTicket ticket) + { + long start = ticket.getStartTime().getTime(); + long end = ticket.getEndTime().getTime(); + return start + (long) ((end - start) * TICKET_RENEW_WINDOW); + } + + /** + * Check whether the server principal is the TGS's principal + * + * @param ticket the original TGT (the ticket that is obtained when a + * kinit is done) + * @return true or false + */ + static boolean isOriginalTicketGrantingTicket(KerberosTicket ticket) + { + return isTicketGrantingServerPrincipal(ticket.getServer()); + } + + /** + * TGS must have the server principal of the form "krbtgt/FOO@FOO". + * + * @return true or false + */ + private static boolean isTicketGrantingServerPrincipal(KerberosPrincipal principal) + { + if (principal == null) { + return false; + } + if (principal.getName().equals("krbtgt/" + principal.getRealm() + "@" + principal.getRealm())) { + return true; + } + return false; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/MetastoreKerberosConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/MetastoreKerberosConfig.java new file mode 100644 index 00000000..16cbd349 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/MetastoreKerberosConfig.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
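KerberosTicketUtils above drives the refresh behaviour of CachingKerberosHadoopAuthentication: the cached UGI is refreshed once 80% of the TGT lifetime (TICKET_RENEW_WINDOW) has elapsed. A small self-contained illustration of that arithmetic, with made-up ticket times:

    public final class TicketRefreshWindowExample
    {
        private TicketRefreshWindowExample() {}

        public static void main(String[] args)
        {
            long start = 1_000_000L;            // ticket start time in ms (illustrative)
            long end = start + 10 * 60_000L;    // 10-minute ticket lifetime (illustrative)
            float renewWindow = 0.80f;          // TICKET_RENEW_WINDOW used above

            // Same formula as KerberosTicketUtils.getRefreshTime
            long refreshTime = start + (long) ((end - start) * renewWindow);
            System.out.println(refreshTime - start);   // 480000 ms, i.e. 8 minutes into the lifetime
        }
    }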
+ */ +package io.prestosql.plugin.hive.authentication; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; + +import javax.validation.constraints.NotNull; + +public class MetastoreKerberosConfig +{ + private String hiveMetastoreServicePrincipal; + private String hiveMetastoreClientPrincipal; + private String hiveMetastoreClientKeytab; + private String hiveMetastoreKrb5; + + @NotNull + public String getHiveMetastoreServicePrincipal() + { + return hiveMetastoreServicePrincipal; + } + + @Config("hive.metastore.service.principal") + @ConfigDescription("Hive Metastore service principal") + public MetastoreKerberosConfig setHiveMetastoreServicePrincipal(String hiveMetastoreServicePrincipal) + { + this.hiveMetastoreServicePrincipal = hiveMetastoreServicePrincipal; + return this; + } + + @NotNull + public String getHiveMetastoreClientPrincipal() + { + return hiveMetastoreClientPrincipal; + } + + @Config("hive.metastore.client.principal") + @ConfigDescription("Hive Metastore client principal") + public MetastoreKerberosConfig setHiveMetastoreClientPrincipal(String hiveMetastoreClientPrincipal) + { + this.hiveMetastoreClientPrincipal = hiveMetastoreClientPrincipal; + return this; + } + + @NotNull + public String getHiveMetastoreClientKeytab() + { + return hiveMetastoreClientKeytab; + } + + @Config("hive.metastore.client.keytab") + @ConfigDescription("Hive Metastore client keytab location") + public MetastoreKerberosConfig setHiveMetastoreClientKeytab(String hiveMetastoreClientKeytab) + { + this.hiveMetastoreClientKeytab = hiveMetastoreClientKeytab; + return this; + } + + @NotNull + public String getHiveMetastoreKrb5() + { + return hiveMetastoreKrb5; + } + + @Config("hive.metastore.krb5.conf.path") + @ConfigDescription("Krb5 file path used to access HDFS") + public MetastoreKerberosConfig setHiveMetastoreKrb5(String hiveMetastoreKrb5) + { + this.hiveMetastoreKrb5 = hiveMetastoreKrb5; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHdfsAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHdfsAuthentication.java new file mode 100644 index 00000000..ea008680 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHdfsAuthentication.java @@ -0,0 +1,25 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.authentication; + +public class NoHdfsAuthentication + implements HdfsAuthentication +{ + @Override + public R doAs(String user, GenericExceptionAction action) + throws E + { + return action.run(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHiveMetastoreAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHiveMetastoreAuthentication.java new file mode 100644 index 00000000..f0b991b2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/NoHiveMetastoreAuthentication.java @@ -0,0 +1,26 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import org.apache.thrift.transport.TTransport; + +public class NoHiveMetastoreAuthentication + implements HiveMetastoreAuthentication +{ + @Override + public TTransport authenticate(TTransport rawTransport, String hiveMetastoreHost) + { + return rawTransport; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/SimpleHadoopAuthentication.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/SimpleHadoopAuthentication.java new file mode 100644 index 00000000..cb7250d9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/SimpleHadoopAuthentication.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.authentication; + +import org.apache.hadoop.security.UserGroupInformation; + +import java.io.IOException; +import java.io.UncheckedIOException; + +public class SimpleHadoopAuthentication + implements HadoopAuthentication +{ + @Override + public UserGroupInformation getUserGroupInformation() + { + try { + return UserGroupInformation.getLoginUser(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/UserGroupInformationUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/UserGroupInformationUtils.java new file mode 100644 index 00000000..c4f0c4c7 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/authentication/UserGroupInformationUtils.java @@ -0,0 +1,64 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.authentication; + +import org.apache.hadoop.security.UserGroupInformation; + +import java.security.PrivilegedAction; + +final class UserGroupInformationUtils +{ + private UserGroupInformationUtils() {} + + static R executeActionInDoAs(UserGroupInformation userGroupInformation, GenericExceptionAction action) + throws E + { + return userGroupInformation.doAs((PrivilegedAction>) () -> { + try { + return new ResultOrException<>(action.run(), null); + } + catch (Throwable e) { + return new ResultOrException<>(null, e); + } + }).get(); + } + + private static class ResultOrException + { + private final T result; + private final Throwable exception; + + public ResultOrException(T result, Throwable exception) + { + this.result = result; + this.exception = exception; + } + + @SuppressWarnings("unchecked") + public T get() + throws E + { + if (exception != null) { + if (exception instanceof Error) { + throw (Error) exception; + } + if (exception instanceof RuntimeException) { + throw (RuntimeException) exception; + } + throw (E) exception; + } + return result; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/avro/PrestoAvroSerDe.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/avro/PrestoAvroSerDe.java new file mode 100644 index 00000000..cb284b45 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/avro/PrestoAvroSerDe.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
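UserGroupInformationUtils above is what lets the HdfsAuthentication implementations run actions that throw checked exceptions inside UserGroupInformation.doAs, which only accepts a PrivilegedAction: the exception is captured in ResultOrException and rethrown on the calling side. A hedged end-to-end sketch combining it with SimpleHadoopAuthentication and ImpersonatingHdfsAuthentication (the user name is illustrative):

    import io.prestosql.plugin.hive.authentication.HdfsAuthentication;
    import io.prestosql.plugin.hive.authentication.ImpersonatingHdfsAuthentication;
    import io.prestosql.plugin.hive.authentication.SimpleHadoopAuthentication;
    import org.apache.hadoop.security.UserGroupInformation;

    import java.io.IOException;

    public final class ImpersonationSketch
    {
        private ImpersonationSketch() {}

        public static void main(String[] args) throws IOException
        {
            // Proxy the end user on top of the Hadoop login user (simple auth, no Kerberos).
            HdfsAuthentication hdfsAuthentication =
                    new ImpersonatingHdfsAuthentication(new SimpleHadoopAuthentication());

            // The action runs inside UserGroupInformation.doAs for the proxy user; its checked
            // IOException is tunnelled back out by UserGroupInformationUtils.
            String user = hdfsAuthentication.doAs("alice",
                    () -> UserGroupInformation.getCurrentUser().getShortUserName());
            System.out.println(user);   // expected to print "alice"
        }
    }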
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.avro; + +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde2.avro.AvroSerDe; +import org.apache.hadoop.hive.serde2.avro.AvroSerdeException; +import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils; + +import java.io.IOException; +import java.util.Properties; + +public class PrestoAvroSerDe + extends AvroSerDe +{ + @Override + public Schema determineSchemaOrReturnErrorSchema(Configuration conf, Properties props) + { + // AvroSerDe does not propagate initialization exceptions. Instead, it stores just an exception's message in + // this.configErrors (see https://issues.apache.org/jira/browse/HIVE-7868). In Presto, such behavior is not + // at all useful, as silenced exception usually carries important information which may be otherwise unavailable. + try { + return AvroSerdeUtils.determineSchemaOrThrowException(conf, props); + } + catch (IOException | AvroSerdeException e) { + throw new RuntimeException(e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DecimalCoercers.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DecimalCoercers.java new file mode 100644 index 00000000..6a878971 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DecimalCoercers.java @@ -0,0 +1,322 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
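PrestoAvroSerDe above overrides determineSchemaOrReturnErrorSchema precisely so that schema-resolution failures propagate instead of being swallowed into an error schema. The sketch below calls the same Hive utility directly with an inline schema; the schema literal and class name are illustrative.

    import org.apache.avro.Schema;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;

    import java.util.Properties;

    public final class AvroSchemaResolutionSketch
    {
        private AvroSchemaResolutionSketch() {}

        public static void main(String[] args) throws Exception
        {
            // Table properties carrying an inline Avro schema via the standard SerDe property.
            Properties props = new Properties();
            props.setProperty("avro.schema.literal",
                    "{\"type\":\"record\",\"name\":\"sample\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");

            // Unlike AvroSerDe.determineSchemaOrReturnErrorSchema, this call throws on failure,
            // which is exactly the behaviour PrestoAvroSerDe restores.
            Schema schema = AvroSerdeUtils.determineSchemaOrThrowException(new Configuration(), props);
            System.out.println(schema.getName());   // sample
        }
    }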
+ */ + +package io.prestosql.plugin.hive.coercions; + +import io.airlift.slice.Slice; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.RealType; + +import static io.prestosql.spi.type.DecimalConversions.doubleToLongDecimal; +import static io.prestosql.spi.type.DecimalConversions.doubleToShortDecimal; +import static io.prestosql.spi.type.DecimalConversions.longDecimalToDouble; +import static io.prestosql.spi.type.DecimalConversions.longDecimalToReal; +import static io.prestosql.spi.type.DecimalConversions.longToLongCast; +import static io.prestosql.spi.type.DecimalConversions.longToShortCast; +import static io.prestosql.spi.type.DecimalConversions.realToLongDecimal; +import static io.prestosql.spi.type.DecimalConversions.realToShortDecimal; +import static io.prestosql.spi.type.DecimalConversions.shortDecimalToDouble; +import static io.prestosql.spi.type.DecimalConversions.shortDecimalToReal; +import static io.prestosql.spi.type.DecimalConversions.shortToLongCast; +import static io.prestosql.spi.type.DecimalConversions.shortToShortCast; +import static io.prestosql.spi.type.Decimals.longTenToNth; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.RealType.REAL; + +public final class DecimalCoercers +{ + private DecimalCoercers() {} + + public static HiveCoercer createDecimalToDecimalCoercer(DecimalType fromType, DecimalType toType) + { + if (fromType.isShort()) { + if (toType.isShort()) { + return new ShortDecimalToShortDecimalCoercer(fromType, toType); + } + else { + return new ShortDecimalToLongDecimalCoercer(fromType, toType); + } + } + else { + if (toType.isShort()) { + return new LongDecimalToShortDecimalCoercer(fromType, toType); + } + else { + return new LongDecimalToLongDecimalCoercer(fromType, toType); + } + } + } + + private static class ShortDecimalToShortDecimalCoercer + extends TypeCoercer + { + private final long rescale; + + public ShortDecimalToShortDecimalCoercer(DecimalType fromType, DecimalType toType) + { + super(fromType, toType); + rescale = longTenToNth(Math.abs(toType.getScale() - fromType.getScale())); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + long returnValue = shortToShortCast(fromType.getLong(block, position), + fromType.getPrecision(), + fromType.getScale(), + toType.getPrecision(), + toType.getScale(), + rescale, + rescale / 2); + toType.writeLong(blockBuilder, returnValue); + } + } + + private static class ShortDecimalToLongDecimalCoercer + extends TypeCoercer + { + public ShortDecimalToLongDecimalCoercer(DecimalType fromType, DecimalType toType) + { + super(fromType, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + Slice coercedValue = shortToLongCast(fromType.getLong(block, position), + fromType.getPrecision(), + fromType.getScale(), + toType.getPrecision(), + toType.getScale()); + toType.writeSlice(blockBuilder, coercedValue); + } + } + + private static class LongDecimalToShortDecimalCoercer + extends TypeCoercer + { + public LongDecimalToShortDecimalCoercer(DecimalType fromType, DecimalType toType) + { + super(fromType, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + long returnValue = longToShortCast(fromType.getSlice(block, position), + 
fromType.getPrecision(), + fromType.getScale(), + toType.getPrecision(), + toType.getScale()); + toType.writeLong(blockBuilder, returnValue); + } + } + + private static class LongDecimalToLongDecimalCoercer + extends TypeCoercer + { + public LongDecimalToLongDecimalCoercer(DecimalType fromType, DecimalType toType) + { + super(fromType, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + Slice coercedValue = longToLongCast(fromType.getSlice(block, position), + fromType.getPrecision(), + fromType.getScale(), + toType.getPrecision(), + toType.getScale()); + toType.writeSlice(blockBuilder, coercedValue); + } + } + + public static HiveCoercer createDecimalToDoubleCoercer(DecimalType fromType) + { + if (fromType.isShort()) { + return new ShortDecimalToDoubleCoercer(fromType); + } + else { + return new LongDecimalToDoubleCoercer(fromType); + } + } + + private static class ShortDecimalToDoubleCoercer + extends TypeCoercer + { + private final long rescale; + + public ShortDecimalToDoubleCoercer(DecimalType fromType) + { + super(fromType, DOUBLE); + rescale = longTenToNth(fromType.getScale()); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeDouble(blockBuilder, + shortDecimalToDouble(fromType.getLong(block, position), rescale)); + } + } + + private static class LongDecimalToDoubleCoercer + extends TypeCoercer + { + public LongDecimalToDoubleCoercer(DecimalType fromType) + { + super(fromType, DOUBLE); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeDouble(blockBuilder, + longDecimalToDouble(fromType.getSlice(block, position), fromType.getScale())); + } + } + + public static HiveCoercer createDecimalToRealCoercer(DecimalType fromType) + { + if (fromType.isShort()) { + return new ShortDecimalToRealCoercer(fromType); + } + else { + return new LongDecimalToRealCoercer(fromType); + } + } + + private static class ShortDecimalToRealCoercer + extends TypeCoercer + { + private final long rescale; + + public ShortDecimalToRealCoercer(DecimalType fromType) + { + super(fromType, REAL); + rescale = longTenToNth(fromType.getScale()); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeLong(blockBuilder, + shortDecimalToReal(fromType.getLong(block, position), rescale)); + } + } + + private static class LongDecimalToRealCoercer + extends TypeCoercer + { + public LongDecimalToRealCoercer(DecimalType fromType) + { + super(fromType, REAL); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeLong(blockBuilder, + longDecimalToReal(fromType.getSlice(block, position), fromType.getScale())); + } + } + + public static HiveCoercer createDoubleToDecimalCoercer(DecimalType toType) + { + if (toType.isShort()) { + return new DoubleToShortDecimalCoercer(toType); + } + else { + return new DoubleToLongDecimalCoercer(toType); + } + } + + private static class DoubleToShortDecimalCoercer + extends TypeCoercer + { + public DoubleToShortDecimalCoercer(DecimalType toType) + { + super(DOUBLE, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeLong(blockBuilder, + doubleToShortDecimal(fromType.getDouble(block, position), toType.getPrecision(), toType.getScale())); + } + } + + private static class 
DoubleToLongDecimalCoercer + extends TypeCoercer + { + public DoubleToLongDecimalCoercer(DecimalType toType) + { + super(DOUBLE, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeSlice(blockBuilder, + doubleToLongDecimal(fromType.getDouble(block, position), toType.getPrecision(), toType.getScale())); + } + } + + public static HiveCoercer createRealToDecimalCoercer(DecimalType toType) + { + if (toType.isShort()) { + return new RealToShortDecimalCoercer(toType); + } + else { + return new RealToLongDecimalCoercer(toType); + } + } + + private static class RealToShortDecimalCoercer + extends TypeCoercer + { + public RealToShortDecimalCoercer(DecimalType toType) + { + super(REAL, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeLong(blockBuilder, + realToShortDecimal(fromType.getLong(block, position), toType.getPrecision(), toType.getScale())); + } + } + + private static class RealToLongDecimalCoercer + extends TypeCoercer + { + public RealToLongDecimalCoercer(DecimalType toType) + { + super(REAL, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeSlice(blockBuilder, + realToLongDecimal(fromType.getLong(block, position), toType.getPrecision(), toType.getScale())); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DoubleToFloatCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DoubleToFloatCoercer.java new file mode 100644 index 00000000..ec5f64e9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/DoubleToFloatCoercer.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
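DecimalCoercers above dispatches on whether each decimal is "short" (precision at most 18, stored as a long) or "long" (stored as a Slice). A hedged usage sketch is below; it assumes the connector's TypeCoercer base class (not part of this hunk) applies applyCoercedValue position by position, and the precisions, scales, and value are made up for illustration.

    import io.prestosql.plugin.hive.coercions.HiveCoercer;
    import io.prestosql.spi.block.Block;
    import io.prestosql.spi.block.BlockBuilder;
    import io.prestosql.spi.type.DecimalType;

    import static io.prestosql.plugin.hive.coercions.DecimalCoercers.createDecimalToDecimalCoercer;

    public final class DecimalCoercionSketch
    {
        private DecimalCoercionSketch() {}

        public static void main(String[] args)
        {
            // decimal(10,2) -> decimal(12,4): both precisions are <= 18, so the
            // ShortDecimalToShortDecimalCoercer path above is chosen.
            DecimalType fromType = DecimalType.createDecimalType(10, 2);
            DecimalType toType = DecimalType.createDecimalType(12, 4);
            HiveCoercer coercer = createDecimalToDecimalCoercer(fromType, toType);

            // One-row block holding the unscaled value 12345, i.e. 123.45 at scale 2.
            BlockBuilder builder = fromType.createBlockBuilder(null, 1);
            fromType.writeLong(builder, 12_345L);
            Block coerced = coercer.apply(builder.build());

            // 123.45 rescaled to scale 4 is the unscaled value 1_234_500.
            System.out.println(toType.getLong(coerced, 0));
        }
    }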
+ */ + +package io.prestosql.plugin.hive.coercions; + +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.RealType; + +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.RealType.REAL; +import static java.lang.Float.floatToRawIntBits; + +public class DoubleToFloatCoercer + extends TypeCoercer +{ + public DoubleToFloatCoercer() + { + super(DOUBLE, REAL); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + REAL.writeLong(blockBuilder, floatToRawIntBits((float) DOUBLE.getDouble(block, position))); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/FloatToDoubleCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/FloatToDoubleCoercer.java new file mode 100644 index 00000000..5d8d1d88 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/FloatToDoubleCoercer.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive.coercions; + +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.RealType; + +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.RealType.REAL; +import static java.lang.Float.intBitsToFloat; + +public class FloatToDoubleCoercer + extends TypeCoercer +{ + public FloatToDoubleCoercer() + { + super(REAL, DOUBLE); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + DOUBLE.writeDouble(blockBuilder, intBitsToFloat((int) REAL.getLong(block, position))); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/HiveCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/HiveCoercer.java new file mode 100644 index 00000000..2321b455 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/HiveCoercer.java @@ -0,0 +1,246 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
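The two coercers above rely on the fact that REAL values travel through the engine as the float's raw bit pattern widened to a long, while DOUBLE values are plain doubles. The round trip they implement boils down to the following (the value is chosen so it is exactly representable as a float):

    import static java.lang.Float.floatToRawIntBits;
    import static java.lang.Float.intBitsToFloat;

    public final class FloatBitsExample
    {
        private FloatBitsExample() {}

        public static void main(String[] args)
        {
            double original = 3.5d;
            long asRealBits = floatToRawIntBits((float) original);    // DOUBLE -> REAL (DoubleToFloatCoercer)
            double roundTripped = intBitsToFloat((int) asRealBits);   // REAL -> DOUBLE (FloatToDoubleCoercer)
            System.out.println(roundTripped);                         // 3.5
        }
    }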
+ */
+package io.prestosql.plugin.hive.coercions;
+
+import com.google.common.collect.ImmutableList;
+import io.prestosql.plugin.hive.HiveType;
+import io.prestosql.spi.PrestoException;
+import io.prestosql.spi.block.ArrayBlock;
+import io.prestosql.spi.block.Block;
+import io.prestosql.spi.block.ColumnarArray;
+import io.prestosql.spi.block.ColumnarMap;
+import io.prestosql.spi.block.ColumnarRow;
+import io.prestosql.spi.block.DictionaryBlock;
+import io.prestosql.spi.block.RowBlock;
+import io.prestosql.spi.type.DecimalType;
+import io.prestosql.spi.type.MapType;
+import io.prestosql.spi.type.Type;
+import io.prestosql.spi.type.TypeManager;
+import io.prestosql.spi.type.VarcharType;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.function.Function;
+
+import static io.prestosql.plugin.hive.HiveType.HIVE_BYTE;
+import static io.prestosql.plugin.hive.HiveType.HIVE_DOUBLE;
+import static io.prestosql.plugin.hive.HiveType.HIVE_FLOAT;
+import static io.prestosql.plugin.hive.HiveType.HIVE_INT;
+import static io.prestosql.plugin.hive.HiveType.HIVE_LONG;
+import static io.prestosql.plugin.hive.HiveType.HIVE_SHORT;
+import static io.prestosql.plugin.hive.HiveUtil.extractStructFieldTypes;
+import static io.prestosql.plugin.hive.HiveUtil.isArrayType;
+import static io.prestosql.plugin.hive.HiveUtil.isMapType;
+import static io.prestosql.plugin.hive.HiveUtil.isRowType;
+import static io.prestosql.plugin.hive.coercions.DecimalCoercers.createDecimalToDecimalCoercer;
+import static io.prestosql.plugin.hive.coercions.DecimalCoercers.createDecimalToDoubleCoercer;
+import static io.prestosql.plugin.hive.coercions.DecimalCoercers.createDecimalToRealCoercer;
+import static io.prestosql.plugin.hive.coercions.DecimalCoercers.createDoubleToDecimalCoercer;
+import static io.prestosql.plugin.hive.coercions.DecimalCoercers.createRealToDecimalCoercer;
+import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED;
+import static io.prestosql.spi.block.ColumnarArray.toColumnarArray;
+import static io.prestosql.spi.block.ColumnarMap.toColumnarMap;
+import static io.prestosql.spi.block.ColumnarRow.toColumnarRow;
+import static io.prestosql.spi.type.DoubleType.DOUBLE;
+import static io.prestosql.spi.type.RealType.REAL;
+import static java.lang.String.format;
+import static java.util.Objects.requireNonNull;
+
+public interface HiveCoercer
+        extends Function<Block, Block>
+{
+    static HiveCoercer createCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType)
+    {
+        Type fromType = typeManager.getType(fromHiveType.getTypeSignature());
+        Type toType = typeManager.getType(toHiveType.getTypeSignature());
+
+        if (toType instanceof VarcharType && fromType instanceof VarcharType) {
+            return new VarcharToVarcharCoercer((VarcharType) fromType, (VarcharType) toType);
+        }
+        if (toType instanceof VarcharType && (fromHiveType.equals(HIVE_BYTE) || fromHiveType.equals(HIVE_SHORT) || fromHiveType.equals(HIVE_INT) || fromHiveType.equals(HIVE_LONG))) {
+            return new IntegerNumberToVarcharCoercer<>(fromType, (VarcharType) toType);
+        }
+        if (fromType instanceof VarcharType && (toHiveType.equals(HIVE_BYTE) || toHiveType.equals(HIVE_SHORT) || toHiveType.equals(HIVE_INT) || toHiveType.equals(HIVE_LONG))) {
+            return new VarcharToIntegerNumberCoercer<>((VarcharType) fromType, toType);
+        }
+        if (fromHiveType.equals(HIVE_BYTE) && (toHiveType.equals(HIVE_SHORT) || toHiveType.equals(HIVE_INT) || toHiveType.equals(HIVE_LONG))) {
+            return new IntegerNumberUpscaleCoercer<>(fromType, toType);
+        }
+        if (fromHiveType.equals(HIVE_SHORT) && (toHiveType.equals(HIVE_INT) || toHiveType.equals(HIVE_LONG))) {
+            return new IntegerNumberUpscaleCoercer<>(fromType, toType);
+        }
+        if (fromHiveType.equals(HIVE_INT) && toHiveType.equals(HIVE_LONG)) {
+            return new IntegerNumberUpscaleCoercer<>(fromType, toType);
+        }
+        if (fromHiveType.equals(HIVE_FLOAT) && toHiveType.equals(HIVE_DOUBLE)) {
+            return new FloatToDoubleCoercer();
+        }
+        if (fromHiveType.equals(HIVE_DOUBLE) && toHiveType.equals(HIVE_FLOAT)) {
+            return new DoubleToFloatCoercer();
+        }
+        if (fromType instanceof DecimalType && toType instanceof DecimalType) {
+            return createDecimalToDecimalCoercer((DecimalType) fromType, (DecimalType) toType);
+        }
+        if (fromType instanceof DecimalType && toType == DOUBLE) {
+            return createDecimalToDoubleCoercer((DecimalType) fromType);
+        }
+        if (fromType instanceof DecimalType && toType == REAL) {
+            return createDecimalToRealCoercer((DecimalType) fromType);
+        }
+        if (fromType == DOUBLE && toType instanceof DecimalType) {
+            return createDoubleToDecimalCoercer((DecimalType) toType);
+        }
+        if (fromType == REAL && toType instanceof DecimalType) {
+            return createRealToDecimalCoercer((DecimalType) toType);
+        }
+        if (isArrayType(fromType) && isArrayType(toType)) {
+            return new ListCoercer(typeManager, fromHiveType, toHiveType);
+        }
+        if (isMapType(fromType) && isMapType(toType)) {
+            return new MapCoercer(typeManager, fromHiveType, toHiveType);
+        }
+        if (isRowType(fromType) && isRowType(toType)) {
+            return new StructCoercer(typeManager, fromHiveType, toHiveType);
+        }
+
+        throw new PrestoException(NOT_SUPPORTED, format("Unsupported coercion from %s to %s", fromHiveType, toHiveType));
+    }
+
+    class ListCoercer
+            implements HiveCoercer
+    {
+        private final Function<Block, Block> elementCoercer;
+
+        public ListCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType)
+        {
+            requireNonNull(typeManager, "typeManager is null");
+            requireNonNull(fromHiveType, "fromHiveType is null");
+            requireNonNull(toHiveType, "toHiveType is null");
+            HiveType fromElementHiveType = HiveType.valueOf(((ListTypeInfo) fromHiveType.getTypeInfo()).getListElementTypeInfo().getTypeName());
+            HiveType toElementHiveType = HiveType.valueOf(((ListTypeInfo) toHiveType.getTypeInfo()).getListElementTypeInfo().getTypeName());
+            this.elementCoercer = fromElementHiveType.equals(toElementHiveType) ?
null : createCoercer(typeManager, fromElementHiveType, toElementHiveType); + } + + @Override + public Block apply(Block block) + { + if (elementCoercer == null) { + return block; + } + ColumnarArray arrayBlock = toColumnarArray(block); + Block elementsBlock = elementCoercer.apply(arrayBlock.getElementsBlock()); + boolean[] valueIsNull = new boolean[arrayBlock.getPositionCount()]; + int[] offsets = new int[arrayBlock.getPositionCount() + 1]; + for (int i = 0; i < arrayBlock.getPositionCount(); i++) { + valueIsNull[i] = arrayBlock.isNull(i); + offsets[i + 1] = offsets[i] + arrayBlock.getLength(i); + } + return ArrayBlock.fromElementBlock(arrayBlock.getPositionCount(), Optional.of(valueIsNull), offsets, elementsBlock); + } + } + + class MapCoercer + implements HiveCoercer + { + private final Type toType; + private final Function keyCoercer; + private final Function valueCoercer; + + public MapCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType) + { + requireNonNull(typeManager, "typeManage is null"); + requireNonNull(fromHiveType, "fromHiveType is null"); + this.toType = requireNonNull(toHiveType, "toHiveType is null").getType(typeManager); + HiveType fromKeyHiveType = HiveType.valueOf(((MapTypeInfo) fromHiveType.getTypeInfo()).getMapKeyTypeInfo().getTypeName()); + HiveType fromValueHiveType = HiveType.valueOf(((MapTypeInfo) fromHiveType.getTypeInfo()).getMapValueTypeInfo().getTypeName()); + HiveType toKeyHiveType = HiveType.valueOf(((MapTypeInfo) toHiveType.getTypeInfo()).getMapKeyTypeInfo().getTypeName()); + HiveType toValueHiveType = HiveType.valueOf(((MapTypeInfo) toHiveType.getTypeInfo()).getMapValueTypeInfo().getTypeName()); + this.keyCoercer = fromKeyHiveType.equals(toKeyHiveType) ? null : createCoercer(typeManager, fromKeyHiveType, toKeyHiveType); + this.valueCoercer = fromValueHiveType.equals(toValueHiveType) ? null : createCoercer(typeManager, fromValueHiveType, toValueHiveType); + } + + @Override + public Block apply(Block block) + { + ColumnarMap mapBlock = toColumnarMap(block); + Block keysBlock = keyCoercer == null ? mapBlock.getKeysBlock() : keyCoercer.apply(mapBlock.getKeysBlock()); + Block valuesBlock = valueCoercer == null ? 
mapBlock.getValuesBlock() : valueCoercer.apply(mapBlock.getValuesBlock()); + boolean[] valueIsNull = new boolean[mapBlock.getPositionCount()]; + int[] offsets = new int[mapBlock.getPositionCount() + 1]; + for (int i = 0; i < mapBlock.getPositionCount(); i++) { + valueIsNull[i] = mapBlock.isNull(i); + offsets[i + 1] = offsets[i] + mapBlock.getEntryCount(i); + } + return ((MapType) toType).createBlockFromKeyValue(Optional.of(valueIsNull), offsets, keysBlock, valuesBlock); + } + } + + class StructCoercer + implements HiveCoercer + { + private final List>> coercers; + private final Block[] nullBlocks; + + public StructCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType) + { + requireNonNull(typeManager, "typeManage is null"); + requireNonNull(fromHiveType, "fromHiveType is null"); + requireNonNull(toHiveType, "toHiveType is null"); + List fromFieldTypes = extractStructFieldTypes(fromHiveType); + List toFieldTypes = extractStructFieldTypes(toHiveType); + ImmutableList.Builder>> coercers = ImmutableList.builder(); + this.nullBlocks = new Block[toFieldTypes.size()]; + for (int i = 0; i < toFieldTypes.size(); i++) { + if (i >= fromFieldTypes.size()) { + nullBlocks[i] = toFieldTypes.get(i).getType(typeManager).createBlockBuilder(null, 1).appendNull().build(); + coercers.add(Optional.empty()); + } + else if (!fromFieldTypes.get(i).equals(toFieldTypes.get(i))) { + coercers.add(Optional.of(createCoercer(typeManager, fromFieldTypes.get(i), toFieldTypes.get(i)))); + } + else { + coercers.add(Optional.empty()); + } + } + this.coercers = coercers.build(); + } + + @Override + public Block apply(Block block) + { + ColumnarRow rowBlock = toColumnarRow(block); + Block[] fields = new Block[coercers.size()]; + int[] ids = new int[rowBlock.getField(0).getPositionCount()]; + for (int i = 0; i < coercers.size(); i++) { + Optional> coercer = coercers.get(i); + if (coercer.isPresent()) { + fields[i] = coercer.get().apply(rowBlock.getField(i)); + } + else if (i < rowBlock.getFieldCount()) { + fields[i] = rowBlock.getField(i); + } + else { + fields[i] = new DictionaryBlock(nullBlocks[i], ids); + } + } + boolean[] valueIsNull = new boolean[rowBlock.getPositionCount()]; + for (int i = 0; i < rowBlock.getPositionCount(); i++) { + valueIsNull[i] = rowBlock.isNull(i); + } + return RowBlock.fromFieldBlocks(valueIsNull.length, Optional.of(valueIsNull), fields); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberToVarcharCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberToVarcharCoercer.java new file mode 100644 index 00000000..da381171 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberToVarcharCoercer.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.coercions; + +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.VarcharType; + +import static io.airlift.slice.Slices.utf8Slice; + +public class IntegerNumberToVarcharCoercer + extends TypeCoercer +{ + public IntegerNumberToVarcharCoercer(F fromType, VarcharType toType) + { + super(fromType, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeSlice(blockBuilder, utf8Slice(String.valueOf(fromType.getLong(block, position)))); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberUpscaleCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberUpscaleCoercer.java new file mode 100644 index 00000000..d9c5956b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/IntegerNumberUpscaleCoercer.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive.coercions; + +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.Type; + +public class IntegerNumberUpscaleCoercer + extends TypeCoercer +{ + public IntegerNumberUpscaleCoercer(F fromType, T toType) + { + super(fromType, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + toType.writeLong(blockBuilder, fromType.getLong(block, position)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/TypeCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/TypeCoercer.java new file mode 100644 index 00000000..30154d36 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/TypeCoercer.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.coercions; + +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.Type; + +import static java.util.Objects.requireNonNull; + +public abstract class TypeCoercer + implements HiveCoercer +{ + protected final F fromType; + protected final T toType; + + protected TypeCoercer(F fromType, T toType) + { + this.fromType = requireNonNull(fromType); + this.toType = requireNonNull(toType); + } + + @Override + public Block apply(Block block) + { + BlockBuilder blockBuilder = toType.createBlockBuilder(null, block.getPositionCount()); + for (int i = 0; i < block.getPositionCount(); i++) { + if (block.isNull(i)) { + blockBuilder.appendNull(); + continue; + } + applyCoercedValue(blockBuilder, block, i); + } + return blockBuilder.build(); + } + + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + throw new UnsupportedOperationException(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToIntegerNumberCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToIntegerNumberCoercer.java new file mode 100644 index 00000000..a6c03696 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToIntegerNumberCoercer.java @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.coercions; + +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.VarcharType; + +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static java.lang.String.format; + +public class VarcharToIntegerNumberCoercer + extends TypeCoercer +{ + private final long minValue; + private final long maxValue; + + public VarcharToIntegerNumberCoercer(VarcharType fromType, T toType) + { + super(fromType, toType); + + if (toType.equals(TINYINT)) { + minValue = Byte.MIN_VALUE; + maxValue = Byte.MAX_VALUE; + } + else if (toType.equals(SMALLINT)) { + minValue = Short.MIN_VALUE; + maxValue = Short.MAX_VALUE; + } + else if (toType.equals(INTEGER)) { + minValue = Integer.MIN_VALUE; + maxValue = Integer.MAX_VALUE; + } + else if (toType.equals(BIGINT)) { + minValue = Long.MIN_VALUE; + maxValue = Long.MAX_VALUE; + } + else { + throw new PrestoException(NOT_SUPPORTED, format("Could not create Coercer from from varchar to %s", toType)); + } + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + try { + long value = Long.parseLong(fromType.getSlice(block, position).toStringUtf8()); + if (minValue <= value && value <= maxValue) { + toType.writeLong(blockBuilder, value); + } + else { + blockBuilder.appendNull(); + } + } + catch (NumberFormatException e) { + blockBuilder.appendNull(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToVarcharCoercer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToVarcharCoercer.java new file mode 100644 index 00000000..2ca5cd0c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/coercions/VarcharToVarcharCoercer.java @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.coercions; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.VarcharType; + +public class VarcharToVarcharCoercer + extends TypeCoercer +{ + public VarcharToVarcharCoercer(VarcharType fromType, VarcharType toType) + { + super(fromType, toType); + } + + @Override + protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position) + { + Slice from = fromType.getSlice(block, position); + int length = Math.min(from.length(), toType.getLength().orElse(VarcharType.UNBOUNDED_LENGTH)); + Slice copy = Slices.copyOf(from, 0, length); + toType.writeSlice(blockBuilder, copy); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsAccessTokenProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsAccessTokenProvider.java new file mode 100644 index 00000000..c456bd85 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsAccessTokenProvider.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.gcs; + +import com.google.cloud.hadoop.util.AccessTokenProvider; +import org.apache.hadoop.conf.Configuration; + +import static com.google.common.base.Strings.nullToEmpty; +import static java.util.concurrent.TimeUnit.HOURS; + +public class GcsAccessTokenProvider + implements AccessTokenProvider +{ + public static final String GCS_ACCESS_TOKEN_CONF = "presto.gcs.oauth-access-token"; + public static final Long EXPIRATION_TIME_MILLISECONDS = HOURS.toMillis(1); + private Configuration config; + + @Override + public AccessToken getAccessToken() + { + return new AccessToken(nullToEmpty(config.get(GCS_ACCESS_TOKEN_CONF)), EXPIRATION_TIME_MILLISECONDS); + } + + @Override + public void refresh() {} + + @Override + public void setConf(Configuration configuration) + { + this.config = configuration; + } + + @Override + public Configuration getConf() + { + return config; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsConfigurationProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsConfigurationProvider.java new file mode 100644 index 00000000..421d4285 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GcsConfigurationProvider.java @@ -0,0 +1,43 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.gcs; + +import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem; +import io.prestosql.plugin.hive.DynamicConfigurationProvider; +import org.apache.hadoop.conf.Configuration; + +import java.net.URI; + +import static io.prestosql.plugin.hive.DynamicConfigurationProvider.setCacheKey; +import static io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; + +public class GcsConfigurationProvider + implements DynamicConfigurationProvider +{ + private static final String GCS_OAUTH_KEY = "hive.gcs.oauth"; + + @Override + public void updateConfiguration(Configuration configuration, HdfsContext context, URI uri) + { + if (!uri.getScheme().equals(GoogleCloudStorageFileSystem.SCHEME)) { + return; + } + + String accessToken = context.getIdentity().getExtraCredentials().get(GCS_OAUTH_KEY); + if (accessToken != null) { + configuration.set(GcsAccessTokenProvider.GCS_ACCESS_TOKEN_CONF, accessToken); + setCacheKey(configuration, accessToken); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GoogleGcsConfigurationInitializer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GoogleGcsConfigurationInitializer.java new file mode 100644 index 00000000..b75751ba --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/GoogleGcsConfigurationInitializer.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.gcs; + +import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem; +import io.prestosql.plugin.hive.s3.ConfigurationInitializer; +import org.apache.hadoop.conf.Configuration; + +import javax.inject.Inject; + +import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemBase.AUTHENTICATION_PREFIX; +import static com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemConfiguration.AUTH_SERVICE_ACCOUNT_ENABLE; +import static com.google.cloud.hadoop.util.AccessTokenProviderClassFromConfigFactory.ACCESS_TOKEN_PROVIDER_IMPL_SUFFIX; +import static com.google.cloud.hadoop.util.EntriesCredentialConfiguration.JSON_KEYFILE_SUFFIX; + +public class GoogleGcsConfigurationInitializer + implements ConfigurationInitializer +{ + private final boolean useGcsAccessToken; + private final String jsonKeyFilePath; + + @Inject + public GoogleGcsConfigurationInitializer(HiveGcsConfig config) + { + this.useGcsAccessToken = config.isUseGcsAccessToken(); + this.jsonKeyFilePath = config.getJsonKeyFilePath(); + } + + @Override + public void initializeConfiguration(Configuration config) + { + config.set("fs.gs.impl", GoogleHadoopFileSystem.class.getName()); + + if (useGcsAccessToken) { + // use oauth token to authenticate with Google Cloud Storage + config.set(AUTH_SERVICE_ACCOUNT_ENABLE.getKey(), "false"); + config.set(AUTHENTICATION_PREFIX + ACCESS_TOKEN_PROVIDER_IMPL_SUFFIX, GcsAccessTokenProvider.class.getName()); + } + else if (jsonKeyFilePath != null) { + // use service account key file + config.set(AUTH_SERVICE_ACCOUNT_ENABLE.getKey(), "true"); + config.set(AUTHENTICATION_PREFIX + JSON_KEYFILE_SUFFIX, jsonKeyFilePath); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsConfig.java new file mode 100644 index 00000000..fb4c4746 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsConfig.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.gcs; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; + +public class HiveGcsConfig +{ + private boolean useGcsAccessToken; + private String jsonKeyFilePath; + + public String getJsonKeyFilePath() + { + return jsonKeyFilePath; + } + + @Config("hive.gcs.json-key-file-path") + @ConfigDescription("JSON key file used to access Google Cloud Storage") + public HiveGcsConfig setJsonKeyFilePath(String jsonKeyFilePath) + { + this.jsonKeyFilePath = jsonKeyFilePath; + return this; + } + + public boolean isUseGcsAccessToken() + { + return useGcsAccessToken; + } + + @Config("hive.gcs.use-access-token") + @ConfigDescription("Use client-provided OAuth token to access Google Cloud Storage") + public HiveGcsConfig setUseGcsAccessToken(boolean useGcsAccessToken) + { + this.useGcsAccessToken = useGcsAccessToken; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsModule.java new file mode 100644 index 00000000..7e36872a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/gcs/HiveGcsModule.java @@ -0,0 +1,40 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.gcs; + +import com.google.inject.Binder; +import com.google.inject.Scopes; +import com.google.inject.multibindings.Multibinder; +import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.prestosql.plugin.hive.DynamicConfigurationProvider; +import io.prestosql.plugin.hive.s3.ConfigurationInitializer; + +import static com.google.inject.multibindings.Multibinder.newSetBinder; +import static io.airlift.configuration.ConfigBinder.configBinder; + +public class HiveGcsModule + extends AbstractConfigurationAwareModule +{ + @Override + protected void setup(Binder binder) + { + configBinder(binder).bindConfig(HiveGcsConfig.class); + + Multibinder.newSetBinder(binder, ConfigurationInitializer.class).addBinding().to(GoogleGcsConfigurationInitializer.class).in(Scopes.SINGLETON); + + if (buildConfigObject(HiveGcsConfig.class).isUseGcsAccessToken()) { + newSetBinder(binder, DynamicConfigurationProvider.class).addBinding().to(GcsConfigurationProvider.class).in(Scopes.SINGLETON); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/BooleanStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/BooleanStatistics.java new file mode 100644 index 00000000..efc7f0ef --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/BooleanStatistics.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; +import java.util.OptionalLong; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class BooleanStatistics +{ + private final OptionalLong trueCount; + private final OptionalLong falseCount; + + @JsonCreator + public BooleanStatistics( + @JsonProperty("trueCount") OptionalLong trueCount, + @JsonProperty("falseCount") OptionalLong falseCount) + { + this.trueCount = requireNonNull(trueCount, "trueCount is null"); + this.falseCount = requireNonNull(falseCount, "falseCount is null"); + } + + @JsonProperty + public OptionalLong getTrueCount() + { + return trueCount; + } + + @JsonProperty + public OptionalLong getFalseCount() + { + return falseCount; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + BooleanStatistics that = (BooleanStatistics) o; + return Objects.equals(trueCount, that.trueCount) && + Objects.equals(falseCount, that.falseCount); + } + + @Override + public int hashCode() + { + return Objects.hash(trueCount, falseCount); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("trueCount", trueCount) + .add("falseCount", falseCount) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/CachingHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/CachingHiveMetastore.java new file mode 100644 index 00000000..03e53b8b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/CachingHiveMetastore.java @@ -0,0 +1,1353 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; +import com.google.common.collect.SetMultimap; +import com.google.common.util.concurrent.UncheckedExecutionException; +import io.airlift.log.Logger; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.ForCachingHiveMetastore; +import io.prestosql.plugin.hive.ForCachingHiveMetastoreTableRefresh; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.PartitionNotFoundException; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil; +import io.prestosql.spi.NodeManager; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.weakref.jmx.Managed; + +import javax.annotation.concurrent.ThreadSafe; +import javax.inject.Inject; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.function.Function; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Throwables.throwIfInstanceOf; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static com.google.common.cache.CacheLoader.asyncReloading; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap; +import static com.google.common.collect.Iterables.transform; +import static com.google.common.collect.Maps.immutableEntry; +import static com.google.common.collect.Streams.stream; +import static com.google.common.util.concurrent.MoreExecutors.newDirectExecutorService; +import static io.prestosql.plugin.hive.HivePartitionManager.extractPartitionValues; +import static io.prestosql.plugin.hive.metastore.HivePartitionName.hivePartitionName; +import static io.prestosql.plugin.hive.metastore.HiveTableName.hiveTableName; +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.makePartitionName; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +/** + * Hive Metastore Cache + */ +@ThreadSafe +public class CachingHiveMetastore + implements 
HiveMetastore +{ + private static final Logger LOG = Logger.get(CachingHiveMetastore.class); + public static final int PASSIVE_CACHE_VERIFICATION_THRESHOLD = 300 * 1000; + public static final int TABLE_CACHE_CLEANUP_TIME = 2000; + public static final int TABLE_CACHE_REFRESH_TIME = 1000; + + protected final HiveMetastore delegate; + private final LoadingCache> databaseCache; + private final LoadingCache> databaseNamesCache; + private final LoadingCache>> tableNamesCache; + private final LoadingCache>> viewNamesCache; + private final LoadingCache> rolesCache; + private final LoadingCache> roleGrantsCache; + private final LoadingCache> configValuesCache; + + private final LoadingCache, Optional
> tableCache; + private final LoadingCache, Optional>> partitionNamesCache; + private final LoadingCache, WithValidation> tableStatisticsCache; + private final LoadingCache, WithValidation> partitionStatisticsCache; + private final LoadingCache, Optional>> partitionCache; + private final LoadingCache, Optional>>> partitionFilterCache; + + private final boolean skipCache; + private final boolean skipTableCache; + private final boolean dontVerifyCacheEntry; + + @Inject + public CachingHiveMetastore(@ForCachingHiveMetastore HiveMetastore delegate, + @ForCachingHiveMetastore Executor executor, + @ForCachingHiveMetastoreTableRefresh Executor tableRefreshExecutor, + HiveConfig hiveConfig, + NodeManager nodeManager) + { + this( + delegate, + executor, + tableRefreshExecutor, + hiveConfig.getMetastoreCacheTtl(), + hiveConfig.getMetastoreRefreshInterval(), + hiveConfig.getMetastoreDBCacheTtl(), + hiveConfig.getMetastoreDBRefreshInterval(), + hiveConfig.getMetastoreCacheMaximumSize(), + !(nodeManager.getCurrentNode().isCoordinator() || hiveConfig.getWorkerMetaStoreCacheEnabled())); + } + + public CachingHiveMetastore(HiveMetastore delegate, Executor executor, Executor tableRefreshExecutor, Duration cacheTtl, Duration refreshInterval, + Duration dbCacheTtl, Duration dbRefreshInterval, + long maximumSize, boolean skipCache) + { + this( + delegate, + executor, + tableRefreshExecutor, OptionalLong.of(cacheTtl.toMillis()), + refreshInterval.toMillis() >= cacheTtl.toMillis() ? OptionalLong.empty() : OptionalLong.of(refreshInterval.toMillis()), + OptionalLong.of(dbCacheTtl.toMillis()), + dbRefreshInterval.toMillis() >= dbCacheTtl.toMillis() ? OptionalLong.empty() : OptionalLong.of(dbRefreshInterval.toMillis()), + maximumSize, + skipCache); + } + + public static CachingHiveMetastore memoizeMetastore(HiveMetastore delegate, long maximumSize) + { + // If delegate is instance of CachingHiveMetastore, we are bypassing directly to second layer of cache, to get cached values. + return new CachingHiveMetastore( + delegate, + newDirectExecutorService(), + newDirectExecutorService(), + OptionalLong.empty(), + OptionalLong.empty(), + OptionalLong.empty(), + OptionalLong.empty(), + maximumSize, + false || delegate instanceof CachingHiveMetastore); + } + + private CachingHiveMetastore(HiveMetastore delegate, Executor executor, Executor tableRefreshExecutor, + OptionalLong expiresAfterWriteMillisTable, OptionalLong refreshMillsTable, + OptionalLong expiresAfterWriteMillisDB, OptionalLong refreshMillsDB, + long maximumSize, boolean skipCache) + { + boolean dontVerifyCache; + this.delegate = requireNonNull(delegate, "delegate is null"); + requireNonNull(executor, "executor is null"); + + // if refreshMillsDB is present and is 0 , keeps cache unrefreshed. 
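+        // skipCache disables the database-level caches; skipTableCache additionally disables the
+        // table/partition caches when both the table refresh interval and the table TTL are
+        // configured as 0, so those lookups go straight to the delegate metastore.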
+ this.skipCache = skipCache + || (refreshMillsDB.isPresent() && refreshMillsDB.getAsLong() == 0); + this.skipTableCache = skipCache + || (refreshMillsTable.isPresent() && refreshMillsTable.getAsLong() == 0 + && expiresAfterWriteMillisTable.isPresent() && expiresAfterWriteMillisTable.getAsLong() == 0); + + OptionalLong tableCacheTtl; + OptionalLong tableRefreshTtl; + + dontVerifyCache = true; + tableCacheTtl = expiresAfterWriteMillisTable; + tableRefreshTtl = refreshMillsTable; + + if (this.skipTableCache == false) { + long refresh = refreshMillsTable.orElse(0); + long ttl = expiresAfterWriteMillisTable.orElse(0); + if (refresh > PASSIVE_CACHE_VERIFICATION_THRESHOLD + || (0 == refresh && ttl > PASSIVE_CACHE_VERIFICATION_THRESHOLD)) { + dontVerifyCache = false; + tableCacheTtl = OptionalLong.of(TABLE_CACHE_CLEANUP_TIME); + tableRefreshTtl = OptionalLong.of(TABLE_CACHE_REFRESH_TIME); + } + } + + dontVerifyCacheEntry = dontVerifyCache; + databaseNamesCache = newCacheBuilder(expiresAfterWriteMillisDB, refreshMillsDB, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadAllDatabases), executor)); + + databaseCache = newCacheBuilder(expiresAfterWriteMillisDB, refreshMillsDB, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadDatabase), executor)); + + tableNamesCache = newCacheBuilder(expiresAfterWriteMillisDB, refreshMillsDB, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadAllTables), executor)); + + viewNamesCache = newCacheBuilder(expiresAfterWriteMillisDB, refreshMillsDB, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadAllViews), executor)); + + tableCache = newCacheBuilder(tableCacheTtl, tableRefreshTtl, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadTable), tableRefreshExecutor)); + + partitionNamesCache = newCacheBuilder(tableCacheTtl, tableRefreshTtl, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadPartitionNames), tableRefreshExecutor)); + + tableStatisticsCache = newCacheBuilder(expiresAfterWriteMillisTable, refreshMillsTable, maximumSize) + .build(asyncReloading(new CacheLoader, WithValidation>() + { + @Override + public WithValidation load(WithIdentity key) + { + Table table = getExistingTable(key.getIdentity(), key.getKey().getDatabaseName(), key.getKey().getTableName()); + PartitionStatistics ps = loadTableColumnStatistics(key, table); + Table validationParams = getCacheValidationParams(key.getIdentity(), table); + return new WithValidation<>(validationParams, ps); + } + }, executor)); + + partitionStatisticsCache = newCacheBuilder(expiresAfterWriteMillisTable, refreshMillsTable, maximumSize) + .build(asyncReloading(new CacheLoader, WithValidation>() + { + @Override + public WithValidation load(WithIdentity key) + { + return loadPartitionColumnStatistics(key); + } + + @Override + public Map, WithValidation> loadAll(Iterable> keys) + { + return loadPartitionColumnStatistics(keys); + } + }, executor)); + + partitionFilterCache = newCacheBuilder(expiresAfterWriteMillisTable, refreshMillsTable, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadPartitionNamesByParts), executor)); + + partitionCache = newCacheBuilder(expiresAfterWriteMillisTable, refreshMillsTable, maximumSize) + .build(asyncReloading(new CacheLoader, Optional>>() + { + @Override + public Optional> load(WithIdentity partitionName) + { + return loadPartitionByName(partitionName); + } + + @Override + public Map, Optional>> loadAll(Iterable> partitionNames) + { + return loadPartitionsByNames(partitionNames); + } + }, 
executor)); + + rolesCache = newCacheBuilder(expiresAfterWriteMillisDB, refreshMillsDB, maximumSize) + .build(asyncReloading(CacheLoader.from(() -> loadRoles()), executor)); + + roleGrantsCache = newCacheBuilder(expiresAfterWriteMillisDB, refreshMillsDB, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadRoleGrants), executor)); + + configValuesCache = newCacheBuilder(expiresAfterWriteMillisDB, refreshMillsDB, maximumSize) + .build(asyncReloading(CacheLoader.from(this::loadConfigValue), executor)); + } + + @Override + public void refreshMetastoreCache() + { + if (skipCache) { + delegate.refreshMetastoreCache(); + } + flushCache(); + } + + @Managed + public void flushCache() + { + databaseNamesCache.invalidateAll(); + tableNamesCache.invalidateAll(); + viewNamesCache.invalidateAll(); + partitionNamesCache.invalidateAll(); + databaseCache.invalidateAll(); + tableCache.invalidateAll(); + partitionCache.invalidateAll(); + partitionFilterCache.invalidateAll(); + tableStatisticsCache.invalidateAll(); + partitionStatisticsCache.invalidateAll(); + rolesCache.invalidateAll(); + } + + private static V get(LoadingCache cache, K key) + { + try { + return cache.getUnchecked(key); + } + catch (UncheckedExecutionException e) { + throwIfInstanceOf(e.getCause(), PrestoException.class); + throw e; + } + } + + private static Map getAll(LoadingCache cache, Iterable keys) + { + try { + return cache.getAll(keys); + } + catch (ExecutionException | UncheckedExecutionException e) { + throwIfInstanceOf(e.getCause(), PrestoException.class); + throwIfUnchecked(e); + throw new UncheckedExecutionException(e); + } + } + + @Override + public Optional getDatabase(String databaseName) + { + if (skipCache) { + return this.delegate.getDatabase(databaseName); + } + + return get(databaseCache, databaseName); + } + + private Optional loadDatabase(String databaseName) + { + return delegate.getDatabase(databaseName); + } + + @Override + public List getAllDatabases() + { + if (skipCache) { + return this.delegate.getAllDatabases(); + } + + return get(databaseNamesCache, ""); + } + + private List loadAllDatabases() + { + return delegate.getAllDatabases(); + } + + private Table getExistingTable(HiveIdentity identity, String databaseName, String tableName) + { + return getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + } + + private Table getCacheValidationParams(HiveIdentity identity, String databaseName, String tableName) + { + if (dontVerifyCacheEntry) { + return null; + } + + Table table = getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + + if (table.getPartitionColumns().size() > 0) { + Table.Builder builder = Table.builder(table); + getPartitionNames(identity, databaseName, tableName) + .ifPresent(list -> builder.setParameter("partitionNames", String.valueOf(list.hashCode()))); + + return builder.build(); + } + + return table; + } + + private Table getCacheValidationParams(HiveIdentity identity, Table table) + { + if (dontVerifyCacheEntry) { + return null; + } + + if (table.getPartitionColumns().size() > 0) { + Table.Builder builder = Table.builder(table); + getPartitionNames(identity, table.getDatabaseName(), table.getTableName()) + .ifPresent(list -> builder.setParameter("partitionNames", String.valueOf(list.hashCode()))); + + return builder.build(); + } + + return table; + } + + private Table getCacheValidationPartitionParams(Table 
table, HiveBasicStatistics partition) + { + if (dontVerifyCacheEntry) { + return null; + } + + if (table.getPartitionColumns().size() > 0) { + Table.Builder builder = Table.builder(table); + builder.setParameter("partition::rowCount", partition.getRowCount().toString()); + builder.setParameter("partition::fileCount", partition.getFileCount().toString()); + builder.setParameter("partition::inMemSize", partition.getInMemoryDataSizeInBytes().toString()); + builder.setParameter("partition::onDiskSize", partition.getOnDiskDataSizeInBytes().toString()); + + return builder.build(); + } + + return table; + } + + @Override + public Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + identity = updateIdentity(identity); + if (skipTableCache) { + return delegate.getTable(identity, databaseName, tableName); + } + + return get(tableCache, new WithIdentity<>(identity, HiveTableName.hiveTableName(databaseName, tableName))); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + return delegate.getSupportedColumnStatistics(type); + } + + private Optional
loadTable(WithIdentity hiveTableName) + { + Optional
table = delegate.getTable(hiveTableName.getIdentity(), hiveTableName.getKey().getDatabaseName(), hiveTableName.getKey().getTableName()); + Map>> map = tableNamesCache.asMap(); + String databaseName = hiveTableName.getKey().getDatabaseName(); + + if (map.containsKey(databaseName)) { + Optional> allTables = map.get(databaseName); + if (allTables.isPresent()) { + /* New Table or Dropped table */ + if ((table.isPresent() && !allTables.get().contains(hiveTableName.getKey().getTableName())) + || (!table.isPresent() && allTables.get().contains(hiveTableName.getKey().getTableName()))) { + tableNamesCache.invalidate(databaseName); + } + } + } + return table; + } + + @Override + public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + if (skipTableCache) { + return delegate.getTableStatistics(identity, table); + } + + WithIdentity key = new WithIdentity<>(updateIdentity(identity), hiveTableName(table.getDatabaseName(), table.getTableName())); + WithValidation ps = get(tableStatisticsCache, key); + + /* Note: table object need not have partition info for validation; as it will come here for only non-partitioned tables*/ + if (dontVerifyCacheEntry || ps.matches(table)) { + return ps.get(); + } + + tableStatisticsCache.invalidate(key); + return get(tableStatisticsCache, key).get(); + } + + private PartitionStatistics loadTableColumnStatistics(WithIdentity hiveTableName, Table table) + { + return delegate.getTableStatistics(hiveTableName.getIdentity(), table); + } + + @Override + public Map getPartitionStatistics(HiveIdentity identity, Table table, List partitionNames) + { + HiveTableName hiveTableName = hiveTableName(table.getDatabaseName(), table.getTableName()); + List> partitions = partitionNames.stream() + .map(partition -> new WithIdentity<>(updateIdentity(identity), hivePartitionName(hiveTableName, makePartitionName(table, partition)))) + .collect(toImmutableList()); + + if (skipTableCache) { + HiveIdentity identity1 = updateIdentity(identity); + return delegate.getPartitionStatistics(identity1, table, partitionNames); + } + + Map, WithValidation> statistics = getAll(partitionStatisticsCache, partitions); + if (dontVerifyCacheEntry + || statistics.size() == 0) { + return statistics.entrySet() + .stream() + .collect(toImmutableMap(entry -> entry.getKey().getKey().getPartitionName().get(), + entry -> entry.getValue().get())); + } + + Map, WithValidation> finalStatistics = statistics; + boolean allMatch = partitionNames.stream() + .allMatch(partition -> finalStatistics.get(new WithIdentity<>(updateIdentity(identity), hivePartitionName(hiveTableName, makePartitionName(table, partition)))) + .matches(getCacheValidationPartitionParams(table, + ThriftMetastoreUtil.getHiveBasicStatistics(partition.getParameters())))); + if (allMatch) { + return statistics.entrySet() + .stream() + .collect(toImmutableMap(entry -> entry.getKey().getKey().getPartitionName().get(), + entry -> entry.getValue().get())); + } + + partitionCache.invalidate(partitions); + partitionStatisticsCache.invalidate(partitions); + statistics = getAll(partitionStatisticsCache, partitions); + return statistics.entrySet() + .stream() + .collect(toImmutableMap(entry -> entry.getKey().getKey().getPartitionName().get(), + entry -> entry.getValue().get())); + } + + private WithValidation loadPartitionColumnStatistics(WithIdentity partition) + { + HiveTableName hiveTableName = partition.getKey().getHiveTableName(); + HiveIdentity identity = partition.getIdentity(); + + Table table = getExistingTable(identity, 
hiveTableName.getDatabaseName(), hiveTableName.getTableName()); + String partitionName = partition.getKey().getPartitionName().get(); + Map partitionStatistics = delegate.getPartitionStatistics( + identity, + table, + ImmutableList.of(getExistingPartition(identity, table, partition.getKey().getPartitionValues()))); + if (!partitionStatistics.containsKey(partitionName)) { + throw new PrestoException(HiveErrorCode.HIVE_PARTITION_DROPPED_DURING_QUERY, "Statistics result does not contain entry for partition: " + partition.getKey().getPartitionName()); + } + + PartitionStatistics value = partitionStatistics.get(partitionName); + return new WithValidation<>(getCacheValidationPartitionParams(table, value.getBasicStatistics()), value); + } + + private Map, WithValidation> loadPartitionColumnStatistics(Iterable> keys) + { + SetMultimap, WithIdentity> tablePartitions = stream(keys) + .collect(toImmutableSetMultimap(value -> new WithIdentity<>(value.getIdentity(), value.getKey().getHiveTableName()), key -> key)); + ImmutableMap.Builder, WithValidation> result = ImmutableMap.builder(); + tablePartitions.keySet().forEach(tableName -> { + Set> partitionNames = tablePartitions.get(tableName); + Set partitionNameStrings = partitionNames.stream() + .map(partitionName -> partitionName.getKey().getPartitionName().get()) + .collect(toImmutableSet()); + Table table = getExistingTable(tableName.getIdentity(), tableName.getKey().getDatabaseName(), tableName.getKey().getTableName()); + List partitions = getExistingPartitionsByNames(tableName.getIdentity(), table, ImmutableList.copyOf(partitionNameStrings)); + Map statisticsByPartitionName = delegate.getPartitionStatistics(tableName.getIdentity(), table, partitions); + for (WithIdentity partitionName : partitionNames) { + String stringNameForPartition = partitionName.getKey().getPartitionName().get(); + PartitionStatistics value = statisticsByPartitionName.get(stringNameForPartition); + if (value == null) { + throw new PrestoException(HiveErrorCode.HIVE_PARTITION_DROPPED_DURING_QUERY, "Statistics result does not contain entry for partition: " + stringNameForPartition); + } + result.put(partitionName, new WithValidation<>(getCacheValidationPartitionParams(table, value.getBasicStatistics()), value)); + } + }); + return result.build(); + } + + @Override + public void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + identity = updateIdentity(identity); + try { + delegate.updateTableStatistics(identity, databaseName, tableName, update); + } + finally { + tableStatisticsCache.invalidate(new WithIdentity<>(identity, HiveTableName.hiveTableName(databaseName, tableName))); + tableCache.invalidate(new WithIdentity<>(identity, HiveTableName.hiveTableName(databaseName, tableName))); + } + } + + @Override + public void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + identity = updateIdentity(identity); + try { + delegate.updatePartitionStatistics(identity, databaseName, tableName, partitionName, update); + } + finally { + partitionStatisticsCache.invalidate(new WithIdentity<>(identity, HivePartitionName.hivePartitionName(databaseName, tableName, partitionName))); + //statistics updated for partition itself in above call. 
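+            // the cached Partition embeds these statistics in its parameters, so its entry is
+            // invalidated here as well (together with the table entry used for cache validation).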
+ partitionCache.invalidate(new WithIdentity<>(identity, HivePartitionName.hivePartitionName(databaseName, tableName, partitionName))); + tableCache.invalidate(new WithIdentity<>(identity, HiveTableName.hiveTableName(databaseName, tableName))); + } + } + + @Override + public void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + try { + delegate.updatePartitionsStatistics(identity, databaseName, tableName, partNamesUpdateFunctionMap); + } + finally { + partNamesUpdateFunctionMap.entrySet().stream().forEach(e -> { + partitionStatisticsCache.invalidate(new WithIdentity<>(identity, HivePartitionName.hivePartitionName(databaseName, tableName, e.getKey()))); + //statistics updated for partition itself in above call. + partitionCache.invalidate(new WithIdentity<>(identity, HivePartitionName.hivePartitionName(databaseName, tableName, e.getKey()))); + }); + tableCache.invalidate(new WithIdentity<>(identity, HiveTableName.hiveTableName(databaseName, tableName))); + } + } + + @Override + public Optional> getAllTables(String databaseName) + { + if (skipCache) { + return delegate.getAllTables(databaseName); + } + return get(tableNamesCache, databaseName); + } + + private Optional> loadAllTables(String databaseName) + { + return delegate.getAllTables(databaseName); + } + + @Override + public Optional> getAllViews(String databaseName) + { + if (skipCache) { + return delegate.getAllViews(databaseName); + } + return get(viewNamesCache, databaseName); + } + + private Optional> loadAllViews(String databaseName) + { + return delegate.getAllViews(databaseName); + } + + @Override + public void createDatabase(HiveIdentity identity, Database database) + { + identity = updateIdentity(identity); + try { + delegate.createDatabase(identity, database); + } + finally { + invalidateDatabase(database.getDatabaseName()); + } + } + + @Override + public void dropDatabase(HiveIdentity identity, String databaseName) + { + identity = updateIdentity(identity); + try { + delegate.dropDatabase(identity, databaseName); + } + finally { + invalidateDatabase(databaseName); + } + } + + @Override + public void renameDatabase(HiveIdentity identity, String databaseName, String newDatabaseName) + { + identity = updateIdentity(identity); + try { + delegate.renameDatabase(identity, databaseName, newDatabaseName); + } + finally { + invalidateDatabase(databaseName); + invalidateDatabase(newDatabaseName); + } + } + + protected void invalidateDatabase(String databaseName) + { + databaseCache.invalidate(databaseName); + databaseNamesCache.invalidateAll(); + } + + @Override + public void createTable(HiveIdentity identity, Table table, PrincipalPrivileges principalPrivileges) + { + identity = updateIdentity(identity); + try { + delegate.createTable(identity, table, principalPrivileges); + } + finally { + invalidateTable(table.getDatabaseName(), table.getTableName()); + } + } + + @Override + public void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + identity = updateIdentity(identity); + try { + delegate.dropTable(identity, databaseName, tableName, deleteData); + } + finally { + invalidateTable(databaseName, tableName); + } + } + + @Override + public void replaceTable(HiveIdentity identity, String databaseName, String tableName, Table newTable, PrincipalPrivileges principalPrivileges) + { + identity = updateIdentity(identity); + try { + delegate.replaceTable(identity, databaseName, tableName, newTable, 
principalPrivileges); + } + finally { + invalidateTable(databaseName, tableName); + invalidateTable(newTable.getDatabaseName(), newTable.getTableName()); + } + } + + @Override + public void renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName) + { + identity = updateIdentity(identity); + try { + delegate.renameTable(identity, databaseName, tableName, newDatabaseName, newTableName); + } + finally { + invalidateTable(databaseName, tableName); + invalidateTable(newDatabaseName, newTableName); + } + } + + @Override + public void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment) + { + identity = updateIdentity(identity); + try { + delegate.commentTable(identity, databaseName, tableName, comment); + } + finally { + invalidateTable(databaseName, tableName); + } + } + + @Override + public void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment) + { + identity = updateIdentity(identity); + try { + delegate.addColumn(identity, databaseName, tableName, columnName, columnType, columnComment); + } + finally { + invalidateTable(databaseName, tableName); + } + } + + @Override + public void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName) + { + identity = updateIdentity(identity); + try { + delegate.renameColumn(identity, databaseName, tableName, oldColumnName, newColumnName); + } + finally { + invalidateTable(databaseName, tableName); + } + } + + @Override + public void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + identity = updateIdentity(identity); + try { + delegate.dropColumn(identity, databaseName, tableName, columnName); + } + finally { + invalidateTable(databaseName, tableName); + } + } + + protected void invalidateTable(String databaseName, String tableName) + { + invalidateTableCache(databaseName, tableName); + tableNamesCache.invalidate(databaseName); + viewNamesCache.invalidate(databaseName); + invalidateTableStatisticsCache(databaseName, tableName); + invalidatePartitionCache(databaseName, tableName); + } + + private Partition getExistingPartition(HiveIdentity identity, Table table, List partitionValues) + { + return getPartition(identity, table.getDatabaseName(), table.getTableName(), partitionValues) + .orElseThrow(() -> new PartitionNotFoundException(table.getSchemaTableName(), partitionValues)); + } + + private List getExistingPartitionsByNames(HiveIdentity identity, Table table, List partitionNames) + { + Map partitions = getPartitionsByNames(identity, table.getDatabaseName(), table.getTableName(), partitionNames).entrySet().stream() + .map(entry -> immutableEntry(entry.getKey(), entry.getValue().orElseThrow(() -> + new PartitionNotFoundException(table.getSchemaTableName(), extractPartitionValues(entry.getKey()))))) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); + + return partitionNames.stream() + .map(partitions::get) + .collect(toImmutableList()); + } + + private void invalidateTableCache(String databaseName, String tableName) + { + tableCache.asMap().keySet().stream() + .filter(table -> table.getKey().getDatabaseName().equals(databaseName) && table.getKey().getTableName().equals(tableName)) + .forEach(tableCache::invalidate); + } + + private void invalidateTableStatisticsCache(String databaseName, String tableName) + { + 
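+        // statistics entries are keyed by identity and table name, so a direct key lookup is not
+        // possible here; scan the key set and drop every entry that refers to this table
+        // (invalidateTableCache above follows the same pattern for the table cache)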
tableStatisticsCache.asMap().keySet().stream() + .filter(table -> table.getKey().getDatabaseName().equals(databaseName) && table.getKey().getTableName().equals(tableName)) + .forEach(tableStatisticsCache::invalidate); + } + + @Override + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + identity = updateIdentity(identity); + if (skipTableCache) { + return delegate.getPartition(identity, databaseName, tableName, partitionValues); + } + + WithIdentity name = new WithIdentity<>(identity, hivePartitionName(databaseName, tableName, partitionValues)); + Optional> partition = get(partitionCache, name); + if (dontVerifyCacheEntry || !partition.isPresent()) { + return partition.isPresent() ? Optional.of(partition.get().get()) : Optional.empty(); + } + + Table table = getCacheValidationParams(identity, databaseName, tableName); + if (partition.get().matches(table)) { + return Optional.of(partition.get().get()); + } + + partitionCache.invalidate(name); + partition = get(partitionCache, name); + return partition.isPresent() ? Optional.of(partition.get().get()) : Optional.empty(); + } + + @Override + public Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + identity = updateIdentity(identity); + if (skipTableCache) { + return delegate.getPartitionNames(identity, databaseName, tableName); + } + + WithIdentity key = new WithIdentity<>(identity, HiveTableName.hiveTableName(databaseName, tableName)); + return get(partitionNamesCache, key); + } + + private Optional> loadPartitionNames(WithIdentity hiveTableName) + { + return delegate.getPartitionNames(hiveTableName.getIdentity(), hiveTableName.getKey().getDatabaseName(), hiveTableName.getKey().getTableName()); + } + + @Override + public Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + identity = updateIdentity(identity); + if (skipTableCache) { + return delegate.getPartitionNamesByParts(identity, databaseName, tableName, parts); + } + + WithIdentity key = new WithIdentity<>(identity, PartitionFilter.partitionFilter(databaseName, tableName, parts)); + Optional>> values = get(partitionFilterCache, key); + + if (dontVerifyCacheEntry || !values.isPresent()) { + return values.isPresent() ? Optional.of(values.get().get()) : Optional.empty(); + } + + Table table = getCacheValidationParams(identity, databaseName, tableName); + if (values.get().matches(table)) { + return Optional.of(values.get().get()); + } + + partitionFilterCache.invalidate(key); + values = get(partitionFilterCache, key); + return values.isPresent() ? 
Optional.of(values.get().get()) : Optional.empty(); + } + + private Optional>> loadPartitionNamesByParts(WithIdentity partitionFilter) + { + Optional> result = delegate.getPartitionNamesByParts( + partitionFilter.getIdentity(), + partitionFilter.getKey().getHiveTableName().getDatabaseName(), + partitionFilter.getKey().getHiveTableName().getTableName(), + partitionFilter.getKey().getParts()); + + if (result.isPresent()) { + Table table = getCacheValidationParams(partitionFilter.getIdentity(), + partitionFilter.getKey().getHiveTableName().getDatabaseName(), + partitionFilter.getKey().getHiveTableName().getTableName()); + return Optional.of(new WithValidation<>(table, result.get())); + } + + return Optional.empty(); + } + + @Override + public Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + if (skipTableCache) { + HiveIdentity identity1 = updateIdentity(identity); + return delegate.getPartitionsByNames(identity1, databaseName, tableName, partitionNames); + } + + Iterable> names = transform(partitionNames, name -> new WithIdentity<>(updateIdentity(identity), HivePartitionName.hivePartitionName(databaseName, tableName, name))); + + Map, Optional>> all = getAll(partitionCache, names); + ImmutableMap.Builder> partitionsByName = ImmutableMap.builder(); + + if (dontVerifyCacheEntry || all.size() == 0) { + return buildParitionsByName(all, partitionsByName); + } + + Table table = getCacheValidationParams(identity, databaseName, tableName); + if (Iterables.get(all.values(), 0).get().matches(table)) { + return buildParitionsByName(all, partitionsByName); + } + + partitionCache.invalidateAll(names); + all = getAll(partitionCache, names); + return buildParitionsByName(all, partitionsByName); + } + + private Map> buildParitionsByName(Map, Optional>> all, + ImmutableMap.Builder> partitionsByName) + { + for (Entry, Optional>> entry : all.entrySet()) { + partitionsByName.put(entry.getKey().getKey().getPartitionName().get(), + entry.getValue().isPresent() ? 
Optional.of(entry.getValue().get().get()) : Optional.empty()); + } + return partitionsByName.build(); + } + + private Optional> loadPartitionByName(WithIdentity partitionName) + { + Optional result = delegate.getPartition( + partitionName.getIdentity(), + partitionName.getKey().getHiveTableName().getDatabaseName(), + partitionName.getKey().getHiveTableName().getTableName(), + partitionName.getKey().getPartitionValues()); + + if (result.isPresent()) { + Table table = getCacheValidationParams(partitionName.getIdentity(), + partitionName.getKey().getHiveTableName().getDatabaseName(), + partitionName.getKey().getHiveTableName().getTableName()); + + return Optional.of(new WithValidation<>(table, result.get())); + } + + return Optional.empty(); + } + + private Map, Optional>> loadPartitionsByNames(Iterable> partitionNames) + { + requireNonNull(partitionNames, "partitionNames is null"); + checkArgument(!Iterables.isEmpty(partitionNames), "partitionNames is empty"); + + WithIdentity firstPartition = Iterables.get(partitionNames, 0); + + HiveTableName hiveTableName = firstPartition.getKey().getHiveTableName(); + HiveIdentity identity = updateIdentity(firstPartition.getIdentity()); + String databaseName = hiveTableName.getDatabaseName(); + String tableName = hiveTableName.getTableName(); + + List partitionsToFetch = new ArrayList<>(); + for (WithIdentity partitionName : partitionNames) { + checkArgument(partitionName.getKey().getHiveTableName().equals(hiveTableName), "Expected table name %s but got %s", hiveTableName, partitionName.getKey().getHiveTableName()); + partitionsToFetch.add(partitionName.getKey().getPartitionName().get()); + } + + ImmutableMap.Builder, Optional>> partitions = ImmutableMap.builder(); + Map> partitionsByNames = delegate.getPartitionsByNames(identity, databaseName, tableName, partitionsToFetch); + Table table = getCacheValidationParams(identity, databaseName, tableName); + for (Entry> entry : partitionsByNames.entrySet()) { + partitions.put(new WithIdentity<>(identity, HivePartitionName.hivePartitionName(hiveTableName, entry.getKey())), + entry.getValue().isPresent() ? Optional.of(new WithValidation<>(table, entry.getValue().get())) : Optional.empty()); + } + return partitions.build(); + } + + @Override + public void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + identity = updateIdentity(identity); + try { + delegate.addPartitions(identity, databaseName, tableName, partitions); + } + finally { + // todo do we need to invalidate all partitions? 
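+            // invalidatePartitionCache drops every cached partition name, partition, partition filter
+            // and partition-statistics entry for this table; newly added partitions can change the
+            // result of any cached name or filter lookup, so coarse invalidation is the conservative choice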
+ invalidatePartitionCache(databaseName, tableName); + } + } + + @Override + public void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + identity = updateIdentity(identity); + try { + delegate.dropPartition(identity, databaseName, tableName, parts, deleteData); + } + finally { + invalidatePartitionCache(databaseName, tableName); + } + } + + @Override + public void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition) + { + identity = updateIdentity(identity); + try { + delegate.alterPartition(identity, databaseName, tableName, partition); + } + finally { + invalidatePartitionCache(databaseName, tableName); + } + } + + @Override + public void createRole(String role, String grantor) + { + try { + delegate.createRole(role, grantor); + } + finally { + rolesCache.invalidateAll(); + } + } + + @Override + public void dropRole(String role) + { + try { + delegate.dropRole(role); + } + finally { + rolesCache.invalidateAll(); + roleGrantsCache.invalidateAll(); + } + } + + @Override + public Set listRoles() + { + if (skipCache) { + return delegate.listRoles(); + } + return get(rolesCache, ""); + } + + private Set loadRoles() + { + return delegate.listRoles(); + } + + @Override + public void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + try { + delegate.grantRoles(roles, grantees, withAdminOption, grantor); + } + finally { + roleGrantsCache.invalidateAll(); + } + } + + @Override + public void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + try { + delegate.revokeRoles(roles, grantees, adminOptionFor, grantor); + } + finally { + roleGrantsCache.invalidateAll(); + } + } + + @Override + public Set listRoleGrants(HivePrincipal principal) + { + if (skipCache) { + return delegate.listRoleGrants(principal); + } + return get(roleGrantsCache, principal); + } + + private Set loadRoleGrants(HivePrincipal principal) + { + return delegate.listRoleGrants(principal); + } + + private void invalidatePartitionCache(String databaseName, String tableName) + { + HiveTableName hiveTableName = HiveTableName.hiveTableName(databaseName, tableName); + partitionNamesCache.asMap().keySet().stream() + .filter(partitionName -> partitionName.getKey().equals(hiveTableName)) + .forEach(partitionNamesCache::invalidate); + partitionCache.asMap().keySet().stream() + .filter(partitionName -> partitionName.getKey().getHiveTableName().equals(hiveTableName)) + .forEach(partitionCache::invalidate); + partitionFilterCache.asMap().keySet().stream() + .filter(partitionFilter -> partitionFilter.getKey().getHiveTableName().equals(hiveTableName)) + .forEach(partitionFilterCache::invalidate); + partitionStatisticsCache.asMap().keySet().stream() + .filter(partitionFilter -> partitionFilter.getKey().getHiveTableName().equals(hiveTableName)) + .forEach(partitionStatisticsCache::invalidate); + } + + @Override + public void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + delegate.grantTablePrivileges(databaseName, tableName, grantee, privileges); + } + + @Override + public void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + delegate.revokeTablePrivileges(databaseName, tableName, grantee, privileges); + } + + @Override + public Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + return 
delegate.listTablePrivileges(databaseName, tableName, principal); + } + + @Override + public Optional getConfigValue(String name) + { + if (skipCache) { + return delegate.getConfigValue(name); + } + return get(configValuesCache, name); + } + + private Optional loadConfigValue(String name) + { + return delegate.getConfigValue(name); + } + + @Override + public long openTransaction(HiveIdentity identity) + { + return delegate.openTransaction(identity); + } + + @Override + public void commitTransaction(HiveIdentity identity, long transactionId) + { + delegate.commitTransaction(identity, transactionId); + } + + @Override + public void abortTransaction(HiveIdentity identity, long transactionId) + { + delegate.abortTransaction(identity, transactionId); + } + + @Override + public void sendTransactionHeartbeat(HiveIdentity identity, long transactionId) + { + delegate.sendTransactionHeartbeat(identity, transactionId); + } + + @Override + public void acquireSharedReadLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions) + { + delegate.acquireSharedReadLock(identity, queryId, transactionId, fullTables, partitions); + } + + @Override + public void acquireLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions, DataOperationType operationType) + { + delegate.acquireLock(identity, queryId, transactionId, fullTables, partitions, operationType); + } + + @Override + public String getValidWriteIds(HiveIdentity identity, List tables, long currentTransactionId, boolean isVacuum) + { + return delegate.getValidWriteIds(identity, tables, currentTransactionId, isVacuum); + } + + @Override + public ShowLocksResponse showLocks(ShowLocksRequest rqst) + { + return delegate.showLocks(rqst); + } + + @Override + public long getTableWriteId(String dbName, String tableName, long transactionId) + { + return delegate.getTableWriteId(dbName, tableName, transactionId); + } + + @Override + public boolean isImpersonationEnabled() + { + return delegate.isImpersonationEnabled(); + } + + public Set loadTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + return delegate.listTablePrivileges(databaseName, tableName, principal); + } + + private static CacheBuilder newCacheBuilder(OptionalLong expiresAfterWriteMillis, OptionalLong refreshMillis, long maximumSize) + { + CacheBuilder cacheBuilder = CacheBuilder.newBuilder(); + if (expiresAfterWriteMillis.isPresent()) { + cacheBuilder = cacheBuilder.expireAfterWrite(expiresAfterWriteMillis.getAsLong(), MILLISECONDS); + } + if (refreshMillis.isPresent() && (!expiresAfterWriteMillis.isPresent() || expiresAfterWriteMillis.getAsLong() > refreshMillis.getAsLong())) { + cacheBuilder = cacheBuilder.refreshAfterWrite(refreshMillis.getAsLong(), MILLISECONDS); + } + cacheBuilder = cacheBuilder.maximumSize(maximumSize); + return cacheBuilder; + } + + /** + * list the privilege of db + * + * @param databaseName + * @param principal + * @return + */ + @Override + public Set listSchemaPrivileges(String databaseName, String tableName, + HivePrincipal principal) + { + return delegate.listSchemaPrivileges(databaseName, tableName, principal); + } + + /** + * list the privilege of column + * + * @param databaseName + * @param tableName + * @param columnName + * @param principal + * @return + */ + @Override + public Set listColumnPrivileges(String databaseName, String tableName, String columnName, + HivePrincipal principal) + { + return delegate.listColumnPrivileges(databaseName, tableName, 
columnName, principal); + } + + private static class WithIdentity + { + private final HiveIdentity identity; + private final T key; + + public WithIdentity(HiveIdentity identity, T key) + { + this.identity = requireNonNull(identity, "identity is null"); + this.key = requireNonNull(key, "key is null"); + } + + public HiveIdentity getIdentity() + { + return identity; + } + + public T getKey() + { + return key; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + WithIdentity other = (WithIdentity) o; + return Objects.equals(identity, other.identity) && + Objects.equals(key, other.key); + } + + @Override + public int hashCode() + { + return Objects.hash(identity, key); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("identity", identity) + .add("key", key) + .toString(); + } + } + + private HiveIdentity updateIdentity(HiveIdentity identity) + { + // remove identity if not doing impersonation + return delegate.isImpersonationEnabled() ? identity : HiveIdentity.none(); + } + + private static class WithValidation + { + private final K key; + private final T payload; + + public WithValidation(K key, T payload) + { + this.key = key; + this.payload = payload; + } + + public T get() + { + return payload; + } + + public boolean matches(K key) + { + if (this.key == null) { + return true; + } + return Objects.equals(key, this.key); + } + + @Override + public int hashCode() + { + return Objects.hash(key); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + WithValidation that = (WithValidation) o; + return Objects.equals(key, that.key); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Column.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Column.java new file mode 100644 index 00000000..49324d7b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Column.java @@ -0,0 +1,94 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.HiveType; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class Column +{ + private final String name; + private final HiveType type; + private final Optional comment; + + @JsonCreator + public Column( + @JsonProperty("name") String name, + @JsonProperty("type") HiveType type, + @JsonProperty("comment") Optional comment) + { + this.name = requireNonNull(name, "name is null"); + this.type = requireNonNull(type, "type is null"); + this.comment = requireNonNull(comment, "comment is null"); + } + + @JsonProperty + public String getName() + { + return name; + } + + @JsonProperty + public HiveType getType() + { + return type; + } + + @JsonProperty + public Optional getComment() + { + return comment; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("name", name) + .add("type", type) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Column column = (Column) o; + return Objects.equals(name, column.name) && + Objects.equals(type, column.type) && + Objects.equals(comment, column.comment); + } + + @Override + public int hashCode() + { + return Objects.hash(name, type, comment); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Database.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Database.java new file mode 100644 index 00000000..18eae5d0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Database.java @@ -0,0 +1,218 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.security.PrincipalType; + +import javax.annotation.concurrent.Immutable; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class Database +{ + public static final String DEFAULT_DATABASE_NAME = "default"; + + private final String databaseName; + private final Optional location; + private final String ownerName; + private final PrincipalType ownerType; + private final Optional comment; + private final Map parameters; + + @JsonCreator + public Database( + @JsonProperty("databaseName") String databaseName, + @JsonProperty("location") Optional location, + @JsonProperty("ownerName") String ownerName, + @JsonProperty("ownerType") PrincipalType ownerType, + @JsonProperty("comment") Optional comment, + @JsonProperty("parameters") Map parameters) + { + this.databaseName = requireNonNull(databaseName, "databaseName is null"); + this.location = requireNonNull(location, "location is null"); + this.ownerName = requireNonNull(ownerName, "ownerName is null"); + this.ownerType = requireNonNull(ownerType, "ownerType is null"); + this.comment = requireNonNull(comment, "comment is null"); + this.parameters = ImmutableMap.copyOf(requireNonNull(parameters, "parameters is null")); + } + + @JsonProperty + public String getDatabaseName() + { + return databaseName; + } + + @JsonProperty + public Optional getLocation() + { + return location; + } + + @JsonProperty + public String getOwnerName() + { + return ownerName; + } + + @JsonProperty + public PrincipalType getOwnerType() + { + return ownerType; + } + + @JsonProperty + public Optional getComment() + { + return comment; + } + + @JsonProperty + public Map getParameters() + { + return parameters; + } + + public static Builder builder() + { + return new Builder(); + } + + public static Builder builder(Database database) + { + return new Builder(database); + } + + public static class Builder + { + private String databaseName; + private Optional location = Optional.empty(); + private String ownerName; + private PrincipalType ownerType; + private Optional comment = Optional.empty(); + private Map parameters = new LinkedHashMap<>(); + + public Builder() {} + + public Builder(Database database) + { + this.databaseName = database.databaseName; + this.location = database.location; + this.ownerName = database.ownerName; + this.ownerType = database.ownerType; + this.comment = database.comment; + this.parameters = database.parameters; + } + + public Builder setDatabaseName(String databaseName) + { + requireNonNull(databaseName, "databaseName is null"); + this.databaseName = databaseName; + return this; + } + + public Builder setLocation(Optional location) + { + requireNonNull(location, "location is null"); + this.location = location; + return this; + } + + public Builder setOwnerName(String ownerName) + { + requireNonNull(ownerName, "ownerName is null"); + this.ownerName = ownerName; + return this; + } + + public Builder setOwnerType(PrincipalType ownerType) + { + requireNonNull(ownerType, "ownerType is null"); + this.ownerType = ownerType; + return this; + } + + public Builder setComment(Optional comment) + { + requireNonNull(comment, "comment is 
null"); + this.comment = comment; + return this; + } + + public Builder setParameters(Map parameters) + { + requireNonNull(parameters, "parameters is null"); + this.parameters = parameters; + return this; + } + + public Database build() + { + return new Database( + databaseName, + location, + ownerName, + ownerType, + comment, + parameters); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("databaseName", databaseName) + .add("location", location) + .add("ownerName", ownerName) + .add("ownerType", ownerType) + .add("comment", comment) + .add("parameters", parameters) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Database database = (Database) o; + return Objects.equals(databaseName, database.databaseName) && + Objects.equals(location, database.location) && + Objects.equals(ownerName, database.ownerName) && + ownerType == database.ownerType && + Objects.equals(comment, database.comment) && + Objects.equals(parameters, database.parameters); + } + + @Override + public int hashCode() + { + return Objects.hash(databaseName, location, ownerName, ownerType, comment, parameters); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DateStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DateStatistics.java new file mode 100644 index 00000000..f9903713 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DateStatistics.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive.metastore;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import javax.annotation.concurrent.Immutable;
+
+import java.time.LocalDate;
+import java.util.Objects;
+import java.util.Optional;
+
+import static com.google.common.base.MoreObjects.toStringHelper;
+import static java.util.Objects.requireNonNull;
+
+@Immutable
+public class DateStatistics
+{
+    private final Optional<LocalDate> min;
+    private final Optional<LocalDate> max;
+
+    @JsonCreator
+    public DateStatistics(
+            @JsonProperty("min") Optional<LocalDate> min,
+            @JsonProperty("max") Optional<LocalDate> max)
+    {
+        this.min = requireNonNull(min, "min is null");
+        this.max = requireNonNull(max, "max is null");
+    }
+
+    @JsonProperty
+    public Optional<LocalDate> getMin()
+    {
+        return min;
+    }
+
+    @JsonProperty
+    public Optional<LocalDate> getMax()
+    {
+        return max;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        DateStatistics that = (DateStatistics) o;
+        return Objects.equals(min, that.min) &&
+                Objects.equals(max, that.max);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(min, max);
+    }
+
+    @Override
+    public String toString()
+    {
+        return toStringHelper(this)
+                .add("min", min)
+                .add("max", max)
+                .toString();
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DecimalStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DecimalStatistics.java
new file mode 100644
index 00000000..8bfb277a
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DecimalStatistics.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.metastore;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import javax.annotation.concurrent.Immutable;
+
+import java.math.BigDecimal;
+import java.util.Objects;
+import java.util.Optional;
+
+import static com.google.common.base.MoreObjects.toStringHelper;
+import static java.util.Objects.requireNonNull;
+
+@Immutable
+public class DecimalStatistics
+{
+    private final Optional<BigDecimal> min;
+    private final Optional<BigDecimal> max;
+
+    @JsonCreator
+    public DecimalStatistics(
+            @JsonProperty("min") Optional<BigDecimal> min,
+            @JsonProperty("max") Optional<BigDecimal> max)
+    {
+        this.min = requireNonNull(min, "min is null");
+        this.max = requireNonNull(max, "max is null");
+    }
+
+    @JsonProperty
+    public Optional<BigDecimal> getMin()
+    {
+        return min;
+    }
+
+    @JsonProperty
+    public Optional<BigDecimal> getMax()
+    {
+        return max;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        DecimalStatistics that = (DecimalStatistics) o;
+        return Objects.equals(min, that.min) &&
+                Objects.equals(max, that.max);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(min, max);
+    }
+
+    @Override
+    public String toString()
+    {
+        return toStringHelper(this)
+                .add("min", min)
+                .add("max", max)
+                .toString();
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DoubleStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DoubleStatistics.java
new file mode 100644
index 00000000..ae751322
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/DoubleStatistics.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.metastore;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import javax.annotation.concurrent.Immutable;
+
+import java.util.Objects;
+import java.util.OptionalDouble;
+
+import static com.google.common.base.MoreObjects.toStringHelper;
+import static java.util.Objects.requireNonNull;
+
+@Immutable
+public class DoubleStatistics
+{
+    private final OptionalDouble min;
+    private final OptionalDouble max;
+
+    @JsonCreator
+    public DoubleStatistics(
+            @JsonProperty("min") OptionalDouble min,
+            @JsonProperty("max") OptionalDouble max)
+    {
+        this.min = requireNonNull(min, "min is null");
+        this.max = requireNonNull(max, "max is null");
+    }
+
+    @JsonProperty
+    public OptionalDouble getMin()
+    {
+        return min;
+    }
+
+    @JsonProperty
+    public OptionalDouble getMax()
+    {
+        return max;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        DoubleStatistics that = (DoubleStatistics) o;
+        return Objects.equals(min, that.min) &&
+                Objects.equals(max, that.max);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(min, max);
+    }
+
+    @Override
+    public String toString()
+    {
+        return toStringHelper(this)
+                .add("min", min)
+                .add("max", max)
+                .toString();
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveColumnStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveColumnStatistics.java
new file mode 100644
index 00000000..50cb7840
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveColumnStatistics.java
@@ -0,0 +1,417 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import javax.annotation.concurrent.Immutable; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +@Immutable +public class HiveColumnStatistics +{ + private static final HiveColumnStatistics EMPTY = HiveColumnStatistics.builder().build(); + + private final Optional integerStatistics; + private final Optional doubleStatistics; + private final Optional decimalStatistics; + private final Optional dateStatistics; + private final Optional booleanStatistics; + private final OptionalLong maxValueSizeInBytes; + private final OptionalLong totalSizeInBytes; + private final OptionalLong nullsCount; + private final OptionalLong distinctValuesCount; + + public static HiveColumnStatistics empty() + { + return EMPTY; + } + + @JsonCreator + public HiveColumnStatistics( + @JsonProperty("integerStatistics") Optional integerStatistics, + @JsonProperty("doubleStatistics") Optional doubleStatistics, + @JsonProperty("decimalStatistics") Optional decimalStatistics, + @JsonProperty("dateStatistics") Optional dateStatistics, + @JsonProperty("booleanStatistics") Optional booleanStatistics, + @JsonProperty("maxValueSizeInBytes") OptionalLong maxValueSizeInBytes, + @JsonProperty("totalSizeInBytes") OptionalLong totalSizeInBytes, + @JsonProperty("nullsCount") OptionalLong nullsCount, + @JsonProperty("distinctValuesCount") OptionalLong distinctValuesCount) + { + this.integerStatistics = requireNonNull(integerStatistics, "integerStatistics is null"); + this.doubleStatistics = requireNonNull(doubleStatistics, "doubleStatistics is null"); + this.decimalStatistics = requireNonNull(decimalStatistics, "decimalStatistics is null"); + this.dateStatistics = requireNonNull(dateStatistics, "dateStatistics is null"); + this.booleanStatistics = requireNonNull(booleanStatistics, "booleanStatistics is null"); + this.maxValueSizeInBytes = requireNonNull(maxValueSizeInBytes, "maxValueSizeInBytes is null"); + this.totalSizeInBytes = requireNonNull(totalSizeInBytes, "totalSizeInBytes is null"); + this.nullsCount = requireNonNull(nullsCount, "nullsCount is null"); + this.distinctValuesCount = requireNonNull(distinctValuesCount, "distinctValuesCount is null"); + + List presentStatistics = new ArrayList<>(); + integerStatistics.ifPresent(s -> presentStatistics.add("integerStatistics")); + doubleStatistics.ifPresent(s -> presentStatistics.add("doubleStatistics")); + decimalStatistics.ifPresent(s -> presentStatistics.add("decimalStatistics")); + dateStatistics.ifPresent(s -> presentStatistics.add("dateStatistics")); + booleanStatistics.ifPresent(s -> presentStatistics.add("booleanStatistics")); + checkArgument(presentStatistics.size() <= 1, "multiple type specific statistic objects are present: %s", presentStatistics); + } + + @JsonProperty + public Optional getIntegerStatistics() + { + return integerStatistics; + } + + @JsonProperty + public Optional getDoubleStatistics() + { + return doubleStatistics; + } + + @JsonProperty + public Optional getDecimalStatistics() + { + return decimalStatistics; + } + + @JsonProperty 
+ public Optional getDateStatistics() + { + return dateStatistics; + } + + @JsonProperty + public Optional getBooleanStatistics() + { + return booleanStatistics; + } + + @JsonProperty + public OptionalLong getMaxValueSizeInBytes() + { + return maxValueSizeInBytes; + } + + @JsonProperty + public OptionalLong getTotalSizeInBytes() + { + return totalSizeInBytes; + } + + @JsonProperty + public OptionalLong getNullsCount() + { + return nullsCount; + } + + @JsonProperty + public OptionalLong getDistinctValuesCount() + { + return distinctValuesCount; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HiveColumnStatistics that = (HiveColumnStatistics) o; + return Objects.equals(integerStatistics, that.integerStatistics) && + Objects.equals(doubleStatistics, that.doubleStatistics) && + Objects.equals(decimalStatistics, that.decimalStatistics) && + Objects.equals(dateStatistics, that.dateStatistics) && + Objects.equals(booleanStatistics, that.booleanStatistics) && + Objects.equals(maxValueSizeInBytes, that.maxValueSizeInBytes) && + Objects.equals(totalSizeInBytes, that.totalSizeInBytes) && + Objects.equals(nullsCount, that.nullsCount) && + Objects.equals(distinctValuesCount, that.distinctValuesCount); + } + + @Override + public int hashCode() + { + return Objects.hash( + integerStatistics, + doubleStatistics, + decimalStatistics, + dateStatistics, + booleanStatistics, + maxValueSizeInBytes, + totalSizeInBytes, + nullsCount, + distinctValuesCount); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("integerStatistics", integerStatistics) + .add("doubleStatistics", doubleStatistics) + .add("decimalStatistics", decimalStatistics) + .add("dateStatistics", dateStatistics) + .add("booleanStatistics", booleanStatistics) + .add("maxValueSizeInBytes", maxValueSizeInBytes) + .add("totalSizeInBytes", totalSizeInBytes) + .add("nullsCount", nullsCount) + .add("distinctValuesCount", distinctValuesCount) + .toString(); + } + + public static HiveColumnStatistics createIntegerColumnStatistics(OptionalLong min, OptionalLong max, OptionalLong nullsCount, OptionalLong distinctValuesCount) + { + return builder() + .setIntegerStatistics(new IntegerStatistics(min, max)) + .setNullsCount(nullsCount) + .setDistinctValuesCount(distinctValuesCount) + .build(); + } + + public static HiveColumnStatistics createDoubleColumnStatistics(OptionalDouble min, OptionalDouble max, OptionalLong nullsCount, OptionalLong distinctValuesCount) + { + return builder() + .setDoubleStatistics(new DoubleStatistics(min, max)) + .setNullsCount(nullsCount) + .setDistinctValuesCount(distinctValuesCount) + .build(); + } + + public static HiveColumnStatistics createDecimalColumnStatistics(Optional min, Optional max, OptionalLong nullsCount, OptionalLong distinctValuesCount) + { + return builder() + .setDecimalStatistics(new DecimalStatistics(min, max)) + .setNullsCount(nullsCount) + .setDistinctValuesCount(distinctValuesCount) + .build(); + } + + public static HiveColumnStatistics createDateColumnStatistics(Optional min, Optional max, OptionalLong nullsCount, OptionalLong distinctValuesCount) + { + return builder() + .setDateStatistics(new DateStatistics(min, max)) + .setNullsCount(nullsCount) + .setDistinctValuesCount(distinctValuesCount) + .build(); + } + + public static HiveColumnStatistics createBooleanColumnStatistics(OptionalLong trueCount, OptionalLong falseCount, OptionalLong nullsCount) + { + 
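+        // boolean columns get no distinct-values count here: the true, false and null counts
+        // already describe the full value distribution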
return builder() + .setBooleanStatistics(new BooleanStatistics(trueCount, falseCount)) + .setNullsCount(nullsCount) + .build(); + } + + public static HiveColumnStatistics createStringColumnStatistics( + OptionalLong maxValueSizeInBytes, + OptionalLong totalSizeInBytes, + OptionalLong nullsCount, + OptionalLong distinctValuesCount) + { + return builder() + .setMaxValueSizeInBytes(maxValueSizeInBytes) + .setTotalSizeInBytes(totalSizeInBytes) + .setNullsCount(nullsCount) + .setDistinctValuesCount(distinctValuesCount) + .build(); + } + + public static HiveColumnStatistics createBinaryColumnStatistics(OptionalLong maxValueSizeInBytes, OptionalLong totalSizeInBytes, OptionalLong nullsCount) + { + return builder() + .setMaxValueSizeInBytes(maxValueSizeInBytes) + .setTotalSizeInBytes(totalSizeInBytes) + .setNullsCount(nullsCount) + .build(); + } + + public static Builder builder(HiveColumnStatistics other) + { + return new Builder(other); + } + + public static Builder builder() + { + return new Builder(); + } + + public static class Builder + { + private Optional integerStatistics = Optional.empty(); + private Optional doubleStatistics = Optional.empty(); + private Optional decimalStatistics = Optional.empty(); + private Optional dateStatistics = Optional.empty(); + private Optional booleanStatistics = Optional.empty(); + private OptionalLong maxValueSizeInBytes = OptionalLong.empty(); + private OptionalLong totalSizeInBytes = OptionalLong.empty(); + private OptionalLong nullsCount = OptionalLong.empty(); + private OptionalLong distinctValuesCount = OptionalLong.empty(); + + private Builder() {} + + private Builder(HiveColumnStatistics other) + { + this.integerStatistics = other.getIntegerStatistics(); + this.doubleStatistics = other.getDoubleStatistics(); + this.decimalStatistics = other.getDecimalStatistics(); + this.dateStatistics = other.getDateStatistics(); + this.booleanStatistics = other.getBooleanStatistics(); + this.maxValueSizeInBytes = other.getMaxValueSizeInBytes(); + this.totalSizeInBytes = other.getTotalSizeInBytes(); + this.nullsCount = other.getNullsCount(); + this.distinctValuesCount = other.getDistinctValuesCount(); + } + + public Builder setIntegerStatistics(Optional integerStatistics) + { + this.integerStatistics = integerStatistics; + return this; + } + + public Builder setIntegerStatistics(IntegerStatistics integerStatistics) + { + this.integerStatistics = Optional.of(integerStatistics); + return this; + } + + public Builder setDoubleStatistics(Optional doubleStatistics) + { + this.doubleStatistics = doubleStatistics; + return this; + } + + public Builder setDoubleStatistics(DoubleStatistics doubleStatistics) + { + this.doubleStatistics = Optional.of(doubleStatistics); + return this; + } + + public Builder setDecimalStatistics(Optional decimalStatistics) + { + this.decimalStatistics = decimalStatistics; + return this; + } + + public Builder setDecimalStatistics(DecimalStatistics decimalStatistics) + { + this.decimalStatistics = Optional.of(decimalStatistics); + return this; + } + + public Builder setDateStatistics(Optional dateStatistics) + { + this.dateStatistics = dateStatistics; + return this; + } + + public Builder setDateStatistics(DateStatistics dateStatistics) + { + this.dateStatistics = Optional.of(dateStatistics); + return this; + } + + public Builder setBooleanStatistics(Optional booleanStatistics) + { + this.booleanStatistics = booleanStatistics; + return this; + } + + public Builder setBooleanStatistics(BooleanStatistics booleanStatistics) + { + 
this.booleanStatistics = Optional.of(booleanStatistics); + return this; + } + + public Builder setMaxValueSizeInBytes(long maxValueSizeInBytes) + { + this.maxValueSizeInBytes = OptionalLong.of(maxValueSizeInBytes); + return this; + } + + public Builder setMaxValueSizeInBytes(OptionalLong maxValueSizeInBytes) + { + this.maxValueSizeInBytes = maxValueSizeInBytes; + return this; + } + + public Builder setTotalSizeInBytes(long totalSizeInBytes) + { + this.totalSizeInBytes = OptionalLong.of(totalSizeInBytes); + return this; + } + + public Builder setTotalSizeInBytes(OptionalLong totalSizeInBytes) + { + this.totalSizeInBytes = totalSizeInBytes; + return this; + } + + public Builder setNullsCount(OptionalLong nullsCount) + { + this.nullsCount = nullsCount; + return this; + } + + public Builder setNullsCount(long nullsCount) + { + this.nullsCount = OptionalLong.of(nullsCount); + return this; + } + + public Builder setDistinctValuesCount(OptionalLong distinctValuesCount) + { + this.distinctValuesCount = distinctValuesCount; + return this; + } + + public Builder setDistinctValuesCount(long distinctValuesCount) + { + this.distinctValuesCount = OptionalLong.of(distinctValuesCount); + return this; + } + + public HiveColumnStatistics build() + { + return new HiveColumnStatistics( + integerStatistics, + doubleStatistics, + decimalStatistics, + dateStatistics, + booleanStatistics, + maxValueSizeInBytes, + totalSizeInBytes, + nullsCount, + distinctValuesCount); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastore.java new file mode 100644 index 00000000..a2022074 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastore.java @@ -0,0 +1,200 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; + +public interface HiveMetastore +{ + Optional getDatabase(String databaseName); + + List getAllDatabases(); + + Optional
getTable(HiveIdentity identity, String databaseName, String tableName); + + Set getSupportedColumnStatistics(Type type); + + PartitionStatistics getTableStatistics(HiveIdentity identity, Table table); + + Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions); + + void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update); + + void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update); + + void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap); + + Optional> getAllTables(String databaseName); + + Optional> getAllViews(String databaseName); + + void createDatabase(HiveIdentity identity, Database database); + + void dropDatabase(HiveIdentity identity, String databaseName); + + void renameDatabase(HiveIdentity identity, String databaseName, String newDatabaseName); + + void createTable(HiveIdentity identity, Table table, PrincipalPrivileges principalPrivileges); + + void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData); + + /** + * This should only be used if the semantic here is drop and add. Trying to + * alter one field of a table object previously acquired from getTable is + * probably not what you want. + */ + void replaceTable(HiveIdentity identity, String databaseName, String tableName, Table newTable, PrincipalPrivileges principalPrivileges); + + void renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName); + + void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment); + + void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment); + + void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName); + + void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName); + + Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues); + + Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName); + + Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts); + + Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames); + + void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions); + + void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData); + + void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition); + + void createRole(String role, String grantor); + + void dropRole(String role); + + Set listRoles(); + + void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor); + + void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor); + + Set listRoleGrants(HivePrincipal principal); + + void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges); + + void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges); + + Set listTablePrivileges(String databaseName, String tableName, HivePrincipal 
principal); + + default long openTransaction(HiveIdentity identity) + { + throw new UnsupportedOperationException(); + } + + default void commitTransaction(HiveIdentity identity, long transactionId) + { + throw new UnsupportedOperationException(); + } + + default void abortTransaction(HiveIdentity identity, long transactionId) + { + throw new UnsupportedOperationException(); + } + + default void sendTransactionHeartbeat(HiveIdentity identity, long transactionId) + { + throw new UnsupportedOperationException(); + } + + default void acquireSharedReadLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions) + { + throw new UnsupportedOperationException(); + } + + default void acquireLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions, DataOperationType operationType) + { + throw new UnsupportedOperationException(); + } + + default String getValidWriteIds(HiveIdentity identity, List tables, long currentTransactionId, boolean isVacuum) + { + throw new UnsupportedOperationException(); + } + + default ShowLocksResponse showLocks(ShowLocksRequest rqst) + { + throw new UnsupportedOperationException(); + } + + default Optional getConfigValue(String name) + { + return Optional.empty(); + } + + default long getTableWriteId(String dbName, String tableName, long transactionId) + { + throw new UnsupportedOperationException(); + } + + /** + * list Privileges of Column + * + * @param databaseName databaseName + * @param tableName tableName + * @param columnName columnName + * @param principal principal + * @return HivePrivilegeInfo + */ + default Set listColumnPrivileges(String databaseName, String tableName, String columnName, + HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + /** + * list privilege of Schema(database) + * + * @param databaseName databaseName + * @param tableName tableName + * @param principal principal + * @return HivePrivilegeInfo + */ + default Set listSchemaPrivileges(String databaseName, String tableName, HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + boolean isImpersonationEnabled(); + + default void refreshMetastoreCache() + { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastoreModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastoreModule.java new file mode 100644 index 00000000..d8ff3e57 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveMetastoreModule.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.google.inject.Binder; +import com.google.inject.Module; +import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.prestosql.plugin.hive.metastore.file.FileMetastoreModule; +import io.prestosql.plugin.hive.metastore.glue.GlueMetastoreModule; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreModule; + +import java.util.Optional; + +import static io.airlift.configuration.ConditionalModule.installModuleIf; + +public class HiveMetastoreModule + extends AbstractConfigurationAwareModule +{ + private final Optional metastore; + + public HiveMetastoreModule(Optional metastore) + { + this.metastore = metastore; + } + + @Override + protected void setup(Binder binder) + { + if (metastore.isPresent()) { + binder.bind(HiveMetastore.class).toInstance(metastore.get()); + } + else { + bindMetastoreModule("thrift", new ThriftMetastoreModule()); + bindMetastoreModule("file", new FileMetastoreModule()); + bindMetastoreModule("glue", new GlueMetastoreModule()); + } + } + + private void bindMetastoreModule(String name, Module module) + { + install(installModuleIf( + MetastoreConfig.class, + metastore -> name.equalsIgnoreCase(metastore.getMetastoreType()), + module)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadata.java new file mode 100644 index 00000000..df7e5936 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadata.java @@ -0,0 +1,118 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.spi.connector.SchemaTableName; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class HivePageSinkMetadata +{ + private final SchemaTableName schemaTableName; + private final Optional
table; + private final Map, Optional> modifiedPartitions; + + public HivePageSinkMetadata( + SchemaTableName schemaTableName, + Optional
table, + Map, Optional> modifiedPartitions) + { + this.schemaTableName = requireNonNull(schemaTableName, "schemaTableName is null"); + this.table = requireNonNull(table, "table is null"); + this.modifiedPartitions = requireNonNull(modifiedPartitions, "modifiedPartitions is null"); + checkArgument(table.isPresent() && !table.get().getPartitionColumns().isEmpty() || modifiedPartitions.isEmpty()); + } + + @JsonCreator + public static HivePageSinkMetadata deserialize( + @JsonProperty("schemaTableName") SchemaTableName schemaTableName, + @JsonProperty("table") Optional
table, + @JsonProperty("modifiedPartitions") List, Optional>> modifiedPartitions) + { + requireNonNull(modifiedPartitions, "modifiedPartitions is null"); + return new HivePageSinkMetadata(schemaTableName, table, JsonSerializableEntry.toMap(modifiedPartitions)); + } + + @JsonProperty + public SchemaTableName getSchemaTableName() + { + return schemaTableName; + } + + /** + * This method returns empty when the table has not been created yet (i.e. for CREATE TABLE AS SELECT queries) + */ + @JsonProperty + public Optional
getTable() + { + return table; + } + + @JsonProperty("modifiedPartitions") + public List, Optional>> getJsonSerializableModifiedPartitions() + { + return JsonSerializableEntry.fromMap(modifiedPartitions); + } + + public Map, Optional> getModifiedPartitions() + { + return modifiedPartitions; + } + + public static class JsonSerializableEntry + { + private final K key; + private final V value; + + @JsonCreator + public JsonSerializableEntry(@JsonProperty("key") K key, @JsonProperty("value") V value) + { + this.key = key; + this.value = value; + } + + @JsonProperty + public K getKey() + { + return key; + } + + @JsonProperty + public V getValue() + { + return value; + } + + public static List> fromMap(Map map) + { + return map.entrySet().stream() + .map(entry -> new JsonSerializableEntry<>(entry.getKey(), entry.getValue())) + .collect(Collectors.toList()); + } + + public static Map toMap(List> list) + { + return list.stream() + .collect(Collectors.toMap(JsonSerializableEntry::getKey, JsonSerializableEntry::getValue)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadataProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadataProvider.java new file mode 100644 index 00000000..e569f243 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePageSinkMetadataProvider.java @@ -0,0 +1,63 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.spi.connector.SchemaTableName; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HivePageSinkMetadataProvider +{ + private final HiveIdentity identity; + private final HiveMetastore delegate; + private final SchemaTableName schemaTableName; + private final Optional
<Table> table; + private final Map<List<String>, Optional<Partition>> modifiedPartitions; + + public HivePageSinkMetadataProvider(HivePageSinkMetadata pageSinkMetadata, HiveMetastore delegate, HiveIdentity identity) + { + requireNonNull(pageSinkMetadata, "pageSinkMetadata is null"); + this.delegate = delegate; + this.identity = requireNonNull(identity, "identity is null"); + this.schemaTableName = pageSinkMetadata.getSchemaTableName(); + this.table = pageSinkMetadata.getTable(); + this.modifiedPartitions = pageSinkMetadata.getModifiedPartitions(); + } + + public Optional<Table>
getTable() + { + return table; + } + + public Optional getPartition(List partitionValues) + { + if (!table.isPresent() || table.get().getPartitionColumns().isEmpty()) { + throw new IllegalArgumentException( + format("Unexpected call to getPartition. Table name: %s", schemaTableName)); + } + Optional modifiedPartition = modifiedPartitions.get(partitionValues); + if (modifiedPartition == null) { + return delegate.getPartition(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName(), partitionValues); + } + else { + return modifiedPartition; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePartitionName.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePartitionName.java new file mode 100644 index 00000000..ca855705 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePartitionName.java @@ -0,0 +1,111 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; + +import javax.annotation.concurrent.Immutable; + +import java.util.List; +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static java.util.Objects.requireNonNull; + +@Immutable +public class HivePartitionName +{ + private final HiveTableName hiveTableName; + private final List partitionValues; + private final Optional partitionName; // does not participate in hashCode/equals + + @JsonCreator + public HivePartitionName( + @JsonProperty("hiveTableName") HiveTableName hiveTableName, + @JsonProperty("partitionValues") List partitionValues, + @JsonProperty("partitionName") Optional partitionName) + { + this.hiveTableName = requireNonNull(hiveTableName, "hiveTableName is null"); + this.partitionValues = ImmutableList.copyOf(requireNonNull(partitionValues, "partitionValues is null")); + this.partitionName = requireNonNull(partitionName, "partitionName is null"); + } + + public static HivePartitionName hivePartitionName(HiveTableName hiveTableName, String partitionName) + { + return new HivePartitionName(hiveTableName, toPartitionValues(partitionName), Optional.of(partitionName)); + } + + public static HivePartitionName hivePartitionName(String databaseName, String tableName, String partitionName) + { + return hivePartitionName(HiveTableName.hiveTableName(databaseName, tableName), partitionName); + } + + public static HivePartitionName hivePartitionName(String databaseName, String tableName, List partitionValues) + { + return new HivePartitionName(HiveTableName.hiveTableName(databaseName, tableName), partitionValues, Optional.empty()); + } + + @JsonProperty + public 
HiveTableName getHiveTableName() + { + return hiveTableName; + } + + @JsonProperty + public List getPartitionValues() + { + return partitionValues; + } + + @JsonProperty + public Optional getPartitionName() + { + return partitionName; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("hiveTableName", hiveTableName) + .add("partitionValues", partitionValues) + .add("partitionName", partitionName) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + HivePartitionName other = (HivePartitionName) o; + return Objects.equals(hiveTableName, other.hiveTableName) && + Objects.equals(partitionValues, other.partitionValues); + } + + @Override + public int hashCode() + { + return Objects.hash(hiveTableName, partitionValues); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrincipal.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrincipal.java new file mode 100644 index 00000000..83bcfdca --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrincipal.java @@ -0,0 +1,104 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.spi.security.PrestoPrincipal; +import io.prestosql.spi.security.PrincipalType; + +import java.util.Objects; +import java.util.Set; + +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static java.util.Objects.requireNonNull; + +public class HivePrincipal +{ + public static Set from(Set prestoPrincipals) + { + return prestoPrincipals.stream() + .map(HivePrincipal::from) + .collect(toImmutableSet()); + } + + public static HivePrincipal from(PrestoPrincipal prestoPrincipal) + { + return new HivePrincipal(prestoPrincipal.getType(), prestoPrincipal.getName()); + } + + private final PrincipalType type; + private final String name; + + @JsonCreator + public HivePrincipal(@JsonProperty("type") PrincipalType type, @JsonProperty("name") String name) + { + this.type = requireNonNull(type, "type is null"); + requireNonNull(name, "name is null"); + if (type == PrincipalType.USER) { + // In Hive user names are case sensitive + this.name = name; + } + else if (type == PrincipalType.ROLE) { + // Role name can be case sensitive depends on the hive MetaStore thrift config + this.name = name; + } + else { + throw new IllegalArgumentException("Unsupported type: " + type); + } + } + + @JsonProperty + public PrincipalType getType() + { + return type; + } + + @JsonProperty + public String getName() + { + return name; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HivePrincipal prestoPrincipal = (HivePrincipal) o; + return type == prestoPrincipal.type && + Objects.equals(name, prestoPrincipal.name); + } + + @Override + public int hashCode() + { + return Objects.hash(type, name); + } + + @Override + public String toString() + { + return type + " " + name; + } + + public PrestoPrincipal toPrestoPrincipal() + { + return new PrestoPrincipal(type, name); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrivilegeInfo.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrivilegeInfo.java new file mode 100644 index 00000000..06959092 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HivePrivilegeInfo.java @@ -0,0 +1,157 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableSet; +import io.prestosql.spi.security.Privilege; +import io.prestosql.spi.security.PrivilegeInfo; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; +import java.util.Set; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.DELETE; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.INSERT; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.SELECT; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.UPDATE; +import static java.util.Objects.requireNonNull; + +@Immutable +public class HivePrivilegeInfo +{ + public enum HivePrivilege + { + SELECT, INSERT, UPDATE, DELETE, OWNERSHIP + } + + private final HivePrivilege hivePrivilege; + private final boolean grantOption; + private final HivePrincipal grantor; + private final HivePrincipal grantee; + + @JsonCreator + public HivePrivilegeInfo( + @JsonProperty("hivePrivilege") HivePrivilege hivePrivilege, + @JsonProperty("grantOption") boolean grantOption, + @JsonProperty("grantor") HivePrincipal grantor, + @JsonProperty("grantee") HivePrincipal grantee) + { + this.hivePrivilege = requireNonNull(hivePrivilege, "hivePrivilege is null"); + this.grantOption = grantOption; + this.grantor = requireNonNull(grantor, "grantor is null"); + this.grantee = requireNonNull(grantee, "grantee is null"); + } + + @JsonProperty + public HivePrivilege getHivePrivilege() + { + return hivePrivilege; + } + + @JsonProperty + public boolean isGrantOption() + { + return grantOption; + } + + @JsonProperty + public HivePrincipal getGrantor() + { + return grantor; + } + + @JsonProperty + public HivePrincipal getGrantee() + { + return grantee; + } + + public static HivePrivilege toHivePrivilege(Privilege privilege) + { + switch (privilege) { + case SELECT: + return SELECT; + case INSERT: + return INSERT; + case DELETE: + return DELETE; + case UPDATE: + return UPDATE; + default: + throw new IllegalArgumentException("Unexpected privilege: " + privilege); + } + } + + public boolean isContainedIn(HivePrivilegeInfo hivePrivilegeInfo) + { + return (getHivePrivilege().equals(hivePrivilegeInfo.getHivePrivilege()) && + (isGrantOption() == hivePrivilegeInfo.isGrantOption() || + (!isGrantOption() && hivePrivilegeInfo.isGrantOption()))); + } + + public Set toPrivilegeInfo() + { + switch (hivePrivilege) { + case SELECT: + return ImmutableSet.of(new PrivilegeInfo(Privilege.SELECT, isGrantOption())); + case INSERT: + return ImmutableSet.of(new PrivilegeInfo(Privilege.INSERT, isGrantOption())); + case DELETE: + return ImmutableSet.of(new PrivilegeInfo(Privilege.DELETE, isGrantOption())); + case UPDATE: + return ImmutableSet.of(new PrivilegeInfo(Privilege.UPDATE, isGrantOption())); + case OWNERSHIP: + return ImmutableSet.of(); + default: + throw new IllegalArgumentException("Unsupported hivePrivilege: " + hivePrivilege); + } + } + + @Override + public int hashCode() + { + return Objects.hash(hivePrivilege, grantOption, grantor, grantee); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + HivePrivilegeInfo hivePrivilegeInfo = (HivePrivilegeInfo) o; + return 
Objects.equals(hivePrivilege, hivePrivilegeInfo.hivePrivilege) && + Objects.equals(grantOption, hivePrivilegeInfo.grantOption) && + Objects.equals(grantor, hivePrivilegeInfo.grantor) && + Objects.equals(grantee, hivePrivilegeInfo.grantee); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("privilege", hivePrivilege) + .add("grantOption", grantOption) + .add("grantor", grantor) + .add("grantee", grantee) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTableName.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTableName.java new file mode 100644 index 00000000..643842d2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTableName.java @@ -0,0 +1,84 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; + +@Immutable +public class HiveTableName +{ + private final String databaseName; + private final String tableName; + + @JsonCreator + public HiveTableName(@JsonProperty("databaseName") String databaseName, @JsonProperty("tableName") String tableName) + { + this.databaseName = databaseName; + this.tableName = tableName; + } + + public static HiveTableName hiveTableName(String databaseName, String tableName) + { + return new HiveTableName(databaseName, tableName); + } + + @JsonProperty + public String getDatabaseName() + { + return databaseName; + } + + @JsonProperty + public String getTableName() + { + return tableName; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("databaseName", databaseName) + .add("tableName", tableName) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + HiveTableName other = (HiveTableName) o; + return Objects.equals(databaseName, other.databaseName) && + Objects.equals(tableName, other.tableName); + } + + @Override + public int hashCode() + { + return Objects.hash(databaseName, tableName); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTransaction.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTransaction.java new file mode 100644 index 00000000..4d8cb329 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/HiveTransaction.java @@ -0,0 +1,166 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this 
file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveACIDWriteType; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.spi.connector.SchemaTableName; +import org.apache.hadoop.hive.common.ValidTxnWriteIdList; +import org.apache.hadoop.hive.metastore.api.DataOperationType; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.atomic.AtomicBoolean; + +import static java.util.Objects.requireNonNull; + +public class HiveTransaction +{ + private final HiveIdentity identity; + private final long transactionId; + private final ScheduledFuture heartbeatTask; + private final Map locksMap = new HashMap<>(); + private final Map partitionLocks = new HashMap<>(); + + private final Map validHiveTransactionsForTable = new HashMap<>(); + + public HiveTransaction(HiveIdentity identity, long transactionId, ScheduledFuture heartbeatTask) + { + this.identity = requireNonNull(identity, "identity is null"); + this.transactionId = transactionId; + this.heartbeatTask = requireNonNull(heartbeatTask, "heartbeatTask is null"); + } + + public HiveIdentity getIdentity() + { + return identity; + } + + public long getTransactionId() + { + return transactionId; + } + + public ScheduledFuture getHeartbeatTask() + { + return heartbeatTask; + } + + public ValidTxnWriteIdList getValidWriteIds(HiveMetastore metastore, HiveTableHandle tableHandle, String queryId, boolean isVacuum) + { + //If the update or delete would have locked exclusive lock, then there is no need to lock again. + if (isSharedLockNeeded(tableHandle)) { + // Different calls for same table might need to lock different partitions so acquire locks every time + metastore.acquireSharedReadLock( + identity, + queryId, + transactionId, + !tableHandle.getPartitions().isPresent() ? ImmutableList.of(tableHandle.getSchemaTableName()) : ImmutableList.of(), + tableHandle.getPartitions().orElse(ImmutableList.of())); + } + + // For repeatable reads within a query, use the same list of valid transactions for a table which have once been used + return validHiveTransactionsForTable.computeIfAbsent(tableHandle.getSchemaTableName(), schemaTableName -> new ValidTxnWriteIdList( + metastore.getValidWriteIds( + identity, + ImmutableList.of(schemaTableName), + transactionId, + isVacuum))); + } + + //If the query is on the same table, on which update/delete happening separate lock not required. 
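+ // isSharedLockNeeded returns false only when the lock flags below are already set: for a partitioned
+ // handle every referenced partition must have its flag set in partitionLocks; for an unpartitioned
+ // handle the table-level flag in locksMap must be set. setLockFlagForTable records these flags once an
+ // update/delete has taken its semi-shared lock, so subsequent reads of the same table in this
+ // transaction do not acquire the shared read lock again.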
+ private synchronized boolean isSharedLockNeeded(HiveTableHandle tableHandle) + { + if (tableHandle.getPartitions().isPresent() && tableHandle.getPartitions().get().size() > 0) { + List hivePartitions = tableHandle.getPartitions().get(); + for (HivePartition partition : hivePartitions) { + AtomicBoolean partitionLockFlag = partitionLocks.get(partition); + if (partitionLockFlag == null || !partitionLockFlag.get()) { + //some partition lock not found + return true; + } + } + } + else { + AtomicBoolean lockFlag = locksMap.get(tableHandle.getSchemaPrefixedTableName()); + if (lockFlag == null || !lockFlag.get()) { + return true; + } + } + return false; + } + + private synchronized void setLockFlagForTable(HiveTableHandle tableHandle) + { + if (tableHandle.getPartitions().isPresent() && tableHandle.getPartitions().get().size() > 0) { + List hivePartitions = tableHandle.getPartitions().get(); + hivePartitions.stream().forEach(hivePartition -> { + AtomicBoolean flag = partitionLocks.get(hivePartition); + if (flag == null) { + flag = new AtomicBoolean(true); + partitionLocks.put(hivePartition, flag); + } + else { + flag.set(true); + } + }); + } + else { + AtomicBoolean flag = locksMap.get(tableHandle.getSchemaPrefixedTableName()); + if (flag == null) { + flag = new AtomicBoolean(true); + locksMap.put(tableHandle.getSchemaPrefixedTableName(), flag); + } + else { + flag.set(true); + } + } + } + + public Long getTableWriteId(HiveMetastore metastore, HiveTableHandle tableHandle, HiveACIDWriteType writeType, String queryId) + { + DataOperationType operationType = DataOperationType.INSERT; + boolean semiSharedLock = false; + switch (writeType) { + case VACUUM: + case INSERT: + operationType = DataOperationType.INSERT; + break; + case INSERT_OVERWRITE: + case UPDATE: + operationType = DataOperationType.UPDATE; + semiSharedLock = true; + break; + case DELETE: + operationType = DataOperationType.DELETE; + semiSharedLock = true; + } + metastore.acquireLock( + identity, + queryId, + transactionId, + !tableHandle.getPartitions().isPresent() ? ImmutableList.of(tableHandle.getSchemaTableName()) : ImmutableList.of(), + tableHandle.getPartitions().orElse(ImmutableList.of()), + operationType); + if (semiSharedLock) { + setLockFlagForTable(tableHandle); + } + return metastore.getTableWriteId(tableHandle.getSchemaName(), tableHandle.getTableName(), transactionId); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/IntegerStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/IntegerStatistics.java new file mode 100644 index 00000000..cc604ff1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/IntegerStatistics.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; +import java.util.OptionalLong; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class IntegerStatistics +{ + private final OptionalLong min; + private final OptionalLong max; + + @JsonCreator + public IntegerStatistics( + @JsonProperty("min") OptionalLong min, + @JsonProperty("max") OptionalLong max) + { + this.min = requireNonNull(min, "min is null"); + this.max = requireNonNull(max, "max is null"); + } + + @JsonProperty + public OptionalLong getMin() + { + return min; + } + + @JsonProperty + public OptionalLong getMax() + { + return max; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + IntegerStatistics that = (IntegerStatistics) o; + return Objects.equals(min, that.min) && + Objects.equals(max, that.max); + } + + @Override + public int hashCode() + { + return Objects.hash(min, max); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("min", min) + .add("max", max) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreClientFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreClientFactory.java new file mode 100644 index 00000000..11ce8445 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreClientFactory.java @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.net.HostAndPort; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreClient; +import org.apache.thrift.transport.TTransportException; + +/** + * Thrift HiveMetastore for FusionInsight + * + * @since 2020-03-10 + */ +public interface MetastoreClientFactory +{ + /** + * to create thrift metastore client + * + * @param address thrift hosts and ports + * @return ThriftMetastoreClient client + * @throws TTransportException transport exception + */ + ThriftMetastoreClient create(HostAndPort address) throws TTransportException; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreConfig.java new file mode 100644 index 00000000..955d14ba --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreConfig.java @@ -0,0 +1,62 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import io.airlift.configuration.Config; + +import javax.validation.constraints.NotNull; + +public class MetastoreConfig +{ + private String metastoreType = "thrift"; + private String metastoreClientFactoryImp = ""; + private String thriftMetastoreImp = ""; + + @NotNull + public String getMetastoreType() + { + return metastoreType; + } + + @Config("hive.metastore") + public MetastoreConfig setMetastoreType(String metastoreType) + { + this.metastoreType = metastoreType; + return this; + } + + public String getMetastoreClientFactoryImp() + { + return metastoreClientFactoryImp; + } + + @Config("hive.metastore.client-factory-imp") + public MetastoreConfig setMetastoreClientFactoryImp(String metastoreClientFactoryImp) + { + this.metastoreClientFactoryImp = metastoreClientFactoryImp; + return this; + } + + public String getThriftMetastoreImp() + { + return thriftMetastoreImp; + } + + @Config("hive.metastore.thrift-imp") + public MetastoreConfig setThriftMetastoreImp(String thriftMetastoreImp) + { + this.thriftMetastoreImp = thriftMetastoreImp; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreUtil.java new file mode 100644 index 00000000..652eb3e0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/MetastoreUtil.java @@ -0,0 +1,302 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableMultimap; +import io.prestosql.plugin.hive.PartitionOfflineException; +import io.prestosql.plugin.hive.TableOfflineException; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.metastore.ProtectMode; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Properties; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.isNullOrEmpty; +import static io.prestosql.plugin.hive.HiveSplitManager.PRESTO_OFFLINE; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.metastore.ColumnType.typeToThriftType; +import static org.apache.hadoop.hive.metastore.ProtectMode.getProtectModeFromString; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.BUCKET_COUNT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.BUCKET_FIELD_NAME; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_OUTPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_LOCATION; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_NAME; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_DDL; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; + +public class MetastoreUtil +{ + public static final String META_PARTITION_COLUMNS = "partition_metadata"; + public static final String COLUMN_COMMENTS = "columns.comments"; + + private MetastoreUtil() + { + } + + public static Properties getHiveSchema(Table table) + { + // Mimics function in Hive: MetaStoreUtils.getTableMetadata(Table) + return getHiveSchema( + table.getStorage(), + table.getDataColumns(), + table.getDataColumns(), + table.getParameters(), + table.getDatabaseName(), + table.getTableName(), + table.getPartitionColumns()); + } + + public static Properties getHiveSchema(Partition partition, Table table) + { + // Mimics function in Hive: MetaStoreUtils.getSchema(Partition, 
Table) + return getHiveSchema( + partition.getStorage(), + partition.getColumns(), + table.getDataColumns(), + table.getParameters(), + table.getDatabaseName(), + table.getTableName(), + table.getPartitionColumns()); + } + + private static Properties getHiveSchema( + Storage sd, + List dataColumns, + List tableDataColumns, + Map parameters, + String databaseName, + String tableName, + List partitionKeys) + { + // Mimics function in Hive: + // MetaStoreUtils.getSchema(StorageDescriptor, StorageDescriptor, Map, String, String, List) + + Properties schema = new Properties(); + + schema.setProperty(FILE_INPUT_FORMAT, sd.getStorageFormat().getInputFormat()); + schema.setProperty(FILE_OUTPUT_FORMAT, sd.getStorageFormat().getOutputFormat()); + + schema.setProperty(META_TABLE_NAME, databaseName + "." + tableName); + schema.setProperty(META_TABLE_LOCATION, sd.getLocation()); + + if (sd.getBucketProperty().isPresent()) { + schema.setProperty(BUCKET_FIELD_NAME, Joiner.on(",").join(sd.getBucketProperty().get().getBucketedBy())); + schema.setProperty(BUCKET_COUNT, Integer.toString(sd.getBucketProperty().get().getBucketCount())); + } + else { + schema.setProperty(BUCKET_COUNT, "0"); + } + + for (Map.Entry param : sd.getSerdeParameters().entrySet()) { + schema.setProperty(param.getKey(), (param.getValue() != null) ? param.getValue() : ""); + } + schema.setProperty(SERIALIZATION_LIB, sd.getStorageFormat().getSerDe()); + + StringBuilder columnNameBuilder = new StringBuilder(); + StringBuilder columnTypeBuilder = new StringBuilder(); + StringBuilder columnCommentBuilder = new StringBuilder(); + boolean first = true; + for (Column column : tableDataColumns) { + if (!first) { + columnNameBuilder.append(","); + columnTypeBuilder.append(":"); + columnCommentBuilder.append('\0'); + } + columnNameBuilder.append(column.getName()); + columnTypeBuilder.append(column.getType()); + columnCommentBuilder.append(column.getComment().orElse("")); + first = false; + } + String columnNames = columnNameBuilder.toString(); + String partitionColumnNames = dataColumns.stream() + .map(Column::getName) + .collect(Collectors.joining(",")); + String columnTypes = columnTypeBuilder.toString(); + schema.setProperty(META_TABLE_COLUMNS, columnNames); + schema.setProperty(META_PARTITION_COLUMNS, partitionColumnNames); + schema.setProperty(META_TABLE_COLUMN_TYPES, columnTypes); + schema.setProperty(COLUMN_COMMENTS, columnCommentBuilder.toString()); + + schema.setProperty(SERIALIZATION_DDL, toThriftDdl(tableName, dataColumns)); + + insertPartitionIntoProperties(partitionKeys, schema); + + if (parameters != null) { + for (Map.Entry entry : parameters.entrySet()) { + // add non-null parameters to the schema + if (entry.getValue() != null) { + schema.setProperty(entry.getKey(), entry.getValue()); + } + } + } + return schema; + } + + public static ProtectMode getProtectMode(Partition partition) + { + return getProtectMode(partition.getParameters()); + } + + public static ProtectMode getProtectMode(Table table) + { + return getProtectMode(table.getParameters()); + } + + public static String makePartitionName(Table table, Partition partition) + { + return makePartitionName(table.getPartitionColumns(), partition.getValues()); + } + + public static String makePartitionName(List partitionColumns, List values) + { + return toPartitionName(partitionColumns.stream().map(Column::getName).collect(toList()), values); + } + + public static String toPartitionName(List names, List values) + { + checkArgument(names.size() == values.size(), "partition value 
count must match partition column count"); + checkArgument(values.stream().allMatch(Objects::nonNull), "partition value must not be null"); + + return FileUtils.makePartName(names, values); + } + + public static String getPartitionLocation(Table table, Optional partition) + { + if (!partition.isPresent()) { + return table.getStorage().getLocation(); + } + return partition.get().getStorage().getLocation(); + } + + private static String toThriftDdl(String structName, List columns) + { + // Mimics function in Hive: + // MetaStoreUtils.getDDLFromFieldSchema(String, List) + StringBuilder ddl = new StringBuilder(); + ddl.append("struct "); + ddl.append(structName); + ddl.append(" { "); + boolean first = true; + for (Column column : columns) { + if (first) { + first = false; + } + else { + ddl.append(", "); + } + ddl.append(typeToThriftType(column.getType().getHiveTypeName().toString())); + ddl.append(' '); + ddl.append(column.getName()); + } + ddl.append("}"); + return ddl.toString(); + } + + private static ProtectMode getProtectMode(Map parameters) + { + if (!parameters.containsKey(ProtectMode.PARAMETER_NAME)) { + return new ProtectMode(); + } + else { + return getProtectModeFromString(parameters.get(ProtectMode.PARAMETER_NAME)); + } + } + + public static void verifyOnline(SchemaTableName tableName, Optional partitionName, ProtectMode protectMode, Map parameters) + { + if (protectMode.offline) { + if (partitionName.isPresent()) { + throw new PartitionOfflineException(tableName, partitionName.get(), false, null); + } + throw new TableOfflineException(tableName, false, null); + } + + String prestoOffline = parameters.get(PRESTO_OFFLINE); + if (!isNullOrEmpty(prestoOffline)) { + if (partitionName.isPresent()) { + throw new PartitionOfflineException(tableName, partitionName.get(), true, prestoOffline); + } + throw new TableOfflineException(tableName, true, prestoOffline); + } + } + + public static void verifyCanDropColumn(HiveMetastore metastore, HiveIdentity identity, String databaseName, String tableName, String columnName) + { + Table table = metastore.getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + + if (table.getPartitionColumns().stream().anyMatch(column -> column.getName().equals(columnName))) { + throw new PrestoException(NOT_SUPPORTED, "Cannot drop partition columns"); + } + if (table.getDataColumns().size() <= 1) { + throw new PrestoException(NOT_SUPPORTED, "Cannot drop the only non-partition column in a table"); + } + if (table.getStorage().getBucketProperty().isPresent() && table.getStorage().getBucketProperty().get() + .getBucketedBy().stream().anyMatch(column -> column.equals(columnName))) { + throw new PrestoException(NOT_SUPPORTED, "Cannot drop bucketing columns"); + } + } + + public static PrincipalPrivileges buildInitialPrivilegeSet(String tableOwner) + { + HivePrincipal owner = new HivePrincipal(USER, tableOwner); + return new PrincipalPrivileges( + ImmutableMultimap.builder() + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.SELECT, true, owner, owner)) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.INSERT, true, owner, owner)) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.UPDATE, true, owner, owner)) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.DELETE, true, owner, owner)) + .build(), + ImmutableMultimap.of()); + } + + public static void insertPartitionIntoProperties(List 
partitionColumns, Properties schema) + { + StringBuilder partString = new StringBuilder(); + String partStringSep = ""; + StringBuilder partTypesString = new StringBuilder(); + String partTypesStringSep = ""; + for (Column partKey : partitionColumns) { + partString.append(partStringSep); + partString.append(partKey.getName()); + partTypesString.append(partTypesStringSep); + partTypesString.append(partKey.getType().getHiveTypeName().toString()); + if (partStringSep.length() == 0) { + partStringSep = "/"; + partTypesStringSep = ":"; + } + } + if (partString.length() > 0) { + schema.setProperty(META_TABLE_PARTITION_COLUMNS, partString.toString()); + schema.setProperty(META_TABLE_PARTITION_COLUMN_TYPES, partTypesString.toString()); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Partition.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Partition.java new file mode 100644 index 00000000..730d9ec7 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Partition.java @@ -0,0 +1,217 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.connector.SchemaTableName; + +import javax.annotation.concurrent.Immutable; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.Consumer; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class Partition +{ + private final String databaseName; + private final String tableName; + private final List values; + private final Storage storage; + private final List columns; + private final Map parameters; + + @JsonCreator + public Partition( + @JsonProperty("databaseName") String databaseName, + @JsonProperty("tableName") String tableName, + @JsonProperty("values") List values, + @JsonProperty("storage") Storage storage, + @JsonProperty("columns") List columns, + @JsonProperty("parameters") Map parameters) + { + this.databaseName = requireNonNull(databaseName, "databaseName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.values = ImmutableList.copyOf(requireNonNull(values, "values is null")); + this.storage = requireNonNull(storage, "storage is null"); + this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null")); + this.parameters = ImmutableMap.copyOf(requireNonNull(parameters, "parameters is null")); + } + + @JsonProperty + public String getDatabaseName() + { + return databaseName; + } + + @JsonProperty + public String getTableName() + { + return 
tableName; + } + + @JsonIgnore + public SchemaTableName getSchemaTableName() + { + return new SchemaTableName(databaseName, tableName); + } + + @JsonProperty + public List getValues() + { + return values; + } + + @JsonProperty + public Storage getStorage() + { + return storage; + } + + @JsonProperty + public List getColumns() + { + return columns; + } + + @JsonProperty + public Map getParameters() + { + return parameters; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("databaseName", databaseName) + .add("tableName", tableName) + .add("values", values) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Partition partition = (Partition) o; + return Objects.equals(databaseName, partition.databaseName) && + Objects.equals(tableName, partition.tableName) && + Objects.equals(values, partition.values) && + Objects.equals(storage, partition.storage) && + Objects.equals(columns, partition.columns) && + Objects.equals(parameters, partition.parameters); + } + + @Override + public int hashCode() + { + return Objects.hash(databaseName, tableName, values, storage, columns, parameters); + } + + public static Builder builder() + { + return new Builder(); + } + + public static Builder builder(Partition partition) + { + return new Builder(partition); + } + + public static class Builder + { + private final Storage.Builder storageBuilder; + private String databaseName; + private String tableName; + private List values; + private List columns; + private Map parameters = ImmutableMap.of(); + + private Builder() + { + this.storageBuilder = Storage.builder(); + } + + private Builder(Partition partition) + { + this.storageBuilder = Storage.builder(partition.getStorage()); + this.databaseName = partition.getDatabaseName(); + this.tableName = partition.getTableName(); + this.values = partition.getValues(); + this.columns = partition.getColumns(); + this.parameters = partition.getParameters(); + } + + public Builder setDatabaseName(String databaseName) + { + this.databaseName = databaseName; + return this; + } + + public Builder setTableName(String tableName) + { + this.tableName = tableName; + return this; + } + + public Builder setValues(List values) + { + this.values = values; + return this; + } + + public Storage.Builder getStorageBuilder() + { + return storageBuilder; + } + + public Builder withStorage(Consumer consumer) + { + consumer.accept(storageBuilder); + return this; + } + + public Builder setColumns(List columns) + { + this.columns = columns; + return this; + } + + public Builder setParameters(Map parameters) + { + this.parameters = parameters; + return this; + } + + public Partition build() + { + return new Partition(databaseName, tableName, values, storageBuilder.build(), columns, parameters); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionFilter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionFilter.java new file mode 100644 index 00000000..b40e6f60 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionFilter.java @@ -0,0 +1,87 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; + +import javax.annotation.concurrent.Immutable; + +import java.util.List; +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class PartitionFilter +{ + private final HiveTableName hiveTableName; + private final List parts; + + @JsonCreator + public PartitionFilter(@JsonProperty("hiveTableName") HiveTableName hiveTableName, @JsonProperty("parts") List parts) + { + this.hiveTableName = requireNonNull(hiveTableName, "hiveTableName is null"); + this.parts = ImmutableList.copyOf(requireNonNull(parts, "parts is null")); + } + + public static PartitionFilter partitionFilter(String databaseName, String tableName, List parts) + { + return new PartitionFilter(HiveTableName.hiveTableName(databaseName, tableName), parts); + } + + @JsonProperty + public HiveTableName getHiveTableName() + { + return hiveTableName; + } + + @JsonProperty + public List getParts() + { + return parts; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("hiveTableName", hiveTableName) + .add("parts", parts) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + PartitionFilter other = (PartitionFilter) o; + return Objects.equals(hiveTableName, other.hiveTableName) && + Objects.equals(parts, other.parts); + } + + @Override + public int hashCode() + { + return Objects.hash(hiveTableName, parts); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionWithStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionWithStatistics.java new file mode 100644 index 00000000..df989428 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PartitionWithStatistics.java @@ -0,0 +1,62 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import io.prestosql.plugin.hive.PartitionStatistics; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static java.util.Objects.requireNonNull; + +public class PartitionWithStatistics +{ + private final Partition partition; + private final String partitionName; + private final PartitionStatistics statistics; + private final boolean updateStats; + + public PartitionWithStatistics(Partition partition, String partitionName, PartitionStatistics statistics) + { + this(partition, partitionName, statistics, true); + } + + public PartitionWithStatistics(Partition partition, String partitionName, PartitionStatistics statistics, boolean updateStats) + { + this.partition = requireNonNull(partition, "partition is null"); + this.partitionName = requireNonNull(partitionName, "partitionName is null"); + checkArgument(toPartitionValues(partitionName).equals(partition.getValues()), "unexpected partition name: %s != %s", partitionName, partition.getValues()); + this.statistics = requireNonNull(statistics, "statistics is null"); + this.updateStats = updateStats; + } + + public Partition getPartition() + { + return partition; + } + + public String getPartitionName() + { + return partitionName; + } + + public PartitionStatistics getStatistics() + { + return statistics; + } + + public boolean isUpdateStats() + { + return updateStats; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PrincipalPrivileges.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PrincipalPrivileges.java new file mode 100644 index 00000000..222d648b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/PrincipalPrivileges.java @@ -0,0 +1,64 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive.metastore;
+
+import com.google.common.collect.ImmutableSetMultimap;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.SetMultimap;
+
+import java.util.Set;
+
+import static com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap;
+import static io.prestosql.spi.security.PrincipalType.ROLE;
+import static io.prestosql.spi.security.PrincipalType.USER;
+import static java.util.Objects.requireNonNull;
+import static java.util.function.Function.identity;
+
+public class PrincipalPrivileges
+{
+    private final SetMultimap<String, HivePrivilegeInfo> userPrivileges;
+    private final SetMultimap<String, HivePrivilegeInfo> rolePrivileges;
+
+    public PrincipalPrivileges(
+            Multimap<String, HivePrivilegeInfo> userPrivileges,
+            Multimap<String, HivePrivilegeInfo> rolePrivileges)
+    {
+        this.userPrivileges = ImmutableSetMultimap.copyOf(requireNonNull(userPrivileges, "userPrivileges is null"));
+        this.rolePrivileges = ImmutableSetMultimap.copyOf(requireNonNull(rolePrivileges, "rolePrivileges is null"));
+    }
+
+    public static PrincipalPrivileges fromHivePrivilegeInfos(Set<HivePrivilegeInfo> hivePrivileges)
+    {
+        Multimap<String, HivePrivilegeInfo> userPrivileges = hivePrivileges
+                .stream()
+                .filter(privilege -> privilege.getGrantee().getType() == USER)
+                .collect(toImmutableListMultimap(privilege -> privilege.getGrantee().getName(), identity()));
+
+        Multimap<String, HivePrivilegeInfo> rolePrivileges = hivePrivileges
+                .stream()
+                .filter(privilege -> privilege.getGrantee().getType() == ROLE)
+                .collect(toImmutableListMultimap(privilege -> privilege.getGrantee().getName(), identity()));
+        return new PrincipalPrivileges(userPrivileges, rolePrivileges);
+    }
+
+    public SetMultimap<String, HivePrivilegeInfo> getUserPrivileges()
+    {
+        return userPrivileges;
+    }
+
+    public SetMultimap<String, HivePrivilegeInfo> getRolePrivileges()
+    {
+        return rolePrivileges;
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/RecordingHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/RecordingHiveMetastore.java
new file mode 100644
index 00000000..eabd0d80
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/RecordingHiveMetastore.java
@@ -0,0 +1,682 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.json.JsonCodec; +import io.prestosql.plugin.hive.ForRecordingHiveMetastore; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.weakref.jmx.Managed; + +import javax.annotation.concurrent.Immutable; +import javax.inject.Inject; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.json.JsonCodec.jsonCodec; +import static io.prestosql.plugin.hive.metastore.HivePartitionName.hivePartitionName; +import static io.prestosql.plugin.hive.metastore.HiveTableName.hiveTableName; +import static io.prestosql.plugin.hive.metastore.PartitionFilter.partitionFilter; +import static io.prestosql.spi.StandardErrorCode.NOT_FOUND; +import static java.nio.file.Files.readAllBytes; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +public class RecordingHiveMetastore + implements HiveMetastore +{ + private static final JsonCodec RECORDING_CODEC = jsonCodec(Recording.class); + + private final HiveMetastore delegate; + private final Path recordingPath; + private final boolean replay; + + private volatile Optional> allDatabases = Optional.empty(); + private volatile Optional> allRoles = Optional.empty(); + + private final Cache> databaseCache; + private final Cache> tableCache; + private final Cache> supportedColumnStatisticsCache; + private final Cache tableStatisticsCache; + private final Cache, Map> partitionStatisticsCache; + private final Cache>> allTablesCache; + private final Cache>> allViewsCache; + private final Cache> partitionCache; + private final Cache>> partitionNamesCache; + private final Cache>> partitionNamesByPartsCache; + private final Cache, Map>> partitionsByNamesCache; + private final Cache> tablePrivilegesCache; + private final Cache> roleGrantsCache; + + @Inject + public RecordingHiveMetastore(@ForRecordingHiveMetastore HiveMetastore delegate, HiveConfig hiveConfig) + throws IOException + { + this.delegate = requireNonNull(delegate, "delegate is null"); + requireNonNull(hiveConfig, "hiveConfig is null"); + this.recordingPath = Paths.get(requireNonNull(hiveConfig.getRecordingPath(), "recordingPath is null")); + this.replay = hiveConfig.isReplay(); + + databaseCache = createCache(hiveConfig); + tableCache = createCache(hiveConfig); + supportedColumnStatisticsCache = createCache(hiveConfig); + tableStatisticsCache = createCache(hiveConfig); + partitionStatisticsCache = createCache(hiveConfig); + allTablesCache = 
createCache(hiveConfig); + allViewsCache = createCache(hiveConfig); + partitionCache = createCache(hiveConfig); + partitionNamesCache = createCache(hiveConfig); + partitionNamesByPartsCache = createCache(hiveConfig); + partitionsByNamesCache = createCache(hiveConfig); + tablePrivilegesCache = createCache(hiveConfig); + roleGrantsCache = createCache(hiveConfig); + + if (replay) { + loadRecording(); + } + } + + @VisibleForTesting + void loadRecording() + throws IOException + { + Recording recording = RECORDING_CODEC.fromJson(readAllBytes(recordingPath)); + + allDatabases = recording.getAllDatabases(); + allRoles = recording.getAllRoles(); + databaseCache.putAll(toMap(recording.getDatabases())); + tableCache.putAll(toMap(recording.getTables())); + supportedColumnStatisticsCache.putAll(toMap(recording.getSupportedColumnStatistics())); + tableStatisticsCache.putAll(toMap(recording.getTableStatistics())); + partitionStatisticsCache.putAll(toMap(recording.getPartitionStatistics())); + allTablesCache.putAll(toMap(recording.getAllTables())); + allViewsCache.putAll(toMap(recording.getAllViews())); + partitionCache.putAll(toMap(recording.getPartitions())); + partitionNamesCache.putAll(toMap(recording.getPartitionNames())); + partitionNamesByPartsCache.putAll(toMap(recording.getPartitionNamesByParts())); + partitionsByNamesCache.putAll(toMap(recording.getPartitionsByNames())); + tablePrivilegesCache.putAll(toMap(recording.getTablePrivileges())); + roleGrantsCache.putAll(toMap(recording.getRoleGrants())); + } + + private static Cache createCache(HiveConfig hiveConfig) + { + if (hiveConfig.isReplay()) { + return CacheBuilder.newBuilder() + .build(); + } + + return CacheBuilder.newBuilder() + .expireAfterWrite(hiveConfig.getRecordingDuration().toMillis(), MILLISECONDS) + .build(); + } + + @Managed + public void writeRecording() + throws IOException + { + if (replay) { + throw new IllegalStateException("Cannot write recording in replay mode"); + } + + Recording recording = new Recording( + allDatabases, + allRoles, + toPairs(databaseCache), + toPairs(tableCache), + toPairs(supportedColumnStatisticsCache), + toPairs(tableStatisticsCache), + toPairs(partitionStatisticsCache), + toPairs(allTablesCache), + toPairs(allViewsCache), + toPairs(partitionCache), + toPairs(partitionNamesCache), + toPairs(partitionNamesByPartsCache), + toPairs(partitionsByNamesCache), + toPairs(tablePrivilegesCache), + toPairs(roleGrantsCache)); + + Files.write(recordingPath, RECORDING_CODEC.toJsonBytes(recording)); + } + + private static Map toMap(List> pairs) + { + return pairs.stream() + .collect(ImmutableMap.toImmutableMap(Pair::getKey, Pair::getValue)); + } + + private static List> toPairs(Cache cache) + { + return cache.asMap().entrySet().stream() + .map(entry -> new Pair<>(entry.getKey(), entry.getValue())) + .collect(toImmutableList()); + } + + @Override + public Optional getDatabase(String databaseName) + { + return loadValue(databaseCache, databaseName, () -> delegate.getDatabase(databaseName)); + } + + @Override + public List getAllDatabases() + { + if (replay) { + return allDatabases.orElseThrow(() -> new PrestoException(NOT_FOUND, "Missing entry for all databases")); + } + + List result = delegate.getAllDatabases(); + allDatabases = Optional.of(result); + return result; + } + + @Override + public Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + return loadValue(tableCache, hiveTableName(databaseName, tableName), () -> delegate.getTable(identity, databaseName, tableName)); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + return loadValue(supportedColumnStatisticsCache, type.getTypeSignature().toString(), () -> delegate.getSupportedColumnStatistics(type)); + } + + @Override + public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + return loadValue( + tableStatisticsCache, + hiveTableName(table.getDatabaseName(), table.getTableName()), + () -> delegate.getTableStatistics(identity, table)); + } + + @Override + public Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + return loadValue( + partitionStatisticsCache, + partitions.stream() + .map(partition -> hivePartitionName(table.getDatabaseName(), table.getTableName(), partition.getValues())) + .collect(Collectors.toSet()), + () -> delegate.getPartitionStatistics(identity, table, partitions)); + } + + @Override + public void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + verifyRecordingMode(); + delegate.updateTableStatistics(identity, databaseName, tableName, update); + } + + @Override + public void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + verifyRecordingMode(); + delegate.updatePartitionStatistics(identity, databaseName, tableName, partitionName, update); + } + + @Override + public void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + partNamesUpdateFunctionMap.entrySet().stream().forEach(e -> { + updatePartitionStatistics(identity, databaseName, tableName, e.getKey(), e.getValue()); + }); + } + + @Override + public Optional> getAllTables(String databaseName) + { + return loadValue(allTablesCache, databaseName, () -> delegate.getAllTables(databaseName)); + } + + @Override + public Optional> getAllViews(String databaseName) + { + return loadValue(allViewsCache, databaseName, () -> delegate.getAllViews(databaseName)); + } + + @Override + public void createDatabase(HiveIdentity identity, Database database) + { + verifyRecordingMode(); + delegate.createDatabase(identity, database); + } + + @Override + public void dropDatabase(HiveIdentity identity, String databaseName) + { + verifyRecordingMode(); + delegate.dropDatabase(identity, databaseName); + } + + @Override + public void renameDatabase(HiveIdentity identity, String databaseName, String newDatabaseName) + { + verifyRecordingMode(); + delegate.renameDatabase(identity, databaseName, newDatabaseName); + } + + @Override + public void createTable(HiveIdentity identity, Table table, PrincipalPrivileges principalPrivileges) + { + verifyRecordingMode(); + delegate.createTable(identity, table, principalPrivileges); + } + + @Override + public void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + verifyRecordingMode(); + delegate.dropTable(identity, databaseName, tableName, deleteData); + } + + @Override + public void replaceTable(HiveIdentity identity, String databaseName, String tableName, Table newTable, PrincipalPrivileges principalPrivileges) + { + verifyRecordingMode(); + delegate.replaceTable(identity, databaseName, tableName, newTable, principalPrivileges); + } + + @Override + public void 
renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName) + { + verifyRecordingMode(); + delegate.renameTable(identity, databaseName, tableName, newDatabaseName, newTableName); + } + + @Override + public void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment) + { + verifyRecordingMode(); + delegate.commentTable(identity, databaseName, tableName, comment); + } + + @Override + public void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment) + { + verifyRecordingMode(); + delegate.addColumn(identity, databaseName, tableName, columnName, columnType, columnComment); + } + + @Override + public void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName) + { + verifyRecordingMode(); + delegate.renameColumn(identity, databaseName, tableName, oldColumnName, newColumnName); + } + + @Override + public void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + verifyRecordingMode(); + delegate.dropColumn(identity, databaseName, tableName, columnName); + } + + @Override + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + return loadValue( + partitionCache, + hivePartitionName(databaseName, tableName, partitionValues), + () -> delegate.getPartition(identity, databaseName, tableName, partitionValues)); + } + + @Override + public Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + return loadValue( + partitionNamesCache, + hiveTableName(databaseName, tableName), + () -> delegate.getPartitionNames(identity, databaseName, tableName)); + } + + @Override + public Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + return loadValue( + partitionNamesByPartsCache, + partitionFilter(databaseName, tableName, parts), + () -> delegate.getPartitionNamesByParts(identity, databaseName, tableName, parts)); + } + + @Override + public Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + return loadValue( + partitionsByNamesCache, + getHivePartitionNames(databaseName, tableName, ImmutableSet.copyOf(partitionNames)), + () -> delegate.getPartitionsByNames(identity, databaseName, tableName, partitionNames)); + } + + @Override + public void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + verifyRecordingMode(); + delegate.addPartitions(identity, databaseName, tableName, partitions); + } + + @Override + public void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + verifyRecordingMode(); + delegate.dropPartition(identity, databaseName, tableName, parts, deleteData); + } + + @Override + public void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition) + { + verifyRecordingMode(); + delegate.alterPartition(identity, databaseName, tableName, partition); + } + + @Override + public Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + return loadValue( + tablePrivilegesCache, + new UserTableKey(principal, databaseName, tableName), + () -> delegate.listTablePrivileges(databaseName, tableName, principal)); + 
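+        // Like the other read methods above, this goes through loadValue (defined further below):
+        // in replay mode it returns the previously recorded entry or fails with NOT_FOUND; otherwise
+        // it forwards the call to the delegate and caches the result so writeRecording() can persist it.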
} + + @Override + public void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + verifyRecordingMode(); + delegate.grantTablePrivileges(databaseName, tableName, grantee, privileges); + } + + @Override + public void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + verifyRecordingMode(); + delegate.revokeTablePrivileges(databaseName, tableName, grantee, privileges); + } + + private Set getHivePartitionNames(String databaseName, String tableName, Set partitionNames) + { + return partitionNames.stream() + .map(partitionName -> HivePartitionName.hivePartitionName(databaseName, tableName, partitionName)) + .collect(ImmutableSet.toImmutableSet()); + } + + @Override + public void createRole(String role, String grantor) + { + verifyRecordingMode(); + delegate.createRole(role, grantor); + } + + @Override + public void dropRole(String role) + { + verifyRecordingMode(); + delegate.dropRole(role); + } + + @Override + public Set listRoles() + { + if (replay) { + return allRoles.orElseThrow(() -> new PrestoException(NOT_FOUND, "Missing entry for roles")); + } + + Set result = delegate.listRoles(); + allRoles = Optional.of(result); + return result; + } + + @Override + public void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + verifyRecordingMode(); + delegate.grantRoles(roles, grantees, withAdminOption, grantor); + } + + @Override + public void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + verifyRecordingMode(); + delegate.revokeRoles(roles, grantees, adminOptionFor, grantor); + } + + @Override + public Set listRoleGrants(HivePrincipal principal) + { + return loadValue( + roleGrantsCache, + principal, + () -> delegate.listRoleGrants(principal)); + } + + @Override + public boolean isImpersonationEnabled() + { + return delegate.isImpersonationEnabled(); + } + + private V loadValue(Cache cache, K key, Supplier valueSupplier) + { + if (replay) { + return Optional.ofNullable(cache.getIfPresent(key)) + .orElseThrow(() -> new PrestoException(NOT_FOUND, "Missing entry found for key: " + key)); + } + + V value = valueSupplier.get(); + cache.put(key, value); + return value; + } + + private void verifyRecordingMode() + { + if (replay) { + throw new IllegalStateException("Cannot perform Metastore updates in replay mode"); + } + } + + @Immutable + public static class Recording + { + private final Optional> allDatabases; + private final Optional> allRoles; + private final List>> databases; + private final List>> tables; + private final List>> supportedColumnStatistics; + private final List> tableStatistics; + private final List, Map>> partitionStatistics; + private final List>>> allTables; + private final List>>> allViews; + private final List>> partitions; + private final List>>> partitionNames; + private final List>>> partitionNamesByParts; + private final List, Map>>> partitionsByNames; + private final List>> tablePrivileges; + private final List>> roleGrants; + + @JsonCreator + public Recording( + @JsonProperty("allDatabases") Optional> allDatabases, + @JsonProperty("allRoles") Optional> allRoles, + @JsonProperty("databases") List>> databases, + @JsonProperty("tables") List>> tables, + @JsonProperty("supportedColumnStatistics") List>> supportedColumnStatistics, + @JsonProperty("tableStatistics") List> tableStatistics, + @JsonProperty("partitionStatistics") List, Map>> partitionStatistics, + @JsonProperty("allTables") 
List>>> allTables, + @JsonProperty("allViews") List>>> allViews, + @JsonProperty("partitions") List>> partitions, + @JsonProperty("partitionNames") List>>> partitionNames, + @JsonProperty("partitionNamesByParts") List>>> partitionNamesByParts, + @JsonProperty("partitionsByNames") List, Map>>> partitionsByNames, + @JsonProperty("tablePrivileges") List>> tablePrivileges, + @JsonProperty("roleGrants") List>> roleGrants) + { + this.allDatabases = allDatabases; + this.allRoles = allRoles; + this.databases = databases; + this.tables = tables; + this.supportedColumnStatistics = supportedColumnStatistics; + this.tableStatistics = tableStatistics; + this.partitionStatistics = partitionStatistics; + this.allTables = allTables; + this.allViews = allViews; + this.partitions = partitions; + this.partitionNames = partitionNames; + this.partitionNamesByParts = partitionNamesByParts; + this.partitionsByNames = partitionsByNames; + this.tablePrivileges = tablePrivileges; + this.roleGrants = roleGrants; + } + + @JsonProperty + public Optional> getAllDatabases() + { + return allDatabases; + } + + @JsonProperty + public Optional> getAllRoles() + { + return allRoles; + } + + @JsonProperty + public List>> getDatabases() + { + return databases; + } + + @JsonProperty + public List>> getTables() + { + return tables; + } + + @JsonProperty + public List>> getSupportedColumnStatistics() + { + return supportedColumnStatistics; + } + + @JsonProperty + public List> getTableStatistics() + { + return tableStatistics; + } + + @JsonProperty + public List, Map>> getPartitionStatistics() + { + return partitionStatistics; + } + + @JsonProperty + public List>>> getAllTables() + { + return allTables; + } + + @JsonProperty + public List>>> getAllViews() + { + return allViews; + } + + @JsonProperty + public List>> getPartitions() + { + return partitions; + } + + @JsonProperty + public List>>> getPartitionNames() + { + return partitionNames; + } + + @JsonProperty + public List>>> getPartitionNamesByParts() + { + return partitionNamesByParts; + } + + @JsonProperty + public List, Map>>> getPartitionsByNames() + { + return partitionsByNames; + } + + @JsonProperty + public List>> getTablePrivileges() + { + return tablePrivileges; + } + + @JsonProperty + public List>> getRoleGrants() + { + return roleGrants; + } + } + + @Immutable + public static class Pair + { + private final K key; + private final V value; + + @JsonCreator + public Pair(@JsonProperty("key") K key, @JsonProperty("value") V value) + { + this.key = requireNonNull(key, "key is null"); + this.value = requireNonNull(value, "value is null"); + } + + @JsonProperty + public K getKey() + { + return key; + } + + @JsonProperty + public V getValue() + { + return value; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SemiTransactionalHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SemiTransactionalHiveMetastore.java new file mode 100644 index 00000000..9e1cefec --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SemiTransactionalHiveMetastore.java @@ -0,0 +1,3373 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.base.Supplier; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; +import com.google.common.util.concurrent.MoreExecutors; +import io.airlift.concurrent.MoreFutures; +import io.airlift.log.Logger; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.HiveACIDWriteType; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveMetastoreClosure; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveVacuumTableHandle; +import io.prestosql.plugin.hive.LocationHandle.WriteMode; +import io.prestosql.plugin.hive.PartitionNotFoundException; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.PartitionUpdate; +import io.prestosql.plugin.hive.VacuumCleaner; +import io.prestosql.plugin.hive.VacuumEligibleTableCollector; +import io.prestosql.plugin.hive.VacuumTableInfoForCleaner; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.StandardErrorCode; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableAlreadyExistsException; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.PrincipalType; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.Trash; +import org.apache.hadoop.hive.common.ValidTxnWriteIdList; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.apache.hadoop.hive.ql.io.AcidUtils; + +import javax.annotation.concurrent.GuardedBy; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.Set; +import java.util.concurrent.CompletableFuture; 
+import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS; +import static io.prestosql.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME; +import static io.prestosql.plugin.hive.HiveUtil.isPrestoView; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static io.prestosql.plugin.hive.HiveWriteUtils.createDirectory; +import static io.prestosql.plugin.hive.HiveWriteUtils.getChildren; +import static io.prestosql.plugin.hive.HiveWriteUtils.getFileStatus; +import static io.prestosql.plugin.hive.HiveWriteUtils.pathExists; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_NEW_DIRECTORY; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.OWNERSHIP; +import static io.prestosql.plugin.hive.util.Statistics.ReduceOperator.SUBTRACT; +import static io.prestosql.plugin.hive.util.Statistics.merge; +import static io.prestosql.plugin.hive.util.Statistics.reduce; +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.StandardErrorCode.TRANSACTION_CONFLICT; +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static org.apache.hadoop.hive.common.FileUtils.makePartName; +import static org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE; +import static org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars.TXN_TIMEOUT; +import static org.apache.hadoop.hive.metastore.conf.MetastoreConf.getTimeVar; + +public class SemiTransactionalHiveMetastore +{ + private static final Logger log = Logger.get(SemiTransactionalHiveMetastore.class); + + private final HiveMetastore delegate; + private final HiveMetastoreClosure closure; + private final HdfsEnvironment hdfsEnvironment; + private final Executor renameExecutor; + private final ScheduledExecutorService vacuumExecutorService; + private final Duration configuredVacuumCleanupInterval; + private final boolean skipDeletionForAlter; + private final boolean skipTargetCleanupOnRollback; + private final ScheduledExecutorService heartbeatExecutor; + private final Optional configuredTransactionHeartbeatInterval; + private final ListeningExecutorService hiveMetastoreClientService; + + private boolean throwOnCleanupFailure; + private int partitionCommitBatchSize; + + @GuardedBy("this") + private final Map> tableActions = new HashMap<>(); + @GuardedBy("this") + private final Map, Action>> partitionActions = new ConcurrentHashMap<>(); + 
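+    // Note on the transactional design: tableActions and partitionActions buffer every metadata
+    // change made inside this transaction as an Action of type ADD, DROP, ALTER or INSERT_EXISTING.
+    // Read methods such as getTable() and getPartition() consult these buffers before falling back
+    // to the delegate metastore, so the underlying Hive metastore only sees the changes once the
+    // transaction is committed.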
@GuardedBy("this") + private final List declaredIntentionsToWrite = new ArrayList<>(); + @GuardedBy("this") + private ExclusiveOperation bufferedExclusiveOperation; + @GuardedBy("this") + private State state = State.EMPTY; + + @GuardedBy("this") + private Optional currentQueryId = Optional.empty(); + @GuardedBy("this") + private Optional> hiveTransactionSupplier = Optional.empty(); + // hiveTransactionSupplier is used to lazily open hive transaction, currentHiveTransaction is needed to do hive transaction cleanup only if a transaction was opened + @GuardedBy("this") + private Optional currentHiveTransaction = Optional.empty(); + + private List vacuumCleanerTasks = new ArrayList<>(); + //In case of transaction tables, multiple vacuum operations can run in parallel creating the exact same data. + //But during rename, since both source and destination are directories (ex: base/delta/delete_delta), + // due to race condition rename on directories may result in corrupted directories. + //ex: base_000000/base_000000/bucket_00000 + //To avoid this, in case of vacuum rename should happen at file level instead of directory level. + private boolean isVacuumIncluded; + + private HiveTableHandle tableHandle; + + public SemiTransactionalHiveMetastore( + HdfsEnvironment hdfsEnvironment, + HiveMetastore delegate, + Executor renameExecutor, + ScheduledExecutorService vacuumExecutorService, + Duration vacuumCleanupInterval, + boolean skipDeletionForAlter, + boolean skipTargetCleanupOnRollback, + Optional hiveTransactionHeartbeatInterval, + ScheduledExecutorService heartbeatService, + ScheduledExecutorService hiveMetastoreClientService, + int hmsWriteBatchSize) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.delegate = requireNonNull(delegate, "delegate is null"); + this.renameExecutor = requireNonNull(renameExecutor, "renameExecutor is null"); + this.vacuumExecutorService = requireNonNull(vacuumExecutorService, "vacuumExecutorService is null"); + this.configuredVacuumCleanupInterval = requireNonNull(vacuumCleanupInterval, "vacuumCleanupInterval is null"); + this.skipDeletionForAlter = skipDeletionForAlter; + this.skipTargetCleanupOnRollback = skipTargetCleanupOnRollback; + this.heartbeatExecutor = heartbeatService; + this.configuredTransactionHeartbeatInterval = requireNonNull(hiveTransactionHeartbeatInterval, "hiveTransactionHeartbeatInterval is null"); + this.hiveMetastoreClientService = MoreExecutors.listeningDecorator(hiveMetastoreClientService); + this.closure = new HiveMetastoreClosure(delegate); + this.partitionCommitBatchSize = hmsWriteBatchSize; + } + + public synchronized List getAllDatabases() + { + checkReadable(); + return delegate.getAllDatabases(); + } + + public synchronized Optional getDatabase(String databaseName) + { + checkReadable(); + return delegate.getDatabase(databaseName); + } + + public synchronized Optional> getAllTables(String databaseName) + { + checkReadable(); + if (!tableActions.isEmpty()) { + throw new UnsupportedOperationException("Listing all tables after adding/dropping/altering tables/views in a transaction is not supported"); + } + return delegate.getAllTables(databaseName); + } + + public synchronized Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + checkReadable(); + Action tableAction = tableActions.get(new SchemaTableName(databaseName, tableName)); + if (tableAction == null) { + return delegate.getTable(identity, databaseName, tableName); + } + switch (tableAction.getType()) { + case ADD: + case ALTER: + case INSERT_EXISTING: + return Optional.of(tableAction.getData().getTable()); + case DROP: + return Optional.empty(); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized Set getSupportedColumnStatistics(Type type) + { + return delegate.getSupportedColumnStatistics(type); + } + + public synchronized PartitionStatistics getTableStatistics(HiveIdentity identity, String databaseName, String tableName) + { + checkReadable(); + Action tableAction = tableActions.get(new SchemaTableName(databaseName, tableName)); + if (tableAction == null) { + return closure.getTableStatistics(identity, databaseName, tableName); + } + switch (tableAction.getType()) { + case ADD: + case ALTER: + case INSERT_EXISTING: + return tableAction.getData().getStatistics(); + case DROP: + return PartitionStatistics.empty(); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized Map getPartitionStatistics(HiveIdentity identity, String databaseName, String tableName, Set partitionNames, Optional
table) + { + checkReadable(); + if (!table.isPresent()) { + return ImmutableMap.of(); + } + TableSource tableSource = getTableSource(databaseName, tableName); + Map, Action> partitionActionsOfTable = partitionActions.computeIfAbsent(table.get().getSchemaTableName(), k -> new HashMap<>()); + ImmutableSet.Builder partitionNamesToQuery = ImmutableSet.builder(); + ImmutableMap.Builder resultBuilder = ImmutableMap.builder(); + for (String partitionName : partitionNames) { + List partitionValues = toPartitionValues(partitionName); + Action partitionAction = partitionActionsOfTable.get(partitionValues); + if (partitionAction == null) { + switch (tableSource) { + case PRE_EXISTING_TABLE: + partitionNamesToQuery.add(partitionName); + break; + case CREATED_IN_THIS_TRANSACTION: + resultBuilder.put(partitionName, PartitionStatistics.empty()); + break; + default: + throw new UnsupportedOperationException("unknown table source"); + } + } + else { + resultBuilder.put(partitionName, partitionAction.getData().getStatistics()); + } + } + + Map delegateResult = closure.getPartitionStatistics(identity, databaseName, tableName, partitionNamesToQuery.build()); + if (!delegateResult.isEmpty()) { + resultBuilder.putAll(delegateResult); + } + else { + partitionNamesToQuery.build().forEach(partitionName -> resultBuilder.put(partitionName, PartitionStatistics.empty())); + } + return resultBuilder.build(); + } + + /** + * This method can only be called when the table is known to exist + */ + @GuardedBy("this") + private TableSource getTableSource(String databaseName, String tableName) + { + checkHoldsLock(); + + checkReadable(); + Action tableAction = tableActions.get(new SchemaTableName(databaseName, tableName)); + if (tableAction == null) { + return TableSource.PRE_EXISTING_TABLE; + } + switch (tableAction.getType()) { + case ADD: + return TableSource.CREATED_IN_THIS_TRANSACTION; + case DROP: + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + case ALTER: + case INSERT_EXISTING: + return TableSource.PRE_EXISTING_TABLE; + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized HivePageSinkMetadata generatePageSinkMetadata(HiveIdentity identity, SchemaTableName schemaTableName) + { + checkReadable(); + Optional
table = getTable(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName()); + if (!table.isPresent()) { + return new HivePageSinkMetadata(schemaTableName, Optional.empty(), ImmutableMap.of()); + } + Map, Action> partitionActionMap = partitionActions.get(schemaTableName); + Map, Optional> modifiedPartitionMap; + if (partitionActionMap == null) { + modifiedPartitionMap = ImmutableMap.of(); + } + else { + ImmutableMap.Builder, Optional> modifiedPartitionMapBuilder = ImmutableMap.builder(); + for (Map.Entry, Action> entry : partitionActionMap.entrySet()) { + modifiedPartitionMapBuilder.put(entry.getKey(), getPartitionFromPartitionAction(entry.getValue())); + } + modifiedPartitionMap = modifiedPartitionMapBuilder.build(); + } + return new HivePageSinkMetadata( + schemaTableName, + table, + modifiedPartitionMap); + } + + public synchronized Optional> getAllViews(String databaseName) + { + checkReadable(); + if (!tableActions.isEmpty()) { + throw new UnsupportedOperationException("Listing all tables after adding/dropping/altering tables/views in a transaction is not supported"); + } + return delegate.getAllViews(databaseName); + } + + public synchronized void createDatabase(HiveIdentity identity, Database database) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.createDatabase(identity, database)); + } + + public synchronized void dropDatabase(HiveIdentity identity, String schemaName) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.dropDatabase(identity, schemaName)); + } + + public synchronized void renameDatabase(HiveIdentity identity, String source, String target) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.renameDatabase(identity, source, target)); + } + + // TODO: Allow updating statistics for 2 tables in the same transaction + public synchronized void setTableStatistics(HiveIdentity identity, Table table, PartitionStatistics tableStatistics) + { + setExclusive((delegate, hdfsEnvironment) -> + delegate.updateTableStatistics(identity, table.getDatabaseName(), table.getTableName(), statistics -> updatePartitionStatistics(statistics, tableStatistics))); + } + + // TODO: Allow updating statistics for 2 tables in the same transaction + public synchronized void setPartitionStatistics(HiveIdentity identity, Table table, Map, PartitionStatistics> partitionStatisticsMap) + { + setExclusive((delegate, hdfsEnvironment) -> + partitionStatisticsMap.forEach((partitionValues, newPartitionStats) -> + delegate.updatePartitionStatistics( + identity, + table.getDatabaseName(), + table.getTableName(), + getPartitionName(table, partitionValues), + oldPartitionStats -> updatePartitionStatistics(oldPartitionStats, newPartitionStats)))); + } + + // For HiveBasicStatistics, we only overwrite the original statistics if the new one is not empty. + // For HiveColumnStatistics, we always overwrite every statistics. 
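+    // For example: if the new basic statistics only carry a row count, the previously stored
+    // file count and data sizes are kept (see firstPresent below), while the column statistics
+    // map is taken entirely from the new statistics.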
+ // TODO: Collect file count, on-disk size and in-memory size during ANALYZE + private PartitionStatistics updatePartitionStatistics(PartitionStatistics oldPartitionStats, PartitionStatistics newPartitionStats) + { + HiveBasicStatistics oldBasicStatistics = oldPartitionStats.getBasicStatistics(); + HiveBasicStatistics newBasicStatistics = newPartitionStats.getBasicStatistics(); + HiveBasicStatistics updatedBasicStatistics = new HiveBasicStatistics( + firstPresent(newBasicStatistics.getFileCount(), oldBasicStatistics.getFileCount()), + firstPresent(newBasicStatistics.getRowCount(), oldBasicStatistics.getRowCount()), + firstPresent(newBasicStatistics.getInMemoryDataSizeInBytes(), oldBasicStatistics.getInMemoryDataSizeInBytes()), + firstPresent(newBasicStatistics.getOnDiskDataSizeInBytes(), oldBasicStatistics.getOnDiskDataSizeInBytes())); + return new PartitionStatistics(updatedBasicStatistics, newPartitionStats.getColumnStatistics()); + } + + private static OptionalLong firstPresent(OptionalLong first, OptionalLong second) + { + return first.isPresent() ? first : second; + } + + /** + * {@code currentLocation} needs to be supplied if a writePath exists for the table. + */ + public synchronized void createTable( + ConnectorSession session, + Table table, + PrincipalPrivileges principalPrivileges, + Optional currentPath, + boolean ignoreExisting, + PartitionStatistics statistics) + { + setShared(); + // When creating a table, it should never have partition actions. This is just a sanity check. + checkNoPartitionAction(table.getDatabaseName(), table.getTableName()); + Action oldTableAction = tableActions.get(table.getSchemaTableName()); + HiveIdentity identity = new HiveIdentity(session); + TableAndMore tableAndMore = new TableAndMore(table, identity, Optional.of(principalPrivileges), currentPath, Optional.empty(), ignoreExisting, statistics, statistics, + HiveSessionProperties.isCollectColumnStatisticsOnWrite(session)); + if (oldTableAction == null) { + HdfsContext hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName()); + tableActions.put(table.getSchemaTableName(), new Action<>(ActionType.ADD, tableAndMore, hdfsContext, identity)); + return; + } + switch (oldTableAction.getType()) { + case DROP: + if (!oldTableAction.getHdfsContext().getIdentity().getUser().equals(session.getUser())) { + throw new PrestoException(TRANSACTION_CONFLICT, "Operation on the same table with different user in the same transaction is not supported"); + } + HdfsContext hdfsContext = new HdfsContext(session, table.getDatabaseName(), table.getTableName()); + tableActions.put(table.getSchemaTableName(), new Action<>(ActionType.ALTER, tableAndMore, hdfsContext, identity)); + break; + case ADD: + case ALTER: + case INSERT_EXISTING: + throw new TableAlreadyExistsException(table.getSchemaTableName()); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized void dropTable(ConnectorSession session, String databaseName, String tableName) + { + setShared(); + // Dropping table with partition actions requires cleaning up staging data, which is not implemented yet. 
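+        // The drop is buffered as a DROP action and applied to the metastore when the transaction
+        // commits; dropping a table that was added, altered or inserted into within the same
+        // transaction is rejected below.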
+ checkNoPartitionAction(databaseName, tableName); + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + Action oldTableAction = tableActions.get(schemaTableName); + if (oldTableAction == null || oldTableAction.getType() == ActionType.ALTER) { + HdfsContext hdfsContext = new HdfsContext(session, databaseName, tableName); + HiveIdentity identity = new HiveIdentity(session); + tableActions.put(schemaTableName, new Action<>(ActionType.DROP, null, hdfsContext, identity)); + return; + } + switch (oldTableAction.getType()) { + case DROP: + throw new TableNotFoundException(schemaTableName); + case ADD: + case ALTER: + case INSERT_EXISTING: + throw new UnsupportedOperationException("dropping a table added/modified in the same transaction is not supported"); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized void replaceView(HiveIdentity identity, String databaseName, String tableName, Table table, PrincipalPrivileges principalPrivileges) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.replaceTable(identity, databaseName, tableName, table, principalPrivileges)); + } + + public synchronized void renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.renameTable(identity, databaseName, tableName, newDatabaseName, newTableName)); + } + + public synchronized void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.commentTable(identity, databaseName, tableName, comment)); + } + + public synchronized void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.addColumn(identity, databaseName, tableName, columnName, columnType, columnComment)); + } + + public synchronized void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.renameColumn(identity, databaseName, tableName, oldColumnName, newColumnName)); + } + + public synchronized void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.dropColumn(identity, databaseName, tableName, columnName)); + } + + public synchronized void finishInsertIntoExistingTable( + ConnectorSession session, + String databaseName, + String tableName, + Path currentLocation, + List fileNames, + PartitionStatistics statisticsUpdate, + HiveACIDWriteType acidWriteType) + { + // Data can only be inserted into partitions and unpartitioned tables. They can never be inserted into a partitioned table. + // Therefore, this method assumes that the table is unpartitioned. 
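+        // The existing table statistics are merged with statisticsUpdate (unless statistics
+        // collection is disabled for this write type, see canUpdateStats below) and an
+        // INSERT_EXISTING action is buffered together with the write location and file names.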
+ setShared(); + HiveIdentity identity = new HiveIdentity(session); + isVacuumIncluded |= HiveACIDWriteType.isVacuum(acidWriteType); + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + Action oldTableAction = tableActions.get(schemaTableName); + if (oldTableAction == null) { + Table table = delegate.getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(schemaTableName)); + HdfsContext hdfsContext = new HdfsContext(session, databaseName, tableName); + PartitionStatistics mergedStatistics = statisticsUpdate; + boolean updateStats = canUpdateStats(session, acidWriteType); + if (updateStats) { + PartitionStatistics currentStatistics = getTableStatistics(identity, databaseName, tableName); + mergedStatistics = merge(currentStatistics, statisticsUpdate); + } + tableActions.put( + schemaTableName, + new Action<>( + ActionType.INSERT_EXISTING, + new TableAndMore( + table, + identity, + Optional.empty(), + Optional.of(currentLocation), + Optional.of(fileNames), + false, + mergedStatistics, + statisticsUpdate, + updateStats), + hdfsContext, + identity)); + return; + } + + switch (oldTableAction.getType()) { + case DROP: + throw new TableNotFoundException(schemaTableName); + case ADD: + case ALTER: + case INSERT_EXISTING: + throw new UnsupportedOperationException("Inserting into an unpartitioned table that were added, altered, or inserted into in the same transaction is not supported"); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized void truncateUnpartitionedTable(ConnectorSession session, String databaseName, String tableName) + { + checkReadable(); + Optional
table = getTable(new HiveIdentity(session), databaseName, tableName); + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + if (!table.isPresent()) { + throw new TableNotFoundException(schemaTableName); + } + if (!table.get().getTableType().equals(MANAGED_TABLE.toString())) { + throw new PrestoException(NOT_SUPPORTED, "Cannot delete from non-managed Hive table"); + } + if (!table.get().getPartitionColumns().isEmpty()) { + throw new IllegalArgumentException("Table is partitioned"); + } + + Path path = new Path(table.get().getStorage().getLocation()); + HdfsContext context = new HdfsContext(session, databaseName, tableName); + boolean isAutoPurge = "true".equalsIgnoreCase(table.get().getParameters().get("auto.purge")); + setExclusive((delegate, hdfsEnvironment) -> { + RecursiveDeleteResult recursiveDeleteResult; + if (isAutoPurge) { + recursiveDeleteResult = recursiveDeleteFiles(hdfsEnvironment, context, path, ImmutableSet.of(""), false); + } + else { + recursiveDeleteResult = moveToTrash(hdfsEnvironment, context, path); + } + + if (!recursiveDeleteResult.getNotDeletedEligibleItems().isEmpty()) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, format( + "Error deleting from unpartitioned table %s. These items can not be deleted: %s", + schemaTableName, + recursiveDeleteResult.getNotDeletedEligibleItems())); + } + }); + } + + public synchronized Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + Optional
table = getTable(identity, databaseName, tableName); + return doGetPartitionNames(identity, databaseName, tableName, Optional.empty(), table); + } + + public synchronized Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts, Table table) + { + return doGetPartitionNames(identity, databaseName, tableName, Optional.of(parts), Optional.of(table)); + } + + @GuardedBy("this") + private Optional> doGetPartitionNames(HiveIdentity identity, String databaseName, String tableName, Optional> parts, Optional
table) + { + checkHoldsLock(); + + checkReadable(); + if (!table.isPresent()) { + return Optional.empty(); + } + List partitionNames; + TableSource tableSource = getTableSource(databaseName, tableName); + switch (tableSource) { + case CREATED_IN_THIS_TRANSACTION: + partitionNames = ImmutableList.of(); + break; + case PRE_EXISTING_TABLE: { + Optional> partitionNameResult; + if (parts.isPresent()) { + partitionNameResult = delegate.getPartitionNamesByParts(identity, databaseName, tableName, parts.get()); + } + else { + partitionNameResult = delegate.getPartitionNames(identity, databaseName, tableName); + } + if (!partitionNameResult.isPresent()) { + throw new PrestoException(TRANSACTION_CONFLICT, format("Table %s.%s was dropped by another transaction", databaseName, tableName)); + } + partitionNames = partitionNameResult.get(); + break; + } + default: + throw new UnsupportedOperationException("Unknown table source"); + } + Map, Action> partitionActionsOfTable = partitionActions.computeIfAbsent(table.get().getSchemaTableName(), k -> new HashMap<>()); + ImmutableList.Builder resultBuilder = ImmutableList.builder(); + // alter/remove newly-altered/dropped partitions from the results from underlying metastore + for (String partitionName : partitionNames) { + List partitionValues = toPartitionValues(partitionName); + Action partitionAction = partitionActionsOfTable.get(partitionValues); + if (partitionAction == null) { + resultBuilder.add(partitionName); + continue; + } + switch (partitionAction.getType()) { + case ADD: + throw new PrestoException(TRANSACTION_CONFLICT, format("Another transaction created partition %s in table %s.%s", partitionValues, databaseName, tableName)); + case DROP: + // do nothing + break; + case ALTER: + case INSERT_EXISTING: + resultBuilder.add(partitionName); + break; + default: + throw new IllegalStateException("Unknown action type"); + } + } + // add newly-added partitions to the results from underlying metastore + if (!partitionActionsOfTable.isEmpty()) { + List columnNames = table.get().getPartitionColumns().stream().map(Column::getName).collect(Collectors.toList()); + for (Action partitionAction : partitionActionsOfTable.values()) { + if (partitionAction.getType() == ActionType.ADD) { + List values = partitionAction.getData().getPartition().getValues(); + if (!parts.isPresent() || partitionValuesMatch(values, parts.get())) { + resultBuilder.add(makePartName(columnNames, values)); + } + } + } + } + return Optional.of(resultBuilder.build()); + } + + private static boolean partitionValuesMatch(List values, List pattern) + { + checkArgument(values.size() == pattern.size()); + for (int i = 0; i < values.size(); i++) { + if (pattern.get(i).isEmpty()) { + // empty string match everything + continue; + } + if (values.get(i).equals(pattern.get(i))) { + return false; + } + } + return true; + } + + public synchronized Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + checkReadable(); + TableSource tableSource = getTableSource(databaseName, tableName); + Map, Action> partitionActionsOfTable = partitionActions.computeIfAbsent(new SchemaTableName(databaseName, tableName), k -> new HashMap<>()); + Action partitionAction = partitionActionsOfTable.get(partitionValues); + if (partitionAction != null) { + return getPartitionFromPartitionAction(partitionAction); + } + switch (tableSource) { + case PRE_EXISTING_TABLE: + return delegate.getPartition(identity, databaseName, tableName, partitionValues); + case 
CREATED_IN_THIS_TRANSACTION: + return Optional.empty(); + default: + throw new UnsupportedOperationException("unknown table source"); + } + } + + public synchronized Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + checkReadable(); + TableSource tableSource = getTableSource(databaseName, tableName); + Map, Action> partitionActionsOfTable = partitionActions.computeIfAbsent(new SchemaTableName(databaseName, tableName), k -> new HashMap<>()); + ImmutableList.Builder partitionNamesToQuery = ImmutableList.builder(); + ImmutableMap.Builder> resultBuilder = ImmutableMap.builder(); + for (String partitionName : partitionNames) { + List partitionValues = toPartitionValues(partitionName); + Action partitionAction = partitionActionsOfTable.get(partitionValues); + if (partitionAction == null) { + switch (tableSource) { + case PRE_EXISTING_TABLE: + partitionNamesToQuery.add(partitionName); + break; + case CREATED_IN_THIS_TRANSACTION: + resultBuilder.put(partitionName, Optional.empty()); + break; + default: + throw new UnsupportedOperationException("unknown table source"); + } + } + else { + resultBuilder.put(partitionName, getPartitionFromPartitionAction(partitionAction)); + } + } + Map> delegateResult = delegate.getPartitionsByNames(identity, databaseName, tableName, partitionNamesToQuery.build()); + resultBuilder.putAll(delegateResult); + return resultBuilder.build(); + } + + private static Optional getPartitionFromPartitionAction(Action partitionAction) + { + switch (partitionAction.getType()) { + case ADD: + case ALTER: + case INSERT_EXISTING: + return Optional.of(partitionAction.getData().getAugmentedPartitionForInTransactionRead()); + case DROP: + return Optional.empty(); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized void addPartition( + ConnectorSession session, + String databaseName, + String tableName, + Partition partition, + Path currentLocation, + PartitionStatistics statistics, + HiveACIDWriteType acidWriteType) + { + setShared(); + checkArgument(getPrestoQueryId(partition).isPresent()); + Map, Action> partitionActionsOfTable = partitionActions.computeIfAbsent(new SchemaTableName(databaseName, tableName), k -> new ConcurrentHashMap<>()); + Action oldPartitionAction = partitionActionsOfTable.get(partition.getValues()); + HdfsContext hdfsContext = new HdfsContext(session, databaseName, tableName); + HiveIdentity identity = new HiveIdentity(session); + boolean canUpdateStats = canUpdateStats(session, acidWriteType); + this.partitionCommitBatchSize = HiveSessionProperties.getMetastoreWriteBatchSize(session); + + if (oldPartitionAction == null) { + partitionActionsOfTable.put( + partition.getValues(), + new Action<>(ActionType.ADD, + new PartitionAndMore(identity, partition, currentLocation, Optional.empty(), statistics, statistics, canUpdateStats), + hdfsContext, identity)); + return; + } + switch (oldPartitionAction.getType()) { + case DROP: { + if (!oldPartitionAction.getHdfsContext().getIdentity().getUser().equals(session.getUser())) { + throw new PrestoException(TRANSACTION_CONFLICT, "Operation on the same partition with different user in the same transaction is not supported"); + } + partitionActionsOfTable.put( + partition.getValues(), + new Action<>(ActionType.ALTER, new PartitionAndMore(identity, partition, currentLocation, Optional.empty(), statistics, statistics, canUpdateStats), hdfsContext, identity)); + break; + } + case ADD: + case ALTER: + case 
INSERT_EXISTING: + throw new PrestoException(ALREADY_EXISTS, format("Partition already exists for table '%s.%s': %s", databaseName, tableName, partition.getValues())); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized void dropPartition(ConnectorSession session, String databaseName, String tableName, List partitionValues) + { + setShared(); + Map, Action> partitionActionsOfTable = partitionActions.computeIfAbsent(new SchemaTableName(databaseName, tableName), k -> new ConcurrentHashMap<>()); + Action oldPartitionAction = partitionActionsOfTable.get(partitionValues); + if (oldPartitionAction == null) { + HdfsContext hdfsContext = new HdfsContext(session, databaseName, tableName); + HiveIdentity identity = new HiveIdentity(session); + partitionActionsOfTable.put(partitionValues, new Action<>(ActionType.DROP, null, hdfsContext, identity)); + return; + } + switch (oldPartitionAction.getType()) { + case DROP: + throw new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), partitionValues); + case ADD: + case ALTER: + case INSERT_EXISTING: + throw new PrestoException( + NOT_SUPPORTED, + format("dropping a partition added in the same transaction is not supported: %s %s %s", databaseName, tableName, partitionValues)); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized void finishInsertIntoExistingPartition( + ConnectorSession session, + String databaseName, + String tableName, + List partitionValues, + Path currentLocation, + List fileNames, + PartitionStatistics statisticsUpdate, + HiveACIDWriteType acidWriteType) + { + setShared(); + isVacuumIncluded |= HiveACIDWriteType.isVacuum(acidWriteType); + HiveIdentity identity = new HiveIdentity(session); + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + Map, Action> partitionActionsOfTable = partitionActions.computeIfAbsent(schemaTableName, k -> new LinkedHashMap<>()); + Action oldPartitionAction = partitionActionsOfTable.get(partitionValues); + if (oldPartitionAction == null) { + Partition partition = delegate.getPartition(identity, databaseName, tableName, partitionValues) + .orElseThrow(() -> new PartitionNotFoundException(schemaTableName, partitionValues)); + String partitionName = getPartitionName(identity, databaseName, tableName, partitionValues); + PartitionStatistics mergedStatistics = statisticsUpdate; + boolean updateStats = canUpdateStats(session, acidWriteType); + if (updateStats) { + PartitionStatistics currentStatistics = closure.getPartitionStatistics(identity, databaseName, tableName, ImmutableSet.of(partitionName)).get(partitionName); + if (currentStatistics == null) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "currentStatistics is null"); + } + mergedStatistics = merge(currentStatistics, statisticsUpdate); + } + HdfsContext context = new HdfsContext(session, databaseName, tableName); + partitionActionsOfTable.put( + partitionValues, + new Action<>( + ActionType.INSERT_EXISTING, + new PartitionAndMore( + identity, + partition, + currentLocation, + Optional.of(fileNames), + mergedStatistics, + statisticsUpdate, + updateStats), + context, + identity)); + return; + } + + switch (oldPartitionAction.getType()) { + case DROP: + throw new PartitionNotFoundException(schemaTableName, partitionValues); + case ADD: + case ALTER: + case INSERT_EXISTING: + throw new UnsupportedOperationException("Inserting into a partition that were added, altered, or inserted into in the 
same transaction is not supported"); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + private boolean canUpdateStats(ConnectorSession session, HiveACIDWriteType acidWriteType) + { + //Skip stats update for Update/Delete/Vacuum + boolean updateStats; + if (HiveACIDWriteType.VACUUM_UNIFY == acidWriteType) { + updateStats = false; + } + else { + updateStats = HiveSessionProperties.isCollectColumnStatisticsOnWrite(session) && + !HiveACIDWriteType.isUpdateOrDelete(acidWriteType) && + !(HiveACIDWriteType.VACUUM == acidWriteType); + } + return updateStats; + } + + private String getPartitionName(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + Table table = getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + return getPartitionName(table, partitionValues); + } + + private String getPartitionName(Table table, List partitionValues) + { + List columnNames = table.getPartitionColumns().stream() + .map(Column::getName) + .collect(toImmutableList()); + return makePartName(columnNames, partitionValues); + } + + public synchronized void createRole(String role, String grantor) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.createRole(role, grantor)); + } + + public synchronized void dropRole(String role) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.dropRole(role)); + } + + public synchronized Set listRoles() + { + checkReadable(); + return delegate.listRoles(); + } + + public synchronized void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.grantRoles(roles, grantees, withAdminOption, grantor)); + } + + public synchronized void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.revokeRoles(roles, grantees, adminOptionFor, grantor)); + } + + public synchronized Set listRoleGrants(HivePrincipal principal) + { + checkReadable(); + return delegate.listRoleGrants(principal); + } + + /** + * listColumnPrivileges + * @param databaseName databaseName + * @param tableName tableName + * @param columnName columnName + * @param principal principal + * @return HivePrivilegeInfo + */ + public synchronized Set listColumnPrivileges(String databaseName, String tableName, + String columnName, HivePrincipal principal) + { + return delegate.listColumnPrivileges(databaseName, tableName, columnName, principal); + } + + /** + * listSchemaPrivileges + * @param databaseName databaseName + * @param tableName tableName + * @param principal principal + * @return HivePrivilegeInfo + */ + public synchronized Set listSchemaPrivileges(String databaseName, String tableName, + HivePrincipal principal) + { + return delegate.listSchemaPrivileges(databaseName, tableName, principal); + } + + public synchronized Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + checkReadable(); + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + Action tableAction = tableActions.get(schemaTableName); + if (tableAction == null) { + return delegate.listTablePrivileges(databaseName, tableName, principal); + } + switch (tableAction.getType()) { + case ADD: + case ALTER: { + if (principal.getType() == PrincipalType.ROLE) { + return ImmutableSet.of(); + } + if 
(!principal.getName().equals(tableAction.getData().getTable().getOwner())) { + return ImmutableSet.of(); + } + Collection privileges = tableAction.getData().getPrincipalPrivileges().getUserPrivileges().get(principal.getName()); + return ImmutableSet.builder() + .addAll(privileges) + .add(new HivePrivilegeInfo(OWNERSHIP, true, new HivePrincipal(USER, principal.getName()), new HivePrincipal(USER, principal.getName()))) + .build(); + } + case INSERT_EXISTING: + return delegate.listTablePrivileges(databaseName, tableName, principal); + case DROP: + throw new TableNotFoundException(schemaTableName); + default: + throw new IllegalStateException("Unknown action type"); + } + } + + public synchronized void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.grantTablePrivileges(databaseName, tableName, grantee, privileges)); + } + + public synchronized void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + setExclusive((delegate, hdfsEnvironment) -> delegate.revokeTablePrivileges(databaseName, tableName, grantee, privileges)); + } + + public synchronized void declareIntentionToWrite(ConnectorSession session, WriteMode writeMode, Path stagingPathRoot, SchemaTableName schemaTableName) + { + setShared(); + if (writeMode == WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY) { + Map, Action> partitionActionsOfTable = partitionActions.get(schemaTableName); + if (partitionActionsOfTable != null && !partitionActionsOfTable.isEmpty()) { + throw new PrestoException(NOT_SUPPORTED, "Can not insert into a table with a partition that has been modified in the same transaction when Presto is configured to skip temporary directories."); + } + } + HdfsContext hdfsContext = new HdfsContext(session, schemaTableName.getSchemaName(), schemaTableName.getTableName()); + HiveIdentity identity = new HiveIdentity(session); + declaredIntentionsToWrite.add(new DeclaredIntentionToWrite(writeMode, hdfsContext, identity, session.getQueryId(), stagingPathRoot, schemaTableName)); + } + + public synchronized void commit() + { + try { + switch (state) { + case EMPTY: + //release locks if any. + commitTransaction(); + break; + case SHARED_OPERATION_BUFFERED: + commitShared(); + break; + case EXCLUSIVE_OPERATION_BUFFERED: + requireNonNull(bufferedExclusiveOperation, "bufferedExclusiveOperation is null"); + bufferedExclusiveOperation.execute(delegate, hdfsEnvironment); + break; + case FINISHED: + throw new IllegalStateException("Tried to commit buffered metastore operations after transaction has been committed/aborted"); + default: + throw new IllegalStateException("Unknown state"); + } + } + finally { + state = State.FINISHED; + } + } + + public void submitCleanupTasks() + { + if (vacuumCleanerTasks.size() > 0) { + vacuumCleanerTasks.forEach(c -> c.submitVacuumCleanupTask()); + } + if (tableHandle != null) { + VacuumEligibleTableCollector.finishVacuum(tableHandle.getSchemaTableName().toString()); + } + } + + public synchronized void rollback() + { + try { + switch (state) { + case EMPTY: + //release locks if any. 
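+ // abortTransaction() releases the Hive ACID transaction (if one was opened) and cancels its heartbeat task; control then falls through to the break shared with EXCLUSIVE_OPERATION_BUFFERED.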
+ abortTransaction(); + case EXCLUSIVE_OPERATION_BUFFERED: + break; + case SHARED_OPERATION_BUFFERED: + rollbackShared(); + break; + case FINISHED: + throw new IllegalStateException("Tried to rollback buffered metastore operations after transaction has been committed/aborted"); + default: + throw new IllegalStateException("Unknown state"); + } + } + finally { + state = State.FINISHED; + } + } + + public void beginQuery(ConnectorSession session) + { + String queryId = session.getQueryId(); + HiveIdentity identity = new HiveIdentity(session); + + synchronized (this) { + checkState( + !currentQueryId.isPresent(), + "Query already begun: %s while starting query %s", + currentQueryId, + queryId); + currentQueryId = Optional.of(queryId); + + if (!hiveTransactionSupplier.isPresent()) { + hiveTransactionSupplier = Optional.of(() -> { + long heartbeatInterval = configuredTransactionHeartbeatInterval + .map(Duration::toMillis) + .orElseGet(this::getServerExpectedHeartbeatIntervalMillis); + long transactionId = delegate.openTransaction(identity); + log.debug("Using hive transaction %s for query %s", transactionId, queryId); + + ScheduledFuture heartbeatTask = heartbeatExecutor.scheduleAtFixedRate( + () -> delegate.sendTransactionHeartbeat(identity, transactionId), + 0, + heartbeatInterval, + MILLISECONDS); + + return new HiveTransaction(identity, transactionId, heartbeatTask); + }); + } + } + } + + private long getServerExpectedHeartbeatIntervalMillis() + { + String hiveServerTransactionTimeout = delegate.getConfigValue(TXN_TIMEOUT.getVarname()).orElseGet(() -> TXN_TIMEOUT.getDefaultVal().toString()); + Configuration configuration = new Configuration(false); + configuration.set(TXN_TIMEOUT.toString(), hiveServerTransactionTimeout); + return getTimeVar(configuration, TXN_TIMEOUT, MILLISECONDS) / 2; + } + + public synchronized Optional getValidWriteIds(ConnectorSession session, HiveTableHandle tableHandle, boolean isVacuum) + { + String queryId = session.getQueryId(); + checkState(currentQueryId.equals(Optional.of(queryId)), "Invalid query id %s while current query is", queryId, currentQueryId); + if (!AcidUtils.isTransactionalTable(tableHandle.getTableParameters().orElseThrow(() -> new IllegalStateException("tableParameters missing")))) { + return Optional.empty(); + } + + if (!currentHiveTransaction.isPresent()) { + currentHiveTransaction = Optional.of(hiveTransactionSupplier + .orElseThrow(() -> new IllegalStateException("hiveTransactionSupplier is not set")) + .get()); + } + + return Optional.of(currentHiveTransaction.get().getValidWriteIds(delegate, tableHandle, queryId, isVacuum)); + } + + public synchronized Optional getTableWriteId(ConnectorSession session, HiveTableHandle tableHandle, HiveACIDWriteType writeType) + { + String queryId = session.getQueryId(); + checkState(currentQueryId.equals(Optional.of(queryId)), "Invalid query id %s while current query is", queryId, currentQueryId); + if (!AcidUtils.isTransactionalTable(tableHandle.getTableParameters().orElseThrow(() -> new IllegalStateException("tableParameters missing")))) { + return Optional.empty(); + } + + if (!currentHiveTransaction.isPresent()) { + currentHiveTransaction = Optional.of(hiveTransactionSupplier + .orElseThrow(() -> new IllegalStateException("hiveTransactionSupplier is not set")) + .get()); + } + + return Optional.of(currentHiveTransaction.get().getTableWriteId(delegate, tableHandle, writeType, queryId)); + } + + public synchronized void cleanupQuery(ConnectorSession session) + { + String queryId = session.getQueryId(); + 
checkState(currentQueryId.equals(Optional.of(queryId)), "Invalid query id %s while current query is", queryId, currentQueryId); + currentQueryId = Optional.empty(); + } + + private void commitTransaction() + { + Optional transaction = currentHiveTransaction; + + if (!transaction.isPresent()) { + return; + } + + long transactionId = transaction.get().getTransactionId(); + // Any failure around aborted transactions, etc would be handled by Hive Metastore commit and PrestoException will be thrown + delegate.commitTransaction(transaction.get().getIdentity(), transactionId); + + currentHiveTransaction = Optional.empty(); + hiveTransactionSupplier = Optional.empty(); + ScheduledFuture heartbeatTask = transaction.get().getHeartbeatTask(); + heartbeatTask.cancel(true); + } + + private void abortTransaction() + { + Optional transaction = currentHiveTransaction; + + if (!transaction.isPresent()) { + return; + } + + long transactionId = transaction.get().getTransactionId(); + // Any failure around aborted transactions, etc would be handled by Hive Metastore commit and PrestoException will be thrown + delegate.abortTransaction(transaction.get().getIdentity(), transactionId); + + currentHiveTransaction = Optional.empty(); + hiveTransactionSupplier = Optional.empty(); + ScheduledFuture heartbeatTask = transaction.get().getHeartbeatTask(); + heartbeatTask.cancel(true); + } + + @GuardedBy("this") + private void commitShared() + { + checkHoldsLock(); + + Committer committer = new Committer(); + try { + List> tableActionsFutures = this.tableActions.entrySet().stream() + .map(entry -> hiveMetastoreClientService.submit(() -> { + SchemaTableName schemaTableName = entry.getKey(); + Action action = entry.getValue(); + switch (action.getType()) { + case DROP: + committer.prepareDropTable(action.getIdentity(), schemaTableName); + break; + case ALTER: + committer.prepareAlterTable(action.getHdfsContext(), action.getIdentity(), action.getData()); + break; + case ADD: + committer.prepareAddTable(action.getHdfsContext(), action.getData()); + break; + case INSERT_EXISTING: + committer.prepareInsertExistingTable(action.getHdfsContext(), action.getData()); + break; + default: + throw new IllegalStateException("Unknown action type"); + } + })).collect(Collectors.toList()); + waitForCompletion(tableActionsFutures, "Table Actions preparation"); + + for (Map.Entry, Action>> tableEntry : partitionActions.entrySet()) { + SchemaTableName schemaTableName = tableEntry.getKey(); + Collection> values = tableEntry.getValue().values(); + if (values.isEmpty()) { + continue; + } + HiveIdentity identity = values.iterator().next().getIdentity(); + Table table = getTable(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(schemaTableName)); + List> partitionActionsFutures = tableEntry.getValue().entrySet().stream() + .map(partitionEntry -> hiveMetastoreClientService.submit(() -> { + List partitionValues = partitionEntry.getKey(); + Action action = partitionEntry.getValue(); + switch (action.getType()) { + case DROP: + committer.prepareDropPartition(action.getIdentity(), schemaTableName, partitionValues); + break; + case ALTER: + committer.prepareAlterPartition(table, action.getHdfsContext(), action.getIdentity(), action.getData()); + break; + case ADD: + committer.prepareAddPartition(table, action.getHdfsContext(), action.getIdentity(), action.getData()); + break; + case INSERT_EXISTING: + committer.prepareInsertExistingPartition(table, action.getHdfsContext(), 
action.getIdentity(), action.getData()); + break; + default: + throw new IllegalStateException("Unknown action type"); + } + })).collect(Collectors.toList()); + waitForCompletion(partitionActionsFutures, "Partitions Actions preparation for table " + schemaTableName.toString()); + } + + // Wait for all renames submitted for "INSERT_EXISTING" action to finish + committer.waitForAsyncRenames(); + + // At this point, all file system operations, whether asynchronously issued or not, have completed successfully. + // We are moving on to metastore operations now. + + committer.executeAddTableOperations(); + committer.executeAlterTableOperations(); + committer.executeAlterPartitionOperations(); + committer.executeAddPartitionOperations(); + committer.executeUpdateStatisticsOperations(); + //finally commit the transaction + commitTransaction(); + } + catch (Throwable t) { + committer.cancelUnstartedAsyncRenames(); + + committer.undoUpdateStatisticsOperations(); + committer.undoAddPartitionOperations(); + committer.undoAddTableOperations(); + + committer.waitForAsyncRenamesSuppressThrowables(); + + // fileRenameFutures must all come back before any file system cleanups are carried out. + // Otherwise, files that should be deleted may be created after cleanup is done. + committer.executeCleanupTasksForAbort(declaredIntentionsToWrite); + + committer.executeRenameTasksForAbort(); + + // Partition directory must be put back before relevant metastore operation can be undone + committer.undoAlterTableOperations(); + committer.undoAlterPartitionOperations(); + + rollbackShared(); + + throw t; + } + + try { + // After this line, operations are no longer reversible. + // The next section will deal with "dropping table/partition". Commit may still fail in + // this section. Even if commit fails, cleanups, instead of rollbacks, will be executed. + + committer.executeIrreversibleMetastoreOperations(); + + // If control flow reached this point, this commit is considered successful no matter + // what happens later. The only kind of operations that haven't been carried out yet + // are cleanups. + + // The program control flow will go to finally next. And cleanup will run because + // moveForwardInFinally has been set to false. + } + finally { + // In this method, all operations are best-effort clean up operations. + // If any operation fails, the error will be logged and ignored. + // Additionally, other clean up operations should still be attempted. 
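+ // The deletion tasks and staging-directory cleanup below always run once the commit itself has succeeded, even when executeIrreversibleMetastoreOperations() threw above; failures here are reported through logCleanupFailure() rather than failing the commit.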
+ + // Execute deletion tasks + committer.executeDeletionTasksForFinish(); + + // Clean up empty staging directories (that may recursively contain empty directories) + committer.deleteEmptyStagingDirectories(declaredIntentionsToWrite); + } + } + + private static void waitForCompletion(List> tableActionsFuture, String operation) + { + ListenableFuture> listListenableFuture = Futures.allAsList(tableActionsFuture); + try { + listListenableFuture.get(); + } + catch (InterruptedException | ExecutionException e) { + if (e.getCause() instanceof PrestoException) { + throw (PrestoException) e.getCause(); + } + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Error during " + operation, e.getCause()); + } + } + + public void initiateVacuumCleanupTasks(HiveVacuumTableHandle vacuumTableHandle, + ConnectorSession session, + List partitionUpdates) + { + if (vacuumTableHandle.getRanges() != null && vacuumTableHandle.getRanges().size() > 0) { + HdfsContext hdfsContext = new HdfsContext(session, vacuumTableHandle.getSchemaName(), vacuumTableHandle.getTableName()); + + VacuumTableInfoForCleaner info; + long maxId = Long.MIN_VALUE; + if (vacuumTableHandle.isUnifyVacuum()) { + maxId = vacuumTableHandle.getLocationHandle().getJsonSerializablewriteIdInfo().get().getMaxWriteId(); + } + else { + for (List partitionRange : vacuumTableHandle.getRanges().values()) { + for (HiveVacuumTableHandle.Range range : partitionRange) { + if (maxId < range.getMax()) { + maxId = range.getMax(); + } + } + } + } + for (int index = 0; index < partitionUpdates.size(); index++) { + info = new VacuumTableInfoForCleaner(vacuumTableHandle.getSchemaName(), + vacuumTableHandle.getTableName(), + partitionUpdates.get(index).getName(), + maxId, + partitionUpdates.get(index).getTargetPath()); + vacuumCleanerTasks.add(new VacuumCleaner(info, this, hdfsEnvironment, hdfsContext)); + } + } + } + + public void refreshMetastoreCache() + { + delegate.refreshMetastoreCache(); + } + + private class Committer + { + private final AtomicBoolean fileRenameCancelled = new AtomicBoolean(false); + private final List> fileRenameFutures = new CopyOnWriteArrayList<>(); + + // File system + // For file system changes, only operations outside of writing paths (as specified in declared intentions to write) + // need to MOVE_BACKWARD tasks scheduled. Files in writing paths are handled by rollbackShared(). 
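+ // deletionTasksForFinish run only on the successful-commit path, while cleanUpTasksForAbort and renameTasksForAbort are used to undo file-system changes when the commit is rolled back.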
+ private final List deletionTasksForFinish = new CopyOnWriteArrayList<>(); + private final List cleanUpTasksForAbort = new CopyOnWriteArrayList<>(); + private final List renameTasksForAbort = new CopyOnWriteArrayList<>(); + + // Metastore + private final List addTableOperations = new CopyOnWriteArrayList<>(); + private final List alterTableOperations = new CopyOnWriteArrayList<>(); + private final Map partitionAdders = new ConcurrentHashMap<>(); + private final List alterPartitionOperations = new CopyOnWriteArrayList<>(); + private final List updateStatisticsOperations = new CopyOnWriteArrayList<>(); + private final List partitionNames = new ArrayList<>(); + private final List metastoreDeleteOperations = new CopyOnWriteArrayList<>(); + + // Flag for better error message + private boolean deleteOnly = true; + + private void prepareDropTable(HiveIdentity identity, SchemaTableName schemaTableName) + { + metastoreDeleteOperations.add(new IrreversibleMetastoreOperation( + format("drop table %s", schemaTableName), + () -> delegate.dropTable(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName(), true))); + } + + private void prepareAlterTable(HdfsContext hdfsContext, HiveIdentity identity, TableAndMore tableAndMore) + { + deleteOnly = false; + + Table table = tableAndMore.getTable(); + String targetLocation = table.getStorage().getLocation(); + Table oldTable = delegate.getTable(identity, table.getDatabaseName(), table.getTableName()) + .orElseThrow(() -> new PrestoException(TRANSACTION_CONFLICT, "The table that this transaction modified was deleted in another transaction. " + table.getSchemaTableName())); + String oldTableLocation = oldTable.getStorage().getLocation(); + Path oldTablePath = new Path(oldTableLocation); + + // Location of the old table and the new table can be different because we allow arbitrary directories through LocationService. 
+ // If the location of the old table is the same as the location of the new table: + // * Rename the old data directory to a temporary path with a special suffix + // * Remember we will need to delete that directory at the end if transaction successfully commits + // * Remember we will need to undo the rename if transaction aborts + // Otherwise, + // * Remember we will need to delete the location of the old partition at the end if transaction successfully commits + if (targetLocation.equals(oldTableLocation)) { + String queryId = hdfsContext.getQueryId().orElseThrow(() -> new IllegalArgumentException("query ID not present")); + Path oldTableStagingPath = new Path(oldTablePath.getParent(), "_temp_" + oldTablePath.getName() + "_" + queryId); + renameDirectory( + hdfsContext, + hdfsEnvironment, + oldTablePath, + oldTableStagingPath, + () -> renameTasksForAbort.add(new DirectoryRenameTask(hdfsContext, oldTableStagingPath, oldTablePath))); + if (!skipDeletionForAlter) { + deletionTasksForFinish.add(new DirectoryDeletionTask(hdfsContext, oldTableStagingPath)); + } + } + else { + if (!skipDeletionForAlter) { + deletionTasksForFinish.add(new DirectoryDeletionTask(hdfsContext, oldTablePath)); + } + } + + Path currentPath = tableAndMore.getCurrentLocation() + .orElseThrow(() -> new IllegalArgumentException("location should be present for alter table")); + Path targetPath = new Path(targetLocation); + if (!targetPath.equals(currentPath)) { + renameDirectory( + hdfsContext, + hdfsEnvironment, + currentPath, + targetPath, + () -> cleanUpTasksForAbort.add(new DirectoryCleanUpTask(hdfsContext, targetPath, true))); + } + // Partition alter must happen regardless of whether original and current location is the same + // because metadata might change: e.g. storage format, column types, etc + alterTableOperations.add(new AlterTableOperation(tableAndMore.getIdentity(), tableAndMore.getTable(), oldTable, tableAndMore.getPrincipalPrivileges())); + + updateStatisticsOperations.add(new UpdateStatisticsOperation( + tableAndMore.getIdentity(), + table.getSchemaTableName(), + Optional.empty(), + tableAndMore.getStatisticsUpdate(), + false)); + } + + private void prepareAddTable(HdfsContext context, TableAndMore tableAndMore) + { + deleteOnly = false; + + Table table = tableAndMore.getTable(); + if (table.getTableType().equals(MANAGED_TABLE.name())) { + String targetLocation = table.getStorage().getLocation(); + checkArgument(!targetLocation.isEmpty(), "target location is empty"); + Optional currentPath = tableAndMore.getCurrentLocation(); + Path targetPath = new Path(targetLocation); + if (table.getPartitionColumns().isEmpty() && currentPath.isPresent()) { + // CREATE TABLE AS SELECT unpartitioned table + if (targetPath.equals(currentPath.get())) { + // Target path and current path are the same. Therefore, directory move is not needed. + } + else { + renameDirectory( + context, + hdfsEnvironment, + currentPath.get(), + targetPath, + () -> cleanUpTasksForAbort.add(new DirectoryCleanUpTask(context, targetPath, true))); + } + } + else { + // CREATE TABLE AS SELECT partitioned table, or + // CREATE TABLE partitioned/unpartitioned table (without data) + if (pathExists(context, hdfsEnvironment, targetPath)) { + if (currentPath.isPresent() && currentPath.get().equals(targetPath)) { + // It is okay to skip directory creation when currentPath is equal to targetPath + // because the directory may have been created when creating partition directories. 
+ // However, it is important to note that the two being equal does not guarantee + // a directory had been created. + } + else { + throw new PrestoException( + HiveErrorCode.HIVE_PATH_ALREADY_EXISTS, + format("Unable to create directory %s: target directory already exists", targetPath)); + } + } + else { + cleanUpTasksForAbort.add(new DirectoryCleanUpTask(context, targetPath, true)); + createDirectory(context, hdfsEnvironment, targetPath); + } + } + } + addTableOperations.add(new CreateTableOperation(tableAndMore.getIdentity(), table, tableAndMore.getPrincipalPrivileges(), tableAndMore.isIgnoreExisting())); + if (!isPrestoView(table) && tableAndMore.isUpdateStats()) { + updateStatisticsOperations.add(new UpdateStatisticsOperation( + tableAndMore.getIdentity(), + table.getSchemaTableName(), + Optional.empty(), + tableAndMore.getStatisticsUpdate(), + false)); + } + } + + private void prepareInsertExistingTable(HdfsContext context, TableAndMore tableAndMore) + { + deleteOnly = false; + + Table table = tableAndMore.getTable(); + Path targetPath = new Path(table.getStorage().getLocation()); + Path currentPath = tableAndMore.getCurrentLocation().get(); + if (!targetPath.equals(currentPath)) { + asyncRename(hdfsEnvironment, renameExecutor, fileRenameCancelled, fileRenameFutures, context, currentPath, targetPath, tableAndMore.getFileNames().get(), + cleanUpTasksForAbort, isVacuumIncluded); + } + if (tableAndMore.isUpdateStats()) { + updateStatisticsOperations.add(new UpdateStatisticsOperation( + tableAndMore.getIdentity(), + table.getSchemaTableName(), + Optional.empty(), + tableAndMore.getStatisticsUpdate(), + true)); + } + } + + private void prepareDropPartition(HiveIdentity identity, SchemaTableName schemaTableName, List partitionValues) + { + metastoreDeleteOperations.add(new IrreversibleMetastoreOperation( + format("drop partition %s.%s %s", schemaTableName.getSchemaName(), schemaTableName.getTableName(), partitionValues), + () -> delegate.dropPartition(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName(), partitionValues, true))); + } + + private void prepareAlterPartition(Table table, HdfsContext hdfsContext, HiveIdentity identity, PartitionAndMore partitionAndMore) + { + deleteOnly = false; + + Partition partition = partitionAndMore.getPartition(); + String targetLocation = partition.getStorage().getLocation(); + Optional oldPartition = delegate.getPartition(identity, partition.getDatabaseName(), partition.getTableName(), partition.getValues()); + if (!oldPartition.isPresent()) { + throw new PrestoException( + TRANSACTION_CONFLICT, + format("The partition that this transaction modified was deleted in another transaction. %s %s", partition.getTableName(), partition.getValues())); + } + String partitionName = getPartitionName(table, partition.getValues()); + PartitionStatistics oldPartitionStatistics = getExistingPartitionStatistics(identity, partition, partitionName); + String oldPartitionLocation = oldPartition.get().getStorage().getLocation(); + Path oldPartitionPath = new Path(oldPartitionLocation); + + // Location of the old partition and the new partition can be different because we allow arbitrary directories through LocationService. 
+ // If the location of the old partition is the same as the location of the new partition: + // * Rename the old data directory to a temporary path with a special suffix + // * Remember we will need to delete that directory at the end if transaction successfully commits + // * Remember we will need to undo the rename if transaction aborts + // Otherwise, + // * Remember we will need to delete the location of the old partition at the end if transaction successfully commits + if (targetLocation.equals(oldPartitionLocation)) { + String queryId = hdfsContext.getQueryId().orElseThrow(() -> new IllegalArgumentException("query ID not present")); + Path oldPartitionStagingPath = new Path(oldPartitionPath.getParent(), "_temp_" + oldPartitionPath.getName() + "_" + queryId); + renameDirectory( + hdfsContext, + hdfsEnvironment, + oldPartitionPath, + oldPartitionStagingPath, + () -> renameTasksForAbort.add(new DirectoryRenameTask(hdfsContext, oldPartitionStagingPath, oldPartitionPath))); + if (!skipDeletionForAlter) { + deletionTasksForFinish.add(new DirectoryDeletionTask(hdfsContext, oldPartitionStagingPath)); + } + } + else { + if (!skipDeletionForAlter) { + deletionTasksForFinish.add(new DirectoryDeletionTask(hdfsContext, oldPartitionPath)); + } + } + + Path currentPath = partitionAndMore.getCurrentLocation(); + Path targetPath = new Path(targetLocation); + if (!targetPath.equals(currentPath)) { + renameDirectory( + hdfsContext, + hdfsEnvironment, + currentPath, + targetPath, + () -> cleanUpTasksForAbort.add(new DirectoryCleanUpTask(hdfsContext, targetPath, true))); + } + // Partition alter must happen regardless of whether original and current location is the same + // because metadata might change: e.g. storage format, column types, etc + alterPartitionOperations.add(new AlterPartitionOperation( + partitionAndMore.getIdentity(), + new PartitionWithStatistics(partition, partitionName, partitionAndMore.getStatisticsUpdate()), + new PartitionWithStatistics(oldPartition.get(), partitionName, oldPartitionStatistics))); + } + + private PartitionStatistics getExistingPartitionStatistics(HiveIdentity identity, Partition partition, String partitionName) + { + try { + PartitionStatistics statistics = closure.getPartitionStatistics(identity, partition.getDatabaseName(), partition.getTableName(), ImmutableSet.of(partitionName)) + .get(partitionName); + if (statistics == null) { + throw new PrestoException( + TRANSACTION_CONFLICT, + format("The partition that this transaction modified was deleted in another transaction. %s %s", partition.getTableName(), partition.getValues())); + } + return statistics; + } + catch (PrestoException e) { + if (e.getErrorCode().equals(HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode())) { + log.warn( + e, + "Corrupted statistics found when altering partition. Table: %s.%s. 
Partition: %s", + partition.getDatabaseName(), + partition.getTableName(), + partition.getValues()); + return PartitionStatistics.empty(); + } + throw e; + } + } + + private void prepareAddPartition(Table table, HdfsContext hdfsContext, HiveIdentity identity, PartitionAndMore partitionAndMore) + { + deleteOnly = false; + + Partition partition = partitionAndMore.getPartition(); + String targetLocation = partition.getStorage().getLocation(); + Path currentPath = partitionAndMore.getCurrentLocation(); + Path targetPath = new Path(targetLocation); + + PartitionAdder partitionAdder = partitionAdders.computeIfAbsent( + partition.getSchemaTableName(), + ignored -> new PartitionAdder(partitionAndMore.getIdentity(), partition.getDatabaseName(), partition.getTableName(), delegate, + partitionCommitBatchSize, updateStatisticsOperations)); + + if (pathExists(hdfsContext, hdfsEnvironment, currentPath)) { + if (!targetPath.equals(currentPath)) { + renameNewPartitionDirectory( + hdfsContext, + hdfsEnvironment, + currentPath, + targetPath, + cleanUpTasksForAbort); + } + } + else { + cleanUpTasksForAbort.add(new DirectoryCleanUpTask(hdfsContext, targetPath, true)); + createDirectory(hdfsContext, hdfsEnvironment, targetPath); + } + String partitionName = getPartitionName(table, partition.getValues()); + partitionAdder.addPartition(new PartitionWithStatistics(partition, partitionName, partitionAndMore.getStatisticsUpdate(), + partitionAndMore.isUpdateStats())); + } + + private void prepareInsertExistingPartition(Table table, HdfsContext hdfsContext, HiveIdentity identity, PartitionAndMore partitionAndMore) + { + deleteOnly = false; + + Partition partition = partitionAndMore.getPartition(); + Path targetPath = new Path(partition.getStorage().getLocation()); + Path currentPath = partitionAndMore.getCurrentLocation(); + if (!targetPath.equals(currentPath)) { + asyncRename(hdfsEnvironment, renameExecutor, fileRenameCancelled, fileRenameFutures, hdfsContext, currentPath, targetPath, partitionAndMore.getFileNames(), + cleanUpTasksForAbort, isVacuumIncluded); + } + if (partitionAndMore.isUpdateStats()) { + updateStatisticsOperations.add(new UpdateStatisticsOperation( + partitionAndMore.getIdentity(), + partition.getSchemaTableName(), + Optional.of(getPartitionName(table, partition.getValues())), + partitionAndMore.getStatisticsUpdate(), + true)); + } + } + + private void executeCleanupTasksForAbort(Collection declaredIntentionsToWrite) + { + Set queryIds = declaredIntentionsToWrite.stream() + .map(DeclaredIntentionToWrite::getQueryId) + .collect(toImmutableSet()); + for (DirectoryCleanUpTask cleanUpTask : cleanUpTasksForAbort) { + recursiveDeleteFilesAndLog(cleanUpTask.getContext(), cleanUpTask.getPath(), queryIds, cleanUpTask.isDeleteEmptyDirectory(), "temporary directory commit abort"); + } + } + + private void executeDeletionTasksForFinish() + { + for (DirectoryDeletionTask deletionTask : deletionTasksForFinish) { + if (!deleteRecursivelyIfExists(deletionTask.getContext(), hdfsEnvironment, deletionTask.getPath())) { + logCleanupFailure("Error deleting directory %s", deletionTask.getPath().toString()); + } + } + } + + private void executeRenameTasksForAbort() + { + for (DirectoryRenameTask directoryRenameTask : renameTasksForAbort) { + try { + // Ignore the task if the source directory doesn't exist. + // This is probably because the original rename that we are trying to undo here never succeeded. 
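+ // Putting the original directory back here is what allows the corresponding metastore ALTER operations to be undone afterwards (see undoAlterTableOperations / undoAlterPartitionOperations).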
+ if (pathExists(directoryRenameTask.getContext(), hdfsEnvironment, directoryRenameTask.getRenameFrom())) { + renameDirectory(directoryRenameTask.getContext(), hdfsEnvironment, directoryRenameTask.getRenameFrom(), directoryRenameTask.getRenameTo(), () -> {}); + } + } + catch (Throwable throwable) { + logCleanupFailure(throwable, "failed to undo rename of partition directory: %s to %s", directoryRenameTask.getRenameFrom(), directoryRenameTask.getRenameTo()); + } + } + } + + private void deleteEmptyStagingDirectories(List declaredIntentionsToWrite) + { + for (DeclaredIntentionToWrite declaredIntentionToWrite : declaredIntentionsToWrite) { + if (declaredIntentionToWrite.getMode() != WriteMode.STAGE_AND_MOVE_TO_TARGET_DIRECTORY) { + continue; + } + Path path = declaredIntentionToWrite.getRootPath(); + recursiveDeleteFilesAndLog(declaredIntentionToWrite.getHdfsContext(), path, ImmutableSet.of(), true, "staging directory cleanup"); + } + } + + private void waitForAsyncRenames() + { + for (CompletableFuture fileRenameFuture : fileRenameFutures) { + MoreFutures.getFutureValue(fileRenameFuture, PrestoException.class); + } + } + + private void waitForAsyncRenamesSuppressThrowables() + { + for (CompletableFuture future : fileRenameFutures) { + try { + future.get(); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + catch (Throwable t) { + // ignore + } + } + } + + private void cancelUnstartedAsyncRenames() + { + fileRenameCancelled.set(true); + } + + private void executeAddTableOperations() + { + for (CreateTableOperation addTableOperation : addTableOperations) { + addTableOperation.run(delegate); + } + } + + private void executeAlterTableOperations() + { + for (AlterTableOperation alterTableOperation : alterTableOperations) { + alterTableOperation.run(delegate); + } + } + + private void executeAlterPartitionOperations() + { + for (AlterPartitionOperation alterPartitionOperation : alterPartitionOperations) { + alterPartitionOperation.run(delegate); + } + } + + private void executeAddPartitionOperations() + { + for (PartitionAdder partitionAdder : partitionAdders.values()) { + partitionAdder.execute(hiveMetastoreClientService); + } + } + + private void executeUpdateStatisticsOperations() + { + List partitionUpdateStatisticsOperations = new ArrayList<>(); + HiveIdentity identity = null; + for (UpdateStatisticsOperation operation : updateStatisticsOperations) { + if (operation.partitionName.isPresent()) { + partitionUpdateStatisticsOperations.add(operation); + if (identity == null) { + identity = operation.identity; + } + } + else { + operation.run(delegate); + } + } + if (partitionUpdateStatisticsOperations.size() > 0) { + SchemaTableName schemaTableName = partitionUpdateStatisticsOperations.get(0).tableName; + updatePartitionsStatistics(identity, delegate, schemaTableName, partitionUpdateStatisticsOperations); + } + } + + private void updatePartitionsStatistics(HiveIdentity identity, HiveMetastore metastore, SchemaTableName schemaTableName, List partitionUpdateStatisticsOperations) + { + List partitionNames = new ArrayList<>(); + Map> partNamesUpdateFunctionMap = new HashMap<>(); + for (UpdateStatisticsOperation operation : partitionUpdateStatisticsOperations) { + partNamesUpdateFunctionMap.put(operation.partitionName.get(), operation::updateStatistics); + partitionNames.add(operation.partitionName.get()); + } + if (partitionNames.size() == partNamesUpdateFunctionMap.size() && partitionUpdateStatisticsOperations.size() > 0) { + 
metastore.updatePartitionsStatistics(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName(), partNamesUpdateFunctionMap); + } + } + + private void undoAddPartitionOperations() + { + for (PartitionAdder partitionAdder : partitionAdders.values()) { + List> partitionsFailedToRollback = partitionAdder.rollback(); + if (!partitionsFailedToRollback.isEmpty()) { + logCleanupFailure("Failed to rollback: add_partition for partitions %s.%s %s", + partitionAdder.getSchemaName(), + partitionAdder.getTableName(), + partitionsFailedToRollback.stream()); + } + } + } + + private void undoAddTableOperations() + { + for (CreateTableOperation addTableOperation : addTableOperations) { + try { + addTableOperation.undo(delegate); + } + catch (Throwable throwable) { + logCleanupFailure(throwable, "failed to rollback: %s", addTableOperation.getDescription()); + } + } + } + + private void undoAlterTableOperations() + { + for (AlterTableOperation alterTableOperation : alterTableOperations) { + try { + alterTableOperation.undo(delegate); + } + catch (Throwable throwable) { + logCleanupFailure(throwable, "failed to rollback: %s", alterTableOperation.getDescription()); + } + } + } + + private void undoAlterPartitionOperations() + { + for (AlterPartitionOperation alterPartitionOperation : alterPartitionOperations) { + try { + alterPartitionOperation.undo(delegate); + } + catch (Throwable throwable) { + logCleanupFailure(throwable, "failed to rollback: %s", alterPartitionOperation.getDescription()); + } + } + } + + private void undoUpdateStatisticsOperations() + { + for (UpdateStatisticsOperation operation : updateStatisticsOperations) { + try { + operation.undo(delegate); + } + catch (Throwable throwable) { + logCleanupFailure(throwable, "failed to rollback: %s", operation.getDescription()); + } + } + } + + private void executeIrreversibleMetastoreOperations() + { + List failedIrreversibleOperationDescriptions = new ArrayList<>(); + List suppressedExceptions = new ArrayList<>(); + boolean anySucceeded = false; + for (IrreversibleMetastoreOperation irreversibleMetastoreOperation : metastoreDeleteOperations) { + try { + irreversibleMetastoreOperation.run(); + anySucceeded = true; + } + catch (Throwable t) { + failedIrreversibleOperationDescriptions.add(irreversibleMetastoreOperation.getDescription()); + // A limit is needed to avoid having a huge exception object. 5 was chosen arbitrarily. + if (suppressedExceptions.size() < 5) { + suppressedExceptions.add(t); + } + } + } + if (!suppressedExceptions.isEmpty()) { + StringBuilder message = new StringBuilder(); + if (deleteOnly && !anySucceeded) { + message.append("The following metastore delete operations failed: "); + } + else { + message.append("The transaction didn't commit cleanly. 
All operations other than the following delete operations were completed: "); + } + Joiner.on("; ").appendTo(message, failedIrreversibleOperationDescriptions); + + PrestoException prestoException = new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, message.toString()); + suppressedExceptions.forEach(prestoException::addSuppressed); + throw prestoException; + } + } + } + + @GuardedBy("this") + private void rollbackShared() + { + checkHoldsLock(); + + //Abort transaction, if any + abortTransaction(); + + for (DeclaredIntentionToWrite declaredIntentionToWrite : declaredIntentionsToWrite) { + switch (declaredIntentionToWrite.getMode()) { + case STAGE_AND_MOVE_TO_TARGET_DIRECTORY: + case DIRECT_TO_TARGET_NEW_DIRECTORY: { + // For STAGE_AND_MOVE_TO_TARGET_DIRECTORY, there is no need to cleanup the target directory as + // it will only be written to during the commit call and the commit call cleans up after failures. + if ((declaredIntentionToWrite.getMode() == DIRECT_TO_TARGET_NEW_DIRECTORY) && skipTargetCleanupOnRollback) { + break; + } + + Path rootPath = declaredIntentionToWrite.getRootPath(); + + // In the case of DIRECT_TO_TARGET_NEW_DIRECTORY, if the directory is not guaranteed to be unique + // for the query, it is possible that another query or compute engine may see the directory, wrote + // data to it, and exported it through metastore. Therefore it may be argued that cleanup of staging + // directories must be carried out conservatively. To be safe, we only delete files that start or + // end with the query IDs in this transaction. + recursiveDeleteFilesAndLog( + declaredIntentionToWrite.getHdfsContext(), + rootPath, + ImmutableSet.of(declaredIntentionToWrite.getQueryId()), + true, + format("staging/target_new directory rollback for table %s", declaredIntentionToWrite.getSchemaTableName())); + break; + } + case DIRECT_TO_TARGET_EXISTING_DIRECTORY: { + Set pathsToClean = new HashSet<>(); + + // Check the base directory of the declared intention + // * existing partition may also be in this directory + // * this is where new partitions are created + Path baseDirectory = declaredIntentionToWrite.getRootPath(); + pathsToClean.add(baseDirectory); + + HiveIdentity identity = declaredIntentionToWrite.getIdentity(); + SchemaTableName schemaTableName = declaredIntentionToWrite.getSchemaTableName(); + Optional
table = delegate.getTable(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName()); + if (table.isPresent()) { + // check every existing partition that is outside for the base directory + if (!table.get().getPartitionColumns().isEmpty()) { + List partitionNames = delegate.getPartitionNames(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName()) + .orElse(ImmutableList.of()); + for (List partitionNameBatch : Iterables.partition(partitionNames, 10)) { + Collection> partitions = delegate.getPartitionsByNames(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName(), partitionNameBatch).values(); + partitions.stream() + .filter(Optional::isPresent) + .map(Optional::get) + .map(partition -> partition.getStorage().getLocation()) + .map(Path::new) + .filter(path -> !isSameOrParent(baseDirectory, path)) + .forEach(pathsToClean::add); + } + } + } + else { + logCleanupFailure( + "Error rolling back write to table %s.%s. Data directory may contain temporary data. Table was dropped in another transaction.", + schemaTableName.getSchemaName(), + schemaTableName.getTableName()); + } + + // delete any file that starts or ends with the query ID + for (Path path : pathsToClean) { + // TODO: It is a known deficiency that some empty directory does not get cleaned up in S3. + // We can not delete any of the directories here since we do not know who created them. + recursiveDeleteFilesAndLog( + declaredIntentionToWrite.getHdfsContext(), + path, + ImmutableSet.of(declaredIntentionToWrite.getQueryId()), + false, + format("target_existing directory rollback for table %s", schemaTableName)); + } + + break; + } + default: + throw new UnsupportedOperationException("Unknown write mode"); + } + } + } + + @VisibleForTesting + public synchronized void testOnlyCheckIsReadOnly() + { + if (state != State.EMPTY) { + throw new AssertionError("Test did not commit or rollback"); + } + } + + @VisibleForTesting + public void testOnlyThrowOnCleanupFailures() + { + throwOnCleanupFailure = true; + } + + @GuardedBy("this") + private void checkReadable() + { + checkHoldsLock(); + + switch (state) { + case EMPTY: + case SHARED_OPERATION_BUFFERED: + return; + case EXCLUSIVE_OPERATION_BUFFERED: + throw new PrestoException(NOT_SUPPORTED, "Unsupported combination of operations in a single transaction"); + case FINISHED: + throw new IllegalStateException("Tried to access metastore after transaction has been committed/aborted"); + } + } + + @GuardedBy("this") + private void setShared() + { + checkHoldsLock(); + + checkReadable(); + state = State.SHARED_OPERATION_BUFFERED; + } + + @GuardedBy("this") + private void setExclusive(ExclusiveOperation exclusiveOperation) + { + checkHoldsLock(); + + if (state != State.EMPTY) { + throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Unsupported combination of operations in a single transaction"); + } + state = State.EXCLUSIVE_OPERATION_BUFFERED; + bufferedExclusiveOperation = exclusiveOperation; + } + + @GuardedBy("this") + private void checkNoPartitionAction(String databaseName, String tableName) + { + checkHoldsLock(); + + Map, Action> partitionActionsOfTable = partitionActions.get(new SchemaTableName(databaseName, tableName)); + if (partitionActionsOfTable != null && !partitionActionsOfTable.isEmpty()) { + throw new PrestoException(NOT_SUPPORTED, "Cannot make schema changes to a table/view with modified partitions in the same transaction"); + } + } + + private static boolean isSameOrParent(Path parent, Path child) + { + int parentDepth = 
parent.depth(); + int childDepth = child.depth(); + if (parentDepth > childDepth) { + return false; + } + for (int i = childDepth; i > parentDepth; i--) { + child = child.getParent(); + } + return parent.equals(child); + } + + private void logCleanupFailure(String format, Object... args) + { + if (throwOnCleanupFailure) { + throw new RuntimeException(format(format, args)); + } + log.warn(format, args); + } + + private void logCleanupFailure(Throwable t, String format, Object... args) + { + if (throwOnCleanupFailure) { + throw new RuntimeException(format(format, args), t); + } + log.warn(t, format, args); + } + + private static void asyncRename( + HdfsEnvironment hdfsEnvironment, + Executor executor, + AtomicBoolean cancelled, + List> fileRenameFutures, + HdfsContext context, + Path currentPath, + Path targetPath, + List fileNames, + List cleanUpTasksForAbort, + boolean useDirectExecutor) + { + FileSystem fileSystem; + try { + fileSystem = hdfsEnvironment.getFileSystem(context, currentPath); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, format("Error moving data files to final location. Error listing directory %s", currentPath), e); + } + + //Remove duplicates + fileNames = fileNames.stream().distinct().collect(Collectors.toList()); + + //In case of concurrent vacuums on same partitioned table, + // different partitions must be renamed in same sequence to avoid conflicts. So rename synchronously + Executor renameExecutor = (useDirectExecutor) ? MoreExecutors.directExecutor() : executor; + for (String fileName : fileNames) { + Path source = new Path(currentPath, fileName); + Path target = new Path(targetPath, fileName); + fileRenameFutures.add(CompletableFuture.runAsync(() -> { + if (cancelled.get()) { + return; + } + try { + if (fileSystem.exists(target)) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, format("Error moving data files from %s to final location %s", source, target)); + } + if (!fileSystem.exists(target.getParent())) { + fileSystem.mkdirs(target.getParent()); + } + if (!fileSystem.rename(source, target)) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, format("Error moving data files from %s to final location %s", source, target)); + } + cleanUpTasksForAbort.add(new DirectoryCleanUpTask(context, target, true)); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, format("Error moving data files from %s to final location %s", source, target), e); + } + }, renameExecutor)); + } + } + + private void recursiveDeleteFilesAndLog(HdfsContext context, Path directory, Set queryIds, boolean deleteEmptyDirectories, String reason) + { + RecursiveDeleteResult recursiveDeleteResult = recursiveDeleteFiles( + hdfsEnvironment, + context, + directory, + queryIds, + deleteEmptyDirectories); + if (!recursiveDeleteResult.getNotDeletedEligibleItems().isEmpty()) { + logCleanupFailure( + "Error deleting directory %s for %s. Some eligible items can not be deleted: %s.", + directory.toString(), + reason, + recursiveDeleteResult.getNotDeletedEligibleItems()); + } + else if (deleteEmptyDirectories && !recursiveDeleteResult.isDirectoryNoLongerExists()) { + logCleanupFailure( + "Error deleting directory %s for %s. 
Can not delete the directory.", + directory.toString(), + reason); + } + } + + private static Optional getPrestoQueryId(Table table) + { + return Optional.ofNullable(table.getParameters().get(PRESTO_QUERY_ID_NAME)); + } + + private static Optional getPrestoQueryId(Partition partition) + { + return Optional.ofNullable(partition.getParameters().get(PRESTO_QUERY_ID_NAME)); + } + + private void checkHoldsLock() + { + // This method serves a similar purpose at runtime as GuardedBy on method serves during static analysis. + // This method should not have significant performance impact. If it does, it may be reasonably to remove this method. + // This intentionally does not use checkState. + if (!Thread.holdsLock(this)) { + throw new IllegalStateException(format("Thread must hold a lock on the %s", getClass().getSimpleName())); + } + } + + /** + * Attempts to remove the file or empty directory. + * + * @return true if the location no longer exists + */ + private static boolean deleteIfExists(FileSystem fileSystem, Path path, boolean recursive) + { + try { + // attempt to delete the path + if (fileSystem.delete(path, recursive)) { + return true; + } + + // delete failed + // check if path still exists + return !fileSystem.exists(path); + } + catch (FileNotFoundException ignored) { + // path was already removed or never existed + log.debug("path may be removed or never existed", ignored); + return true; + } + catch (IOException ignored) { + log.error("Hdfs RPC Call Error", ignored); + } + return false; + } + + // Since parallel inserts to add same partitions may race against each other and fail during rename, + // its better to handle renames at first level children from source path + // and corresponding cleanups to avoid whole partition deletion in case any insert fails. + private static void renameNewPartitionDirectory(HdfsContext context, + HdfsEnvironment hdfsEnvironment, + Path source, + Path target, + List cleanUpTasksForAbort) + { + FileStatus fileStatus = getFileStatus(context, hdfsEnvironment, source); + if (fileStatus.isDirectory()) { + FileStatus[] children = getChildren(context, hdfsEnvironment, source); + for (FileStatus child : children) { + Path subTarget = new Path(target, child.getPath().getName()); + renameDirectory(context, hdfsEnvironment, child.getPath(), + subTarget, + () -> cleanUpTasksForAbort.add(new DirectoryCleanUpTask(context, subTarget, true))); + } + } + else { + renameDirectory(context, hdfsEnvironment, source, target, + () -> cleanUpTasksForAbort.add(new DirectoryCleanUpTask(context, target, true))); + } + } + + private static void renameDirectory(HdfsEnvironment.HdfsContext context, HdfsEnvironment hdfsEnvironment, Path source, Path target, Runnable runWhenRenameSuccess) + { + if (pathExists(context, hdfsEnvironment, target)) { + throw new PrestoException(HIVE_PATH_ALREADY_EXISTS, + format("Unable to rename from %s to %s: target directory already exists", source, target)); + } + + if (!pathExists(context, hdfsEnvironment, target.getParent())) { + createDirectory(context, hdfsEnvironment, target.getParent()); + } + + try { + if (!hdfsEnvironment.getFileSystem(context, source).rename(source, target)) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, format("Failed to rename %s to %s: rename returned false", source, target)); + } + runWhenRenameSuccess.run(); + } + catch (IOException e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, format("Failed to rename %s to %s", source, target), e); + } + } + + /** + * Attempts to remove the file or empty directory. 
+ * + * @return true if the location no longer exists + */ + private static boolean deleteRecursivelyIfExists(HdfsEnvironment.HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path) + { + FileSystem fileSystem; + try { + fileSystem = hdfsEnvironment.getFileSystem(context, path); + } + catch (IOException ignored) { + log.error("Hdfs RPC Call Error", ignored); + return false; + } + + return deleteIfExists(fileSystem, path, true); + } + + private static RecursiveDeleteResult doRecursiveDeleteFiles(FileSystem fileSystem, Path directory, Set queryIds, boolean deleteEmptyDirectories) + { + // don't delete hidden presto directories + if (directory.getName().startsWith(".presto")) { + return new RecursiveDeleteResult(false, ImmutableList.of()); + } + + if (directory.getName().startsWith(".staging")) { + if (deleteIfExists(fileSystem, directory, true)) { + return new RecursiveDeleteResult(true, ImmutableList.of()); + } + } + + FileStatus[] allFiles; + try { + allFiles = fileSystem.listStatus(directory); + } + catch (IOException e) { + ImmutableList.Builder notDeletedItems = ImmutableList.builder(); + notDeletedItems.add(directory.toString() + "/**"); + return new RecursiveDeleteResult(false, notDeletedItems.build()); + } + + boolean allDescendentsDeleted = true; + ImmutableList.Builder notDeletedEligibleItems = ImmutableList.builder(); + for (FileStatus fileStatus : allFiles) { + if (fileStatus.isFile()) { + Path filePath = fileStatus.getPath(); + String fileName = filePath.getName(); + boolean eligible = false; + // never delete presto dot files + if (!fileName.startsWith(".presto") && !fileName.startsWith("_orc")) { + eligible = queryIds.stream().anyMatch(id -> fileName.startsWith(id) || fileName.endsWith(id) || fileName.startsWith("bucket_")); + } + if (eligible) { + if (!deleteIfExists(fileSystem, filePath, false)) { + allDescendentsDeleted = false; + notDeletedEligibleItems.add(filePath.toString()); + } + } + else { + allDescendentsDeleted = false; + } + } + else if (fileStatus.isDirectory()) { + RecursiveDeleteResult subResult = doRecursiveDeleteFiles(fileSystem, fileStatus.getPath(), queryIds, deleteEmptyDirectories); + if (!subResult.isDirectoryNoLongerExists()) { + allDescendentsDeleted = false; + } + if (!subResult.getNotDeletedEligibleItems().isEmpty()) { + notDeletedEligibleItems.addAll(subResult.getNotDeletedEligibleItems()); + } + } + else { + allDescendentsDeleted = false; + notDeletedEligibleItems.add(fileStatus.getPath().toString()); + } + } + if (allDescendentsDeleted && deleteEmptyDirectories) { + verify(notDeletedEligibleItems.build().isEmpty()); + if (!deleteIfExists(fileSystem, directory, false)) { + return new RecursiveDeleteResult(false, ImmutableList.of(directory.toString() + "/")); + } + return new RecursiveDeleteResult(true, ImmutableList.of()); + } + return new RecursiveDeleteResult(false, notDeletedEligibleItems.build()); + } + + /** + * Attempt to recursively remove eligible files and/or directories in {@code directory}. + *

+ * When {@code queryIds} is empty, all files (but not necessarily directories) are
+ * ineligible for deletion. To delete all files, include an empty string in {@code queryIds}.
+ *
+ * When {@code deleteEmptyDirectories} is true, any empty directory (including directories that
+ * were originally empty, and directories that become empty after files prefixed or suffixed with
+ * {@code queryIds} are deleted) will be eligible.
+ *
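+ * As an illustrative sketch only ({@code hdfsEnvironment}, {@code context}, {@code partitionPath}
+ * and {@code queryId} are assumed to be in scope), deleting just the files written by one query
+ * while leaving directories in place could look like:
+ * <pre>{@code
+ * RecursiveDeleteResult result = recursiveDeleteFiles(hdfsEnvironment, context,
+ *         partitionPath, ImmutableSet.of(queryId), false);
+ * }</pre>
+ *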
+ * This method will not delete anything that's neither a directory nor a file. + * + * @param queryIds prefix or suffix of files that should be deleted + * @param deleteEmptyDirectories whether empty directories should be deleted + */ + private static RecursiveDeleteResult recursiveDeleteFiles(HdfsEnvironment hdfsEnvironment, HdfsEnvironment.HdfsContext context, Path directory, Set queryIds, boolean deleteEmptyDirectories) + { + FileSystem fileSystem; + try { + fileSystem = hdfsEnvironment.getFileSystem(context, directory); + + if (!fileSystem.exists(directory)) { + return new RecursiveDeleteResult(true, ImmutableList.of()); + } + } + catch (IOException e) { + ImmutableList.Builder notDeletedItems = ImmutableList.builder(); + notDeletedItems.add(directory.toString() + "/**"); + return new RecursiveDeleteResult(false, notDeletedItems.build()); + } + + return doRecursiveDeleteFiles(fileSystem, directory, queryIds, deleteEmptyDirectories); + } + + public static class RecursiveDeleteResult + { + private final boolean directoryNoLongerExists; + private final List notDeletedEligibleItems; + + public RecursiveDeleteResult(boolean directoryNoLongerExists, List notDeletedEligibleItems) + { + this.directoryNoLongerExists = directoryNoLongerExists; + this.notDeletedEligibleItems = notDeletedEligibleItems; + } + + public boolean isDirectoryNoLongerExists() + { + return directoryNoLongerExists; + } + + public List getNotDeletedEligibleItems() + { + return notDeletedEligibleItems; + } + } + + private static RecursiveDeleteResult moveToTrash(HdfsEnvironment hdfsEnvironment, HdfsEnvironment.HdfsContext context, Path directory) + { + FileSystem fileSystem; + try { + fileSystem = hdfsEnvironment.getFileSystem(context, directory); + if (!fileSystem.exists(directory)) { + return new RecursiveDeleteResult(true, ImmutableList.of()); + } + } + catch (IOException ioe) { + log.error("Hdfs RPC Call Error", ioe); + return new RecursiveDeleteResult(false, ImmutableList.of(directory.toString() + "/**")); + } + + return doMoveToTrash(fileSystem, directory, hdfsEnvironment, context); + } + + private static RecursiveDeleteResult doMoveToTrash(FileSystem fileSystem, Path directory, HdfsEnvironment hdfsEnvironment, HdfsEnvironment.HdfsContext context) + { + // don't delete hidden presto directories + if (directory.getName().startsWith(".presto")) { + return new RecursiveDeleteResult(false, ImmutableList.of()); + } + + FileStatus[] allFiles; + try { + allFiles = fileSystem.listStatus(directory); + } + catch (IOException ioe) { + return new RecursiveDeleteResult(false, ImmutableList.of(directory.toString() + "/**")); + } + + boolean allDeleted = true; + ImmutableList.Builder notDeletedItems = ImmutableList.builder(); + for (FileStatus fileStatus : allFiles) { + if (fileStatus.isFile()) { + Path filePath = fileStatus.getPath(); + String fileName = filePath.getName(); + // never delete presto dot files + if (!fileName.startsWith(".presto") && !fileName.startsWith("_orc")) { + boolean isSuccess = moveFileToTrash(fileSystem, filePath, hdfsEnvironment, context); + if (!isSuccess) { + allDeleted = false; + notDeletedItems.add(filePath.toString()); + } + } + } + else if (fileStatus.isDirectory()) { + RecursiveDeleteResult subResult = doMoveToTrash(fileSystem, fileStatus.getPath(), hdfsEnvironment, context); + if (!subResult.isDirectoryNoLongerExists()) { + allDeleted = false; + } + + if (!subResult.getNotDeletedEligibleItems().isEmpty()) { + notDeletedItems.addAll(subResult.getNotDeletedEligibleItems()); + } + } + else { + 
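+ // neither a regular file nor a directory (for example a symlink): leave it in place
+ // and report it as not deleted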
allDeleted = false; + notDeletedItems.add(fileStatus.getPath().toString()); + } + } + + if (allDeleted) { + return new RecursiveDeleteResult(true, ImmutableList.of()); + } + + return new RecursiveDeleteResult(false, notDeletedItems.build()); + } + + private static boolean moveFileToTrash(FileSystem fileSystem, Path path, HdfsEnvironment hdfsEnvironment, HdfsEnvironment.HdfsContext context) + { + try { + Trash trash = new Trash(fileSystem, hdfsEnvironment.getConfiguration(context, path)); + boolean result = trash.moveToTrash(path); + if (result) { + return true; + } + } + catch (IOException ignored) { + log.warn(format("move file %s to trash failed and force to delete it.", path.toString()), ignored); + } + + return deleteIfExists(fileSystem, path, false); + } + + private enum State + { + EMPTY, + SHARED_OPERATION_BUFFERED, + EXCLUSIVE_OPERATION_BUFFERED, + FINISHED, + } + + private enum ActionType + { + DROP, + ADD, + ALTER, + INSERT_EXISTING + } + + private enum TableSource + { + CREATED_IN_THIS_TRANSACTION, + PRE_EXISTING_TABLE, + // RECREATED_IN_THIS_TRANSACTION is a possible case, but it is not supported with the current implementation + } + + public static class Action + { + private final ActionType type; + private final T data; + private final HdfsContext hdfsContext; + private final HiveIdentity identity; + + public Action(ActionType type, T data, HdfsContext hdfsContext, HiveIdentity identity) + { + this.type = requireNonNull(type, "type is null"); + if (type == ActionType.DROP) { + checkArgument(data == null, "data is not null"); + } + else { + requireNonNull(data, "data is null"); + } + this.data = data; + this.hdfsContext = requireNonNull(hdfsContext, "hdfsContext is null"); + this.identity = requireNonNull(identity, "identity is null"); + } + + public ActionType getType() + { + return type; + } + + public T getData() + { + checkState(type != ActionType.DROP); + return data; + } + + public HdfsContext getHdfsContext() + { + return hdfsContext; + } + + public HiveIdentity getIdentity() + { + return identity; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", type) + .add("data", data) + .toString(); + } + } + + private static class TableAndMore + { + private final Table table; + private final HiveIdentity identity; + private final Optional principalPrivileges; + private final Optional currentLocation; // unpartitioned table only + private final Optional> fileNames; + private final boolean ignoreExisting; + private final PartitionStatistics statistics; + private final PartitionStatistics statisticsUpdate; + private final boolean updateStats; + + public TableAndMore( + Table table, + HiveIdentity identity, + Optional principalPrivileges, + Optional currentLocation, + Optional> fileNames, + boolean ignoreExisting, + PartitionStatistics statistics, + PartitionStatistics statisticsUpdate, + boolean updateStats) + { + this.table = requireNonNull(table, "table is null"); + this.identity = requireNonNull(identity, "identity is null"); + this.principalPrivileges = requireNonNull(principalPrivileges, "principalPrivileges is null"); + this.currentLocation = requireNonNull(currentLocation, "currentLocation is null"); + this.fileNames = requireNonNull(fileNames, "fileNames is null"); + this.ignoreExisting = ignoreExisting; + this.statistics = requireNonNull(statistics, "statistics is null"); + this.statisticsUpdate = requireNonNull(statisticsUpdate, "statisticsUpdate is null"); + this.updateStats = updateStats; + 
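+ // Invariants checked below: a current (write) location may only be supplied for a table
+ // that has a storage location, and explicit file names only together with that location.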
checkArgument(!table.getStorage().getLocation().isEmpty() || !currentLocation.isPresent(), "currentLocation can not be supplied for table without location"); + checkArgument(!fileNames.isPresent() || currentLocation.isPresent(), "fileNames can be supplied only when currentLocation is supplied"); + } + + public boolean isIgnoreExisting() + { + return ignoreExisting; + } + + public Table getTable() + { + return table; + } + + public HiveIdentity getIdentity() + { + return identity; + } + + public PrincipalPrivileges getPrincipalPrivileges() + { + checkState(principalPrivileges.isPresent()); + return principalPrivileges.get(); + } + + public Optional getCurrentLocation() + { + return currentLocation; + } + + public Optional> getFileNames() + { + return fileNames; + } + + public PartitionStatistics getStatistics() + { + return statistics; + } + + public PartitionStatistics getStatisticsUpdate() + { + return statisticsUpdate; + } + + public boolean isUpdateStats() + { + return updateStats; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("table", table) + .add("principalPrivileges", principalPrivileges) + .add("currentLocation", currentLocation) + .add("fileNames", fileNames) + .add("ignoreExisting", ignoreExisting) + .add("statistics", statistics) + .add("statisticsUpdate", statisticsUpdate) + .toString(); + } + } + + private static class PartitionAndMore + { + private final HiveIdentity identity; + private final Partition partition; + private final Path currentLocation; + private final Optional> fileNames; + private final PartitionStatistics statistics; + private final PartitionStatistics statisticsUpdate; + private final boolean updateStats; + + public PartitionAndMore(HiveIdentity identity, Partition partition, Path currentLocation, Optional> fileNames, PartitionStatistics statistics, PartitionStatistics statisticsUpdate, boolean updateStats) + { + this.identity = requireNonNull(identity, "identity is null"); + this.partition = requireNonNull(partition, "partition is null"); + this.currentLocation = requireNonNull(currentLocation, "currentLocation is null"); + this.fileNames = requireNonNull(fileNames, "fileNames is null"); + this.statistics = requireNonNull(statistics, "statistics is null"); + this.statisticsUpdate = requireNonNull(statisticsUpdate, "statisticsUpdate is null"); + this.updateStats = updateStats; + } + + public HiveIdentity getIdentity() + { + return identity; + } + + public Partition getPartition() + { + return partition; + } + + public Path getCurrentLocation() + { + return currentLocation; + } + + public List getFileNames() + { + checkState(fileNames.isPresent()); + return fileNames.get(); + } + + public PartitionStatistics getStatistics() + { + return statistics; + } + + public PartitionStatistics getStatisticsUpdate() + { + return statisticsUpdate; + } + + public Partition getAugmentedPartitionForInTransactionRead() + { + // This method augments the location field of the partition to the staging location. + // This way, if the partition is accessed in an ongoing transaction, staged data + // can be found and accessed. 
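+ // Only the copy returned from this method is modified; the partition registered for the
+ // metastore commit keeps its final (target) location.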
+ Partition partition = this.partition; + String currentLocation = this.currentLocation.toString(); + if (!currentLocation.equals(partition.getStorage().getLocation())) { + partition = Partition.builder(partition) + .withStorage(storage -> storage.setLocation(currentLocation)) + .build(); + } + return partition; + } + + public boolean isUpdateStats() + { + return updateStats; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("partition", partition) + .add("currentLocation", currentLocation) + .add("fileNames", fileNames) + .toString(); + } + } + + private static class DeclaredIntentionToWrite + { + private final WriteMode mode; + private final HdfsContext hdfsContext; + private final HiveIdentity identity; + private final String queryId; + private final Path rootPath; + private final SchemaTableName schemaTableName; + + public DeclaredIntentionToWrite(WriteMode mode, HdfsContext hdfsContext, HiveIdentity identity, String queryId, Path stagingPathRoot, SchemaTableName schemaTableName) + { + this.mode = requireNonNull(mode, "mode is null"); + this.hdfsContext = requireNonNull(hdfsContext, "hdfsContext is null"); + this.identity = requireNonNull(identity, "identity is null"); + this.queryId = requireNonNull(queryId, "queryId is null"); + this.rootPath = requireNonNull(stagingPathRoot, "stagingPathRoot is null"); + this.schemaTableName = requireNonNull(schemaTableName, "schemaTableName is null"); + } + + public WriteMode getMode() + { + return mode; + } + + public HdfsContext getHdfsContext() + { + return hdfsContext; + } + + public HiveIdentity getIdentity() + { + return identity; + } + + public String getQueryId() + { + return queryId; + } + + public Path getRootPath() + { + return rootPath; + } + + public SchemaTableName getSchemaTableName() + { + return schemaTableName; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("mode", mode) + .add("hdfsContext", hdfsContext) + .add("identity", identity) + .add("queryId", queryId) + .add("rootPath", rootPath) + .add("schemaTableName", schemaTableName) + .toString(); + } + } + + private static class DirectoryCleanUpTask + { + private final HdfsContext context; + private final Path path; + private final boolean deleteEmptyDirectory; + + public DirectoryCleanUpTask(HdfsContext context, Path path, boolean deleteEmptyDirectory) + { + this.context = context; + this.path = path; + this.deleteEmptyDirectory = deleteEmptyDirectory; + } + + public HdfsContext getContext() + { + return context; + } + + public Path getPath() + { + return path; + } + + public boolean isDeleteEmptyDirectory() + { + return deleteEmptyDirectory; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("context", context) + .add("path", path) + .add("deleteEmptyDirectory", deleteEmptyDirectory) + .toString(); + } + } + + private static class DirectoryDeletionTask + { + private final HdfsContext context; + private final Path path; + + public DirectoryDeletionTask(HdfsContext context, Path path) + { + this.context = context; + this.path = path; + } + + public HdfsContext getContext() + { + return context; + } + + public Path getPath() + { + return path; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("context", context) + .add("path", path) + .toString(); + } + } + + private static class DirectoryRenameTask + { + private final HdfsContext context; + private final Path renameFrom; + private final Path renameTo; + + public 
DirectoryRenameTask(HdfsContext context, Path renameFrom, Path renameTo) + { + this.context = requireNonNull(context, "context is null"); + this.renameFrom = requireNonNull(renameFrom, "renameFrom is null"); + this.renameTo = requireNonNull(renameTo, "renameTo is null"); + } + + public HdfsContext getContext() + { + return context; + } + + public Path getRenameFrom() + { + return renameFrom; + } + + public Path getRenameTo() + { + return renameTo; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("context", context) + .add("renameFrom", renameFrom) + .add("renameTo", renameTo) + .toString(); + } + } + + private static class IrreversibleMetastoreOperation + { + private final String description; + private final Runnable action; + + public IrreversibleMetastoreOperation(String description, Runnable action) + { + this.description = requireNonNull(description, "description is null"); + this.action = requireNonNull(action, "action is null"); + } + + public String getDescription() + { + return description; + } + + public void run() + { + action.run(); + } + } + + private static class CreateTableOperation + { + private final HiveIdentity identity; + private final Table newTable; + private final PrincipalPrivileges privileges; + private boolean tableCreated; + private final boolean ignoreExisting; + private final String queryId; + + public CreateTableOperation(HiveIdentity identity, Table newTable, PrincipalPrivileges privileges, boolean ignoreExisting) + { + this.identity = requireNonNull(identity, "identity is null"); + requireNonNull(newTable, "newTable is null"); + this.newTable = newTable; + this.privileges = requireNonNull(privileges, "privileges is null"); + this.ignoreExisting = ignoreExisting; + this.queryId = getPrestoQueryId(newTable).orElseThrow(() -> new IllegalArgumentException("Query id is not present")); + } + + public String getDescription() + { + return format("add table %s.%s", newTable.getDatabaseName(), newTable.getTableName()); + } + + public void run(HiveMetastore metastore) + { + boolean done = false; + try { + metastore.createTable(identity, newTable, privileges); + done = true; + } + catch (RuntimeException e) { + try { + Optional
<Table>
existingTable = metastore.getTable(identity, newTable.getDatabaseName(), newTable.getTableName()); + if (existingTable.isPresent()) { + Table table = existingTable.get(); + Optional existingTableQueryId = getPrestoQueryId(table); + if (existingTableQueryId.isPresent() && existingTableQueryId.get().equals(queryId)) { + // ignore table if it was already created by the same query during retries + done = true; + } + else { + // If the table definition in the metastore is different than what this tx wants to create + // then there is a conflict (e.g., current tx wants to create T(a: bigint), + // but another tx already created T(a: varchar)). + // This may be a problem if there is an insert after this step. + if (!hasTheSameSchema(newTable, table)) { + e = new PrestoException(TRANSACTION_CONFLICT, format("Table already exists with a different schema: '%s'", newTable.getTableName())); + } + else { + done = ignoreExisting; + } + } + } + } + catch (RuntimeException ignored) { + // When table could not be fetched from metastore, it is not known whether the table was added. + // Deleting the table when aborting commit has the risk of deleting table not added in this transaction. + // Not deleting the table may leave garbage behind. The former is much more dangerous than the latter. + // Therefore, the table is not considered added. + } + + if (!done) { + throw e; + } + } + tableCreated = true; + } + + private boolean hasTheSameSchema(Table newTable, Table existingTable) + { + List newTableColumns = newTable.getDataColumns(); + List existingTableColumns = existingTable.getDataColumns(); + + if (newTableColumns.size() != existingTableColumns.size()) { + return false; + } + + for (Column existingColumn : existingTableColumns) { + if (newTableColumns.stream() + .noneMatch(newColumn -> newColumn.getName().equals(existingColumn.getName()) + && newColumn.getType().equals(existingColumn.getType()))) { + return false; + } + } + return true; + } + + public void undo(HiveMetastore metastore) + { + if (!tableCreated) { + return; + } + metastore.dropTable(identity, newTable.getDatabaseName(), newTable.getTableName(), false); + } + } + + private static class AlterTableOperation + { + private final HiveIdentity identity; + private final Table newTable; + private final Table oldTable; + private final PrincipalPrivileges principalPrivileges; + private boolean undo; + + public AlterTableOperation(HiveIdentity identity, Table newTable, Table oldTable, PrincipalPrivileges principalPrivileges) + { + this.identity = requireNonNull(identity, "identity is null"); + this.newTable = requireNonNull(newTable, "newTable is null"); + this.oldTable = requireNonNull(oldTable, "oldTable is null"); + this.principalPrivileges = requireNonNull(principalPrivileges, "principalPrivileges is null"); + checkArgument(newTable.getDatabaseName().equals(oldTable.getDatabaseName())); + checkArgument(newTable.getTableName().equals(oldTable.getTableName())); + } + + public String getDescription() + { + return format( + "alter table %s.%s", + newTable.getDatabaseName(), + newTable.getTableName()); + } + + public void run(HiveMetastore metastore) + { + undo = true; + metastore.replaceTable(identity, newTable.getDatabaseName(), newTable.getTableName(), newTable, principalPrivileges); + } + + public void undo(HiveMetastore metastore) + { + if (!undo) { + return; + } + + metastore.replaceTable(identity, oldTable.getDatabaseName(), oldTable.getTableName(), oldTable, principalPrivileges); + } + } + + private static class AlterPartitionOperation + { + 
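+ // Replaces a partition's metadata and statistics with new values while keeping the old
+ // PartitionWithStatistics so undo() can restore it if the transaction is rolled back.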
private final HiveIdentity identity; + private final PartitionWithStatistics newPartition; + private final PartitionWithStatistics oldPartition; + private boolean undo; + + public AlterPartitionOperation(HiveIdentity identity, PartitionWithStatistics newPartition, PartitionWithStatistics oldPartition) + { + this.identity = requireNonNull(identity, "identity is null"); + this.newPartition = requireNonNull(newPartition, "newPartition is null"); + this.oldPartition = requireNonNull(oldPartition, "oldPartition is null"); + checkArgument(newPartition.getPartition().getDatabaseName().equals(oldPartition.getPartition().getDatabaseName())); + checkArgument(newPartition.getPartition().getTableName().equals(oldPartition.getPartition().getTableName())); + checkArgument(newPartition.getPartition().getValues().equals(oldPartition.getPartition().getValues())); + } + + public String getDescription() + { + return format( + "alter partition %s.%s %s", + newPartition.getPartition().getDatabaseName(), + newPartition.getPartition().getTableName(), + newPartition.getPartition().getValues()); + } + + public void run(HiveMetastore metastore) + { + undo = true; + metastore.alterPartition(identity, newPartition.getPartition().getDatabaseName(), newPartition.getPartition().getTableName(), newPartition); + } + + public void undo(HiveMetastore metastore) + { + if (!undo) { + return; + } + metastore.alterPartition(identity, oldPartition.getPartition().getDatabaseName(), oldPartition.getPartition().getTableName(), oldPartition); + } + } + + private static class UpdateStatisticsOperation + { + private final HiveIdentity identity; + private final SchemaTableName tableName; + private final Optional partitionName; + private final PartitionStatistics statistics; + private final boolean merge; + + private boolean done; + + public UpdateStatisticsOperation(HiveIdentity identity, SchemaTableName tableName, Optional partitionName, PartitionStatistics statistics, boolean merge) + { + this.identity = requireNonNull(identity, "identity is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.partitionName = requireNonNull(partitionName, "partitionValues is null"); + this.statistics = requireNonNull(statistics, "statistics is null"); + this.merge = merge; + } + + public void run(HiveMetastore metastore) + { + if (partitionName.isPresent()) { + metastore.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), partitionName.get(), this::updateStatistics); + } + else { + metastore.updateTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), this::updateStatistics); + } + done = true; + } + + public void undo(HiveMetastore metastore) + { + if (!done) { + return; + } + if (partitionName.isPresent()) { + metastore.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), partitionName.get(), this::resetStatistics); + } + else { + metastore.updateTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), this::resetStatistics); + } + } + + public String getDescription() + { + if (partitionName.isPresent()) { + return format("replace partition parameters %s %s", tableName, partitionName.get()); + } + return format("replace table parameters %s", tableName); + } + + private PartitionStatistics updateStatistics(PartitionStatistics currentStatistics) + { + if (currentStatistics.getBasicStatistics().getRowCount().orElse(1) == 0) { + return statistics; + } + return merge ? 
merge(currentStatistics, statistics) : statistics; + } + + private PartitionStatistics resetStatistics(PartitionStatistics currentStatistics) + { + return new PartitionStatistics(reduce(currentStatistics.getBasicStatistics(), statistics.getBasicStatistics(), SUBTRACT), ImmutableMap.of()); + } + } + + private static class PartitionAdder + { + private final HiveIdentity identity; + private final String schemaName; + private final String tableName; + private final HiveMetastore metastore; + private final int batchSize; + private final List partitions; + private List> createdPartitionValues = new ArrayList<>(); + private final List updateStatisticsOperations; + + public PartitionAdder(HiveIdentity identity, String schemaName, String tableName, HiveMetastore metastore, int batchSize, + List updateStatisticsOperations) + { + this.identity = identity; + this.schemaName = schemaName; + this.tableName = tableName; + this.metastore = metastore; + this.batchSize = batchSize; + this.partitions = new ArrayList<>(batchSize); + this.updateStatisticsOperations = updateStatisticsOperations; + } + + public String getSchemaName() + { + return schemaName; + } + + public String getTableName() + { + return tableName; + } + + public synchronized void addPartition(PartitionWithStatistics partition) + { + checkArgument(getPrestoQueryId(partition.getPartition()).isPresent()); + partitions.add(partition); + } + + public void execute(ListeningExecutorService hiveMetastoreClientService) + { + List> batchedPartitions = Lists.partition(partitions, batchSize); + List> futures = batchedPartitions.stream() + .map(batch -> hiveMetastoreClientService.submit(() -> addPartitionBatch(batch))) + .collect(Collectors.toList()); + waitForCompletion(futures, "add partition"); + partitions.clear(); + } + + private void addPartitionBatch(List batch) + { + try { + metastore.addPartitions(identity, schemaName, tableName, batch); + for (PartitionWithStatistics partition : batch) { + createdPartitionValues.add(partition.getPartition().getValues()); + } + } + catch (Throwable t) { + // Add partition to the created list conservatively. + // Some metastore implementations are known to violate the "all or none" guarantee for add_partitions call. + boolean batchCompletelyAdded = true; + for (PartitionWithStatistics partition : batch) { + try { + Optional remotePartition = metastore.getPartition(identity, schemaName, tableName, partition.getPartition().getValues()); + if (remotePartition.isPresent()) { + // getPrestoQueryId(partition) is guaranteed to be non-empty. It is asserted in PartitionAdder.addPartition. + if (getPrestoQueryId(remotePartition.get()).equals(getPrestoQueryId(partition.getPartition()))) { + createdPartitionValues.add(partition.getPartition().getValues()); + } + else if (partition.isUpdateStats()) { + //If the remote partition is present and its not created by current query then update the statistics. + updateStatisticsOperations.add(new UpdateStatisticsOperation(identity, partition.getPartition().getSchemaTableName(), + Optional.of(partition.getPartitionName()), + partition.getStatistics(), true)); + } + } + else { + batchCompletelyAdded = false; + } + } + catch (Throwable ignored) { + // When partition could not be fetched from metastore, it is not known whether the partition was added. + // Deleting the partition when aborting commit has the risk of deleting partition not added in this transaction. + // Not deleting the partition may leave garbage behind. The former is much more dangerous than the latter. 
+ // Therefore, the partition is not added to the createdPartitionValues list here. + batchCompletelyAdded = false; + } + } + // If all the partitions were added successfully, the add_partition operation was actually successful. + // For some reason, it threw an exception (communication failure, retry failure after communication failure, etc). + // But we would consider it successful anyways. + if (!batchCompletelyAdded) { + if (t instanceof TableNotFoundException) { + throw new PrestoException(HiveErrorCode.HIVE_TABLE_DROPPED_DURING_QUERY, t); + } + throw t; + } + } + } + + public List> rollback() + { + // drop created partitions + List> partitionsFailedToRollback = new ArrayList<>(); + for (List createdPartitionValue : createdPartitionValues) { + try { + metastore.dropPartition(identity, schemaName, tableName, createdPartitionValue, false); + } + catch (PartitionNotFoundException e) { + // Maybe some one deleted the partition we added. + // Anyways, we are good because the partition is not there anymore. + } + catch (Throwable t) { + partitionsFailedToRollback.add(createdPartitionValue); + } + } + createdPartitionValues = partitionsFailedToRollback; + return partitionsFailedToRollback; + } + } + + private interface ExclusiveOperation + { + void execute(HiveMetastore delegate, HdfsEnvironment hdfsEnvironment); + } + + public ScheduledExecutorService getVacuumExecutorService() + { + return vacuumExecutorService; + } + + public long getVacuumCleanupInterval() + { + return configuredVacuumCleanupInterval.toMillis(); + } + + public ShowLocksResponse showLocks(VacuumTableInfoForCleaner tableInfo) + { + ShowLocksRequest rqst = new ShowLocksRequest(); + + rqst.setDbname(tableInfo.getDbName()); + rqst.setTablename(tableInfo.getTableName()); + if (tableInfo.getPartitionName().length() > 0) { + rqst.setPartname(tableInfo.getPartitionName()); + } + + return delegate.showLocks(rqst); + } + + public void setVacuumTableHandle(HiveTableHandle vacuumTableHandle) + { + this.tableHandle = vacuumTableHandle; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SortingColumn.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SortingColumn.java new file mode 100644 index 00000000..9f92d9b9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/SortingColumn.java @@ -0,0 +1,127 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.SortOrder; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.spi.block.SortOrder.ASC_NULLS_FIRST; +import static io.prestosql.spi.block.SortOrder.DESC_NULLS_LAST; +import static java.util.Objects.requireNonNull; + +@Immutable +public class SortingColumn +{ + public enum Order + { + ASCENDING(ASC_NULLS_FIRST, 1), + DESCENDING(DESC_NULLS_LAST, 0); + + private final SortOrder sortOrder; + private final int hiveOrder; + + Order(SortOrder sortOrder, int hiveOrder) + { + this.sortOrder = requireNonNull(sortOrder, "sortOrder is null"); + this.hiveOrder = hiveOrder; + } + + public SortOrder getSortOrder() + { + return sortOrder; + } + + public int getHiveOrder() + { + return hiveOrder; + } + + public static Order fromMetastoreApiOrder(int value, String tablePartitionName) + { + for (Order order : values()) { + if (value == order.getHiveOrder()) { + return order; + } + } + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table/partition metadata has invalid sorting order: " + tablePartitionName); + } + } + + private final String columnName; + private final Order order; + + @JsonCreator + public SortingColumn( + @JsonProperty("columnName") String columnName, + @JsonProperty("order") Order order) + { + this.columnName = requireNonNull(columnName, "columnName is null"); + this.order = requireNonNull(order, "order is null"); + } + + @JsonProperty + public String getColumnName() + { + return columnName; + } + + @JsonProperty + public Order getOrder() + { + return order; + } + + public static SortingColumn fromMetastoreApiOrder(org.apache.hadoop.hive.metastore.api.Order order, String tablePartitionName) + { + return new SortingColumn(order.getCol(), Order.fromMetastoreApiOrder(order.getOrder(), tablePartitionName)); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("columnName", columnName) + .add("order", order) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + SortingColumn that = (SortingColumn) o; + return Objects.equals(columnName, that.columnName) && + order == that.order; + } + + @Override + public int hashCode() + { + return Objects.hash(columnName, order); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Storage.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Storage.java new file mode 100644 index 00000000..86da5210 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Storage.java @@ -0,0 +1,186 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBucketProperty; + +import javax.annotation.concurrent.Immutable; + +import java.util.Map; +import java.util.Objects; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class Storage +{ + private final StorageFormat storageFormat; + private final String location; + private final Optional bucketProperty; + private final boolean skewed; + private final Map serdeParameters; + + @JsonCreator + public Storage( + @JsonProperty("storageFormat") StorageFormat storageFormat, + @JsonProperty("location") String location, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("skewed") boolean skewed, + @JsonProperty("serdeParameters") Map serdeParameters) + { + this.storageFormat = requireNonNull(storageFormat, "storageFormat is null"); + this.location = requireNonNull(location, "location is null"); + this.bucketProperty = requireNonNull(bucketProperty, "bucketProperty is null"); + this.skewed = skewed; + this.serdeParameters = ImmutableMap.copyOf(requireNonNull(serdeParameters, "serdeParameters is null")); + } + + @JsonProperty + public StorageFormat getStorageFormat() + { + return storageFormat; + } + + @JsonProperty + public String getLocation() + { + return location; + } + + @JsonProperty + public Optional getBucketProperty() + { + return bucketProperty; + } + + @JsonProperty + public boolean isSkewed() + { + return skewed; + } + + @JsonProperty + public Map getSerdeParameters() + { + return serdeParameters; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("skewed", skewed) + .add("storageFormat", storageFormat) + .add("location", location) + .add("bucketProperty", bucketProperty) + .add("serdeParameters", serdeParameters) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Storage storage = (Storage) o; + return skewed == storage.skewed && + Objects.equals(storageFormat, storage.storageFormat) && + Objects.equals(location, storage.location) && + Objects.equals(bucketProperty, storage.bucketProperty) && + Objects.equals(serdeParameters, storage.serdeParameters); + } + + @Override + public int hashCode() + { + return Objects.hash(skewed, storageFormat, location, bucketProperty, serdeParameters); + } + + public static Builder builder() + { + return new Builder(); + } + + public static Builder builder(Storage storage) + { + return new Builder(storage); + } + + public static class Builder + { + private StorageFormat storageFormat; + private String location; + private Optional bucketProperty = Optional.empty(); + private boolean skewed; + private Map serdeParameters = ImmutableMap.of(); + + private Builder() + { + } + + private Builder(Storage 
storage) + { + this.storageFormat = storage.storageFormat; + this.location = storage.location; + this.bucketProperty = storage.bucketProperty; + this.skewed = storage.skewed; + this.serdeParameters = storage.serdeParameters; + } + + public Builder setStorageFormat(StorageFormat storageFormat) + { + this.storageFormat = storageFormat; + return this; + } + + public Builder setLocation(String location) + { + this.location = location; + return this; + } + + public Builder setBucketProperty(Optional bucketProperty) + { + this.bucketProperty = bucketProperty; + return this; + } + + public Builder setSkewed(boolean skewed) + { + this.skewed = skewed; + return this; + } + + public Builder setSerdeParameters(Map serdeParameters) + { + this.serdeParameters = serdeParameters; + return this; + } + + public Storage build() + { + return new Storage(storageFormat, location, bucketProperty, skewed, serdeParameters); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/StorageFormat.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/StorageFormat.java new file mode 100644 index 00000000..c5a16687 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/StorageFormat.java @@ -0,0 +1,139 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.BaseStorageFormat; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.spi.PrestoException; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class StorageFormat +{ + public static final StorageFormat VIEW_STORAGE_FORMAT = StorageFormat.createNullable(null, null, null); + + private final String serDe; + private final String inputFormat; + private final String outputFormat; + + private StorageFormat(String serDe, String inputFormat, String outputFormat) + { + this.serDe = serDe; + this.inputFormat = inputFormat; + this.outputFormat = outputFormat; + } + + public String getSerDe() + { + if (serDe == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "SerDe is not present in StorageFormat"); + } + return serDe; + } + + public String getInputFormat() + { + if (inputFormat == null) { + throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, "InputFormat is not present in StorageFormat"); + } + return inputFormat; + } + + public String getOutputFormat() + { + if (outputFormat == null) { + throw new PrestoException(HiveErrorCode.HIVE_UNSUPPORTED_FORMAT, "OutputFormat is not present in StorageFormat"); + } + return outputFormat; + } + + @JsonProperty("serDe") + public String getSerDeNullable() + { + return serDe; + } + + @JsonProperty("inputFormat") + public String getInputFormatNullable() + { + return inputFormat; + } + + @JsonProperty("outputFormat") + public String getOutputFormatNullable() + { + return outputFormat; + } + + public static StorageFormat fromHiveStorageFormat(BaseStorageFormat hiveStorageFormat) + { + return new StorageFormat(hiveStorageFormat.getSerDe(), hiveStorageFormat.getInputFormat(), hiveStorageFormat.getOutputFormat()); + } + + public static StorageFormat create(String serde, String inputFormat, String outputFormat) + { + return new StorageFormat( + requireNonNull(serde, "serDe is null"), + requireNonNull(inputFormat, "inputFormat is null"), + requireNonNull(outputFormat, "outputFormat is null")); + } + + @JsonCreator + public static StorageFormat createNullable( + @JsonProperty("serDe") String serDe, + @JsonProperty("inputFormat") String inputFormat, + @JsonProperty("outputFormat") String outputFormat) + { + return new StorageFormat(serDe, inputFormat, outputFormat); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + StorageFormat that = (StorageFormat) o; + return Objects.equals(serDe, that.serDe) && + Objects.equals(inputFormat, that.inputFormat) && + Objects.equals(outputFormat, that.outputFormat); + } + + @Override + public int hashCode() + { + return Objects.hash(serDe, inputFormat, outputFormat); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("serDe", serDe) + .add("inputFormat", inputFormat) + .add("outputFormat", outputFormat) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Table.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Table.java new file mode 100644 index 
00000000..5bd4cb81 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/Table.java @@ -0,0 +1,339 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.connector.SchemaTableName; + +import javax.annotation.concurrent.Immutable; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Consumer; +import java.util.stream.Stream; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class Table +{ + private final String databaseName; + private final String tableName; + private final String owner; + private final String tableType; // This is not an enum because some Hive implementations define additional table types. + private final List dataColumns; + private final List partitionColumns; + private final Storage storage; + private final Map parameters; + private final Optional viewOriginalText; + private final Optional viewExpandedText; + + @JsonCreator + public Table( + @JsonProperty("databaseName") String databaseName, + @JsonProperty("tableName") String tableName, + @JsonProperty("owner") String owner, + @JsonProperty("tableType") String tableType, + @JsonProperty("storage") Storage storage, + @JsonProperty("dataColumns") List dataColumns, + @JsonProperty("partitionColumns") List partitionColumns, + @JsonProperty("parameters") Map parameters, + @JsonProperty("viewOriginalText") Optional viewOriginalText, + @JsonProperty("viewExpandedText") Optional viewExpandedText) + { + this.databaseName = requireNonNull(databaseName, "databaseName is null"); + this.tableName = requireNonNull(tableName, "tableName is null"); + this.owner = requireNonNull(owner, "owner is null"); + this.tableType = requireNonNull(tableType, "tableType is null"); + this.storage = requireNonNull(storage, "storage is null"); + this.dataColumns = ImmutableList.copyOf(requireNonNull(dataColumns, "dataColumns is null")); + this.partitionColumns = ImmutableList.copyOf(requireNonNull(partitionColumns, "partitionColumns is null")); + this.parameters = ImmutableMap.copyOf(requireNonNull(parameters, "parameters is null")); + this.viewOriginalText = requireNonNull(viewOriginalText, "viewOriginalText is null"); + this.viewExpandedText = requireNonNull(viewExpandedText, "viewExpandedText is null"); + } + + @JsonProperty + public String getDatabaseName() + { + return databaseName; + } + + @JsonProperty + public String getTableName() + { + return tableName; + } + + @JsonIgnore + public SchemaTableName getSchemaTableName() + 
{ + return new SchemaTableName(databaseName, tableName); + } + + @JsonProperty + public String getOwner() + { + return owner; + } + + @JsonProperty + public String getTableType() + { + return tableType; + } + + @JsonProperty + public List getDataColumns() + { + return dataColumns; + } + + @JsonProperty + public List getPartitionColumns() + { + return partitionColumns; + } + + public Optional getColumn(String name) + { + return Stream.concat(partitionColumns.stream(), dataColumns.stream()) + .filter(column -> column.getName().equals(name)) + .findFirst(); + } + + @JsonProperty + public Storage getStorage() + { + return storage; + } + + @JsonProperty + public Map getParameters() + { + return parameters; + } + + @JsonProperty + public Optional getViewOriginalText() + { + return viewOriginalText; + } + + @JsonProperty + public Optional getViewExpandedText() + { + return viewExpandedText; + } + + public static Builder builder() + { + return new Builder(); + } + + public static Builder builder(Table table) + { + return new Builder(table); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("databaseName", databaseName) + .add("tableName", tableName) + .add("owner", owner) + .add("tableType", tableType) + .add("dataColumns", dataColumns) + .add("partitionColumns", partitionColumns) + .add("storage", storage) + .add("parameters", parameters) + .add("viewOriginalText", viewOriginalText) + .add("viewExpandedText", viewExpandedText) + .toString(); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Table table = (Table) o; + return Objects.equals(databaseName, table.databaseName) && + Objects.equals(tableName, table.tableName) && + Objects.equals(owner, table.owner) && + Objects.equals(tableType, table.tableType) && + Objects.equals(dataColumns, table.dataColumns) && + Objects.equals(partitionColumns, table.partitionColumns) && + Objects.equals(storage, table.storage) && + Objects.equals(parameters, table.parameters) && + Objects.equals(viewOriginalText, table.viewOriginalText) && + Objects.equals(viewExpandedText, table.viewExpandedText); + } + + @Override + public int hashCode() + { + return Objects.hash( + databaseName, + tableName, + owner, + tableType, + dataColumns, + partitionColumns, + storage, + parameters, + viewOriginalText, + viewExpandedText); + } + + public static class Builder + { + private final Storage.Builder storageBuilder; + private String databaseName; + private String tableName; + private String owner; + private String tableType; + private List dataColumns = new ArrayList<>(); + private List partitionColumns = new ArrayList<>(); + private Map parameters = new LinkedHashMap<>(); + private Optional viewOriginalText = Optional.empty(); + private Optional viewExpandedText = Optional.empty(); + + private Builder() + { + storageBuilder = Storage.builder(); + } + + private Builder(Table table) + { + databaseName = table.databaseName; + tableName = table.tableName; + owner = table.owner; + tableType = table.tableType; + storageBuilder = Storage.builder(table.getStorage()); + dataColumns = new ArrayList<>(table.dataColumns); + partitionColumns = new ArrayList<>(table.partitionColumns); + parameters = new LinkedHashMap<>(table.parameters); + viewOriginalText = table.viewOriginalText; + viewExpandedText = table.viewExpandedText; + } + + public Builder setDatabaseName(String databaseName) + { + this.databaseName = databaseName; + return this; + 
} + + public Builder setTableName(String tableName) + { + this.tableName = tableName; + return this; + } + + public Builder setOwner(String owner) + { + this.owner = owner; + return this; + } + + public Builder setTableType(String tableType) + { + this.tableType = tableType; + return this; + } + + public Storage.Builder getStorageBuilder() + { + return storageBuilder; + } + + public Builder setDataColumns(List dataColumns) + { + this.dataColumns = new ArrayList<>(dataColumns); + return this; + } + + public Builder addDataColumn(Column dataColumn) + { + this.dataColumns.add(dataColumn); + return this; + } + + public Builder setPartitionColumns(List partitionColumns) + { + this.partitionColumns = new ArrayList<>(partitionColumns); + return this; + } + + public Builder setParameters(Map parameters) + { + this.parameters = new LinkedHashMap<>(parameters); + return this; + } + + public Builder setParameter(String key, String value) + { + this.parameters.put(key, value); + return this; + } + + public Builder setViewOriginalText(Optional viewOriginalText) + { + this.viewOriginalText = viewOriginalText; + return this; + } + + public Builder setViewExpandedText(Optional viewExpandedText) + { + this.viewExpandedText = viewExpandedText; + return this; + } + + public Builder withStorage(Consumer consumer) + { + consumer.accept(storageBuilder); + return this; + } + + public Table build() + { + return new Table( + databaseName, + tableName, + owner, + tableType, + storageBuilder.build(), + dataColumns, + partitionColumns, + parameters, + viewOriginalText, + viewExpandedText); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserDatabaseKey.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserDatabaseKey.java new file mode 100644 index 00000000..43703394 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserDatabaseKey.java @@ -0,0 +1,79 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class UserDatabaseKey +{ + private final String user; + private final String database; + + @JsonCreator + public UserDatabaseKey(@JsonProperty("user") String user, @JsonProperty("database") String database) + { + this.user = requireNonNull(user, "principalName is null"); + this.database = requireNonNull(database, "database is null"); + } + + @JsonProperty + public String getUser() + { + return user; + } + + @JsonProperty + public String getDatabase() + { + return database; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + UserDatabaseKey that = (UserDatabaseKey) o; + return Objects.equals(user, that.user) && + Objects.equals(database, that.database); + } + + @Override + public int hashCode() + { + return Objects.hash(user, database); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("principalName", user) + .add("database", database) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserTableKey.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserTableKey.java new file mode 100644 index 00000000..e63a91bb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/UserTableKey.java @@ -0,0 +1,113 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; + +import javax.annotation.concurrent.Immutable; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +@Immutable +public class UserTableKey +{ + private final HivePrincipal principal; + private final String database; + private final String table; + private final String column; + + public UserTableKey(HivePrincipal principal, String database, String table) + { + // principal can be null when we want to list all privileges for admins + this(principal, database, table, null); + } + + @JsonCreator + public UserTableKey(@JsonProperty("principal") HivePrincipal principal, + @JsonProperty("database") String database, @JsonProperty("table") String table, + @JsonProperty("column") String column) + { + // principal can be null when we want to list all privileges for admins + this.principal = principal; + this.database = requireNonNull(database, "database is null"); + this.table = requireNonNull(table, "table is null"); + this.column = column; + } + + @JsonProperty + public HivePrincipal getPrincipal() + { + return principal; + } + + @JsonProperty + public String getDatabase() + { + return database; + } + + @JsonProperty + public String getTable() + { + return table; + } + + public boolean matches(String databaseName, String tableName) + { + return this.database.equals(databaseName) && this.table.equals(tableName); + } + + @JsonProperty + public String getColumn() + { + return column; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + UserTableKey that = (UserTableKey) o; + return Objects.equals(principal, that.principal) + && Objects.equals(table, that.table) + && Objects.equals(database, that.database) + && Objects.equals(column, that.column); + } + + @Override + public int hashCode() + { + return Objects.hash(principal, table, database, column); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("principal", principal) + .add("column", column) + .add("table", table) + .add("database", database) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/WriteHiveMetastoreRecordingProcedure.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/WriteHiveMetastoreRecordingProcedure.java new file mode 100644 index 00000000..daf61210 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/WriteHiveMetastoreRecordingProcedure.java @@ -0,0 +1,66 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive.metastore;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.util.concurrent.RateLimiter;
+import io.prestosql.spi.procedure.Procedure;
+
+import javax.inject.Inject;
+import javax.inject.Provider;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandle;
+
+import static io.prestosql.spi.block.MethodHandleUtil.methodHandle;
+import static java.util.Objects.requireNonNull;
+
+public class WriteHiveMetastoreRecordingProcedure
+        implements Provider<Procedure>
+{
+    private static final MethodHandle WRITE_HIVE_METASTORE_RECORDING = methodHandle(
+            WriteHiveMetastoreRecordingProcedure.class,
+            "writeHiveMetastoreRecording");
+
+    private final RateLimiter rateLimiter = RateLimiter.create(0.2);
+    private final RecordingHiveMetastore recordingHiveMetastore;
+
+    @Inject
+    public WriteHiveMetastoreRecordingProcedure(RecordingHiveMetastore recordingHiveMetastore)
+    {
+        this.recordingHiveMetastore = requireNonNull(recordingHiveMetastore, "recordingHiveMetastore is null");
+    }
+
+    @Override
+    public Procedure get()
+    {
+        return new Procedure(
+                "system",
+                "write_hive_metastore_recording",
+                ImmutableList.of(),
+                WRITE_HIVE_METASTORE_RECORDING.bindTo(this));
+    }
+
+    public void writeHiveMetastoreRecording()
+    {
+        try {
+            // limit rate of recording dumps to prevent IO and Presto saturation
+            rateLimiter.acquire();
+            recordingHiveMetastore.writeRecording();
+        }
+        catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/DatabaseMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/DatabaseMetadata.java
new file mode 100644
index 00000000..f687b9dd
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/DatabaseMetadata.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.metastore.file;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.google.common.collect.ImmutableMap;
+import io.prestosql.plugin.hive.metastore.Database;
+import io.prestosql.spi.security.PrincipalType;
+
+import java.util.Map;
+import java.util.Optional;
+
+import static java.util.Objects.requireNonNull;
+
+public class DatabaseMetadata
+{
+    private final String ownerName;
+    private final PrincipalType ownerType;
+    private final Optional<String> comment;
+    private final Map<String, String> parameters;
+
+    @JsonCreator
+    public DatabaseMetadata(
+            @JsonProperty("ownerName") String ownerName,
+            @JsonProperty("ownerType") PrincipalType ownerType,
+            @JsonProperty("comment") Optional<String> comment,
+            @JsonProperty("parameters") Map<String, String> parameters)
+    {
+        this.ownerName = requireNonNull(ownerName, "ownerName is null");
+        this.ownerType = requireNonNull(ownerType, "ownerType is null");
+        this.comment = requireNonNull(comment, "comment is null");
+        this.parameters = ImmutableMap.copyOf(requireNonNull(parameters, "parameters is null"));
+    }
+
+    public DatabaseMetadata(Database database)
+    {
+        this.ownerName = database.getOwnerName();
+        this.ownerType = database.getOwnerType();
+        this.comment = database.getComment();
+        this.parameters = database.getParameters();
+    }
+
+    @JsonProperty
+    public String getOwnerName()
+    {
+        return ownerName;
+    }
+
+    @JsonProperty
+    public PrincipalType getOwnerType()
+    {
+        return ownerType;
+    }
+
+    @JsonProperty
+    public Optional<String> getComment()
+    {
+        return comment;
+    }
+
+    @JsonProperty
+    public Map<String, String> getParameters()
+    {
+        return parameters;
+    }
+
+    public Database toDatabase(String databaseName, String location)
+    {
+        return Database.builder()
+                .setDatabaseName(databaseName)
+                .setLocation(Optional.of(location))
+                .setOwnerName(ownerName)
+                .setOwnerType(ownerType)
+                .setParameters(parameters)
+                .build();
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastore.java
new file mode 100644
index 00000000..736ccd9a
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastore.java
@@ -0,0 +1,1357 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive.metastore.file; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.io.ByteStreams; +import io.airlift.json.JsonCodec; +import io.prestosql.plugin.hive.HdfsConfiguration; +import io.prestosql.plugin.hive.HdfsConfigurationInitializer; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveHdfsConfiguration; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.PartitionNotFoundException; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ColumnNotFoundException; +import io.prestosql.spi.connector.SchemaAlreadyExistsException; +import io.prestosql.spi.connector.SchemaNotFoundException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableAlreadyExistsException; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.ConnectorIdentity; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; + +import javax.annotation.concurrent.ThreadSafe; +import javax.inject.Inject; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static 
com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.prestosql.plugin.hive.HiveMetadata.TABLE_COMMENT; +import static io.prestosql.plugin.hive.HivePartitionManager.extractPartitionValues; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.makePartitionName; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics; +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.security.PrincipalType.ROLE; +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static java.util.stream.Collectors.toSet; +import static org.apache.hadoop.hive.common.FileUtils.unescapePathName; +import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE; +import static org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE; +import static org.apache.hadoop.hive.metastore.TableType.VIRTUAL_VIEW; + +@ThreadSafe +public class FileHiveMetastore + implements HiveMetastore +{ + private static final String PUBLIC_ROLE_NAME = "public"; + private static final String ADMIN_ROLE_NAME = "admin"; + private static final String PRESTO_SCHEMA_FILE_NAME = ".prestoSchema"; + private static final String PRESTO_PERMISSIONS_DIRECTORY_NAME = ".prestoPermissions"; + // todo there should be a way to manage the admins list + private static final Set ADMIN_USERS = ImmutableSet.of("admin", "hive", "hdfs"); + + private final HdfsEnvironment hdfsEnvironment; + private final Path catalogDirectory; + private final HdfsContext hdfsContext; + private final FileSystem metadataFileSystem; + + private final JsonCodec databaseCodec = JsonCodec.jsonCodec(DatabaseMetadata.class); + private final JsonCodec tableCodec = JsonCodec.jsonCodec(TableMetadata.class); + private final JsonCodec partitionCodec = JsonCodec.jsonCodec(PartitionMetadata.class); + private final JsonCodec> permissionsCodec = JsonCodec.listJsonCodec(PermissionMetadata.class); + private final JsonCodec> rolesCodec = JsonCodec.listJsonCodec(String.class); + private final JsonCodec> roleGrantsCodec = JsonCodec.listJsonCodec(RoleGrant.class); + private long txnId = 1L; + + public static FileHiveMetastore createTestingFileHiveMetastore(File catalogDirectory) + { + HiveConfig hiveConfig = new HiveConfig(); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hiveConfig), ImmutableSet.of()); + HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hiveConfig, new NoHdfsAuthentication()); + return new FileHiveMetastore(hdfsEnvironment, catalogDirectory.toURI().toString(), "test"); + } + + @Inject + public FileHiveMetastore(HdfsEnvironment hdfsEnvironment, FileHiveMetastoreConfig config) + { + this(hdfsEnvironment, config.getCatalogDirectory(), config.getMetastoreUser()); + } + + public FileHiveMetastore(HdfsEnvironment hdfsEnvironment, String catalogDirectory, String metastoreUser) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.catalogDirectory = new Path(requireNonNull(catalogDirectory, "baseDirectory is null")); + this.hdfsContext = new 
HdfsContext(new ConnectorIdentity(metastoreUser, Optional.empty(), Optional.empty())); + try { + metadataFileSystem = hdfsEnvironment.getFileSystem(hdfsContext, this.catalogDirectory); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public synchronized void createDatabase(HiveIdentity identity, Database database) + { + requireNonNull(database, "database is null"); + + if (database.getLocation().isPresent()) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Database can not be created with a location set"); + } + + verifyDatabaseNotExists(database.getDatabaseName()); + + Path databaseMetadataDirectory = getDatabaseMetadataDirectory(database.getDatabaseName()); + writeSchemaFile("database", databaseMetadataDirectory, databaseCodec, new DatabaseMetadata(database), false); + } + + @Override + public synchronized void dropDatabase(HiveIdentity identity, String databaseName) + { + requireNonNull(databaseName, "databaseName is null"); + + getRequiredDatabase(databaseName); + if (!getAllTables(databaseName).orElse(ImmutableList.of()).isEmpty()) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Database " + databaseName + " is not empty"); + } + + deleteMetadataDirectory(getDatabaseMetadataDirectory(databaseName)); + } + + @Override + public synchronized void renameDatabase(HiveIdentity identity, String databaseName, String newDatabaseName) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(newDatabaseName, "newDatabaseName is null"); + + getRequiredDatabase(databaseName); + verifyDatabaseNotExists(newDatabaseName); + + try { + if (!metadataFileSystem.rename(getDatabaseMetadataDirectory(databaseName), getDatabaseMetadataDirectory(newDatabaseName))) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not rename database metadata directory"); + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public synchronized Optional getDatabase(String databaseName) + { + requireNonNull(databaseName, "databaseName is null"); + + Path databaseMetadataDirectory = getDatabaseMetadataDirectory(databaseName); + return readSchemaFile("database", databaseMetadataDirectory, databaseCodec) + .map(databaseMetadata -> databaseMetadata.toDatabase(databaseName, databaseMetadataDirectory.toString())); + } + + private Database getRequiredDatabase(String databaseName) + { + return getDatabase(databaseName) + .orElseThrow(() -> new SchemaNotFoundException(databaseName)); + } + + private void verifyDatabaseNotExists(String databaseName) + { + if (getDatabase(databaseName).isPresent()) { + throw new SchemaAlreadyExistsException(databaseName); + } + } + + @Override + public synchronized List getAllDatabases() + { + List databases = getChildSchemaDirectories(catalogDirectory).stream() + .map(Path::getName) + .collect(toList()); + return ImmutableList.copyOf(databases); + } + + @Override + public synchronized void createTable(HiveIdentity identity, Table table, PrincipalPrivileges principalPrivileges) + { + verifyTableNotExists(table.getDatabaseName(), table.getTableName()); + + Path tableMetadataDirectory = getTableMetadataDirectory(table); + + // validate table location + if (table.getTableType().equals(VIRTUAL_VIEW.name())) { + checkArgument(table.getStorage().getLocation().isEmpty(), "Storage location for view must be empty"); + } + else if (table.getTableType().equals(MANAGED_TABLE.name())) { + if 
(!tableMetadataDirectory.equals(new Path(table.getStorage().getLocation()))) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Table directory must be " + tableMetadataDirectory); + } + } + else if (table.getTableType().equals(EXTERNAL_TABLE.name())) { + try { + Path externalLocation = new Path(table.getStorage().getLocation()); + FileSystem externalFileSystem = hdfsEnvironment.getFileSystem(hdfsContext, externalLocation); + if (!externalFileSystem.isDirectory(externalLocation)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "External table location does not exist"); + } + if (isChildDirectory(catalogDirectory, externalLocation)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "External table location can not be inside the system metadata directory"); + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not validate external location", e); + } + } + else { + throw new PrestoException(NOT_SUPPORTED, "Table type not supported: " + table.getTableType()); + } + + writeSchemaFile("table", tableMetadataDirectory, tableCodec, new TableMetadata(table), false); + + for (Entry> entry : principalPrivileges.getUserPrivileges().asMap().entrySet()) { + setTablePrivileges(new HivePrincipal(USER, entry.getKey()), table.getDatabaseName(), table.getTableName(), entry.getValue()); + } + for (Entry> entry : principalPrivileges.getRolePrivileges().asMap().entrySet()) { + setTablePrivileges(new HivePrincipal(ROLE, entry.getKey()), table.getDatabaseName(), table.getTableName(), entry.getValue()); + } + } + + @Override + public synchronized Optional
getTable(HiveIdentity identity, String databaseName, String tableName)
+    {
+        return getTable(databaseName, tableName);
+    }
+
+    private Optional<Table>
getTable(String databaseName, String tableName) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + + Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName); + return readSchemaFile("table", tableMetadataDirectory, tableCodec) + .map(tableMetadata -> tableMetadata.toTable(databaseName, tableName, tableMetadataDirectory.toString())); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + return ThriftMetastoreUtil.getSupportedColumnStatistics(type); + } + + @Override + public synchronized PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + return getTableStatistics(identity, table.getDatabaseName(), table.getTableName()); + } + + private PartitionStatistics getTableStatistics(HiveIdentity identity, String databaseName, String tableName) + { + Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName); + TableMetadata tableMetadata = readSchemaFile("table", tableMetadataDirectory, tableCodec) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + HiveBasicStatistics basicStatistics = getHiveBasicStatistics(tableMetadata.getParameters()); + Map columnStatistics = tableMetadata.getColumnStatistics(); + return new PartitionStatistics(basicStatistics, columnStatistics); + } + + @Override + public synchronized Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + return partitions.stream() + .collect(toImmutableMap(partition -> + makePartitionName(table, partition), partition -> getPartitionStatistics(identity, partition.getValues(), table))); + } + + private synchronized PartitionStatistics getPartitionStatistics(HiveIdentity identity, List partitionValues, Table table) + { + Path partitionDirectory = getPartitionMetadataDirectory(table, ImmutableList.copyOf(partitionValues)); + PartitionMetadata partitionMetadata = readSchemaFile("partition", partitionDirectory, partitionCodec) + .orElseThrow(() -> new PartitionNotFoundException(table.getSchemaTableName(), partitionValues)); + HiveBasicStatistics basicStatistics = getHiveBasicStatistics(partitionMetadata.getParameters()); + return new PartitionStatistics(basicStatistics, partitionMetadata.getColumnStatistics()); + } + + private Table getRequiredTable(String databaseName, String tableName) + { + return getTable(databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + } + + private void verifyTableNotExists(String newDatabaseName, String newTableName) + { + if (getTable(newDatabaseName, newTableName).isPresent()) { + throw new TableAlreadyExistsException(new SchemaTableName(newDatabaseName, newTableName)); + } + } + + @Override + public synchronized void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + PartitionStatistics originalStatistics = getTableStatistics(identity, databaseName, tableName); + PartitionStatistics updatedStatistics = update.apply(originalStatistics); + + Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName); + TableMetadata tableMetadata = readSchemaFile("table", tableMetadataDirectory, tableCodec) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + + TableMetadata updatedMetadata = tableMetadata + .withParameters(ThriftMetastoreUtil.updateStatisticsParameters(tableMetadata.getParameters(), 
updatedStatistics.getBasicStatistics())) + .withColumnStatistics(updatedStatistics.getColumnStatistics()); + + writeSchemaFile("table", tableMetadataDirectory, tableCodec, updatedMetadata, true); + } + + @Override + public synchronized void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + Table table = getRequiredTable(databaseName, tableName); + PartitionStatistics originalStatistics = getPartitionStatistics(identity, extractPartitionValues(partitionName), table); + PartitionStatistics updatedStatistics = update.apply(originalStatistics); + + List partitionValues = extractPartitionValues(partitionName); + Path partitionDirectory = getPartitionMetadataDirectory(table, partitionValues); + PartitionMetadata partitionMetadata = readSchemaFile("partition", partitionDirectory, partitionCodec) + .orElseThrow(() -> new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), partitionValues)); + + PartitionMetadata updatedMetadata = partitionMetadata + .withParameters(ThriftMetastoreUtil.updateStatisticsParameters(partitionMetadata.getParameters(), updatedStatistics.getBasicStatistics())) + .withColumnStatistics(updatedStatistics.getColumnStatistics()); + + writeSchemaFile("partition", partitionDirectory, partitionCodec, updatedMetadata, true); + } + + @Override + public void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + partNamesUpdateFunctionMap.entrySet().stream().forEach(e -> { + updatePartitionStatistics(identity, databaseName, tableName, e.getKey(), e.getValue()); + }); + } + + @Override + public synchronized Optional> getAllTables(String databaseName) + { + requireNonNull(databaseName, "databaseName is null"); + + Optional database = getDatabase(databaseName); + if (!database.isPresent()) { + return Optional.empty(); + } + + Path databaseMetadataDirectory = getDatabaseMetadataDirectory(databaseName); + List tables = getChildSchemaDirectories(databaseMetadataDirectory).stream() + .map(Path::getName) + .collect(toList()); + return Optional.of(ImmutableList.copyOf(tables)); + } + + @Override + public synchronized Optional> getAllViews(String databaseName) + { + Optional> tables = getAllTables(databaseName); + if (!tables.isPresent()) { + return Optional.empty(); + } + + List views = tables.get().stream() + .map(tableName -> getTable(databaseName, tableName)) + .filter(Optional::isPresent) + .map(Optional::get) + .filter(table -> table.getTableType().equals(VIRTUAL_VIEW.name())) + .map(Table::getTableName) + .collect(toList()); + + return Optional.of(ImmutableList.copyOf(views)); + } + + @Override + public synchronized void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + + Table table = getRequiredTable(databaseName, tableName); + + Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName); + + // It is safe to delete the whole meta directory for external tables and views + if (!table.getTableType().equals(MANAGED_TABLE.name()) || deleteData) { + deleteMetadataDirectory(tableMetadataDirectory); + } + else { + // in this case we only wan to delete the metadata of a managed table + deleteSchemaFile("table", tableMetadataDirectory); + deleteTablePrivileges(table); + } + } + + @Override + public synchronized void replaceTable(HiveIdentity 
identity, String databaseName, String tableName, Table newTable, PrincipalPrivileges principalPrivileges) + { + Table table = getRequiredTable(databaseName, tableName); + if (!table.getTableType().equals(VIRTUAL_VIEW.name()) || !newTable.getTableType().equals(VIRTUAL_VIEW.name())) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Only views can be updated with replaceTable"); + } + if (!table.getDatabaseName().equals(databaseName) || !table.getTableName().equals(tableName)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Replacement table must have same name"); + } + + Path tableMetadataDirectory = getTableMetadataDirectory(table); + writeSchemaFile("table", tableMetadataDirectory, tableCodec, new TableMetadata(newTable), true); + + // replace existing permissions + deleteTablePrivileges(table); + + for (Entry> entry : principalPrivileges.getUserPrivileges().asMap().entrySet()) { + setTablePrivileges(new HivePrincipal(USER, entry.getKey()), table.getDatabaseName(), table.getTableName(), entry.getValue()); + } + for (Entry> entry : principalPrivileges.getRolePrivileges().asMap().entrySet()) { + setTablePrivileges(new HivePrincipal(ROLE, entry.getKey()), table.getDatabaseName(), table.getTableName(), entry.getValue()); + } + } + + @Override + public synchronized void renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + requireNonNull(newDatabaseName, "newDatabaseName is null"); + requireNonNull(newTableName, "newTableName is null"); + + getRequiredTable(databaseName, tableName); + getRequiredDatabase(newDatabaseName); + + // verify new table does not exist + verifyTableNotExists(newDatabaseName, newTableName); + + try { + if (!metadataFileSystem.rename(getTableMetadataDirectory(databaseName, tableName), getTableMetadataDirectory(newDatabaseName, newTableName))) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not rename table directory"); + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public synchronized void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment) + { + alterTable(databaseName, tableName, oldTable -> { + Map parameters = oldTable.getParameters().entrySet().stream() + .filter(entry -> !entry.getKey().equals(TABLE_COMMENT)) + .collect(Collectors.toMap(Entry::getKey, Entry::getValue)); + comment.ifPresent(value -> parameters.put(TABLE_COMMENT, value)); + + return oldTable.withParameters(ImmutableMap.builder() + .putAll(parameters) + .build()); + }); + } + + @Override + public synchronized void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment) + { + alterTable(databaseName, tableName, oldTable -> { + if (oldTable.getColumn(columnName).isPresent()) { + throw new PrestoException(ALREADY_EXISTS, "Column already exists: " + columnName); + } + + return oldTable.withDataColumns(ImmutableList.builder() + .addAll(oldTable.getDataColumns()) + .add(new Column(columnName, columnType, Optional.ofNullable(columnComment))) + .build()); + }); + } + + @Override + public synchronized void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName) + { + alterTable(databaseName, tableName, oldTable -> { + if 
(oldTable.getColumn(newColumnName).isPresent()) { + throw new PrestoException(ALREADY_EXISTS, "Column already exists: " + newColumnName); + } + if (!oldTable.getColumn(oldColumnName).isPresent()) { + SchemaTableName name = new SchemaTableName(databaseName, tableName); + throw new ColumnNotFoundException(name, oldColumnName); + } + for (Column column : oldTable.getPartitionColumns()) { + if (column.getName().equals(oldColumnName)) { + throw new PrestoException(NOT_SUPPORTED, "Renaming partition columns is not supported"); + } + } + + ImmutableList.Builder newDataColumns = ImmutableList.builder(); + for (Column fieldSchema : oldTable.getDataColumns()) { + if (fieldSchema.getName().equals(oldColumnName)) { + newDataColumns.add(new Column(newColumnName, fieldSchema.getType(), fieldSchema.getComment())); + } + else { + newDataColumns.add(fieldSchema); + } + } + + return oldTable.withDataColumns(newDataColumns.build()); + }); + } + + @Override + public synchronized void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + alterTable(databaseName, tableName, oldTable -> { + MetastoreUtil.verifyCanDropColumn(this, identity, databaseName, tableName, columnName); + if (!oldTable.getColumn(columnName).isPresent()) { + SchemaTableName name = new SchemaTableName(databaseName, tableName); + throw new ColumnNotFoundException(name, columnName); + } + + ImmutableList.Builder newDataColumns = ImmutableList.builder(); + for (Column fieldSchema : oldTable.getDataColumns()) { + if (!fieldSchema.getName().equals(columnName)) { + newDataColumns.add(fieldSchema); + } + } + + return oldTable.withDataColumns(newDataColumns.build()); + }); + } + + private void alterTable(String databaseName, String tableName, Function alterFunction) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + + Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName); + + TableMetadata oldTableSchema = readSchemaFile("table", tableMetadataDirectory, tableCodec) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + TableMetadata newTableSchema = alterFunction.apply(oldTableSchema); + if (oldTableSchema == newTableSchema) { + return; + } + + writeSchemaFile("table", tableMetadataDirectory, tableCodec, newTableSchema, true); + } + + @Override + public synchronized void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + requireNonNull(partitions, "partitions is null"); + + Table table = getRequiredTable(databaseName, tableName); + + TableType tableType = TableType.valueOf(table.getTableType()); + checkArgument(EnumSet.of(MANAGED_TABLE, EXTERNAL_TABLE).contains(tableType), "Invalid table type: %s", tableType); + + try { + Map schemaFiles = new LinkedHashMap<>(); + for (PartitionWithStatistics partitionWithStatistics : partitions) { + Partition partition = partitionWithStatistics.getPartition(); + verifiedPartition(table, partition); + Path partitionMetadataDirectory = getPartitionMetadataDirectory(table, partition.getValues()); + Path schemaPath = new Path(partitionMetadataDirectory, PRESTO_SCHEMA_FILE_NAME); + if (metadataFileSystem.exists(schemaPath)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Partition already exists"); + } + byte[] schemaJson = partitionCodec.toJsonBytes(new 
PartitionMetadata(table, partitionWithStatistics)); + schemaFiles.put(schemaPath, schemaJson); + } + + Set createdFiles = new LinkedHashSet<>(); + try { + for (Entry entry : schemaFiles.entrySet()) { + try (OutputStream outputStream = metadataFileSystem.create(entry.getKey())) { + createdFiles.add(entry.getKey()); + outputStream.write(entry.getValue()); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not write partition schema", e); + } + } + } + catch (Throwable e) { + for (Path createdFile : createdFiles) { + try { + metadataFileSystem.delete(createdFile, false); + } + catch (IOException ignored) { + } + } + throw e; + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + private void verifiedPartition(Table table, Partition partition) + { + Path partitionMetadataDirectory = getPartitionMetadataDirectory(table, partition.getValues()); + + if (table.getTableType().equals(MANAGED_TABLE.name())) { + if (!partitionMetadataDirectory.equals(new Path(partition.getStorage().getLocation()))) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Partition directory must be " + partitionMetadataDirectory); + } + } + else if (table.getTableType().equals(EXTERNAL_TABLE.name())) { + try { + Path externalLocation = new Path(partition.getStorage().getLocation()); + FileSystem externalFileSystem = hdfsEnvironment.getFileSystem(hdfsContext, externalLocation); + if (!externalFileSystem.isDirectory(externalLocation)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "External partition location does not exist"); + } + if (isChildDirectory(catalogDirectory, externalLocation)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "External partition location can not be inside the system metadata directory"); + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not validate external partition location", e); + } + } + else { + throw new PrestoException(NOT_SUPPORTED, "Partitions can not be added to " + table.getTableType()); + } + } + + @Override + public synchronized void dropPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues, boolean deleteData) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + requireNonNull(partitionValues, "partitionValues is null"); + + Optional
tableReference = getTable(identity, databaseName, tableName); + if (!tableReference.isPresent()) { + return; + } + Table table = tableReference.get(); + + Path partitionMetadataDirectory = getPartitionMetadataDirectory(table, partitionValues); + if (deleteData) { + deleteMetadataDirectory(partitionMetadataDirectory); + } + else { + deleteSchemaFile("partition", partitionMetadataDirectory); + } + } + + @Override + public synchronized void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partitionWithStatistics) + { + Table table = getRequiredTable(databaseName, tableName); + + Partition partition = partitionWithStatistics.getPartition(); + verifiedPartition(table, partition); + + Path partitionMetadataDirectory = getPartitionMetadataDirectory(table, partition.getValues()); + writeSchemaFile("partition", partitionMetadataDirectory, partitionCodec, new PartitionMetadata(table, partitionWithStatistics), true); + } + + @Override + public synchronized void createRole(String role, String grantor) + { + Set roles = new HashSet<>(listRoles()); + roles.add(role); + writeFile("roles", getRolesFile(), rolesCodec, ImmutableList.copyOf(roles), true); + } + + @Override + public synchronized void dropRole(String role) + { + Set roles = new HashSet<>(listRoles()); + roles.remove(role); + writeFile("roles", getRolesFile(), rolesCodec, ImmutableList.copyOf(roles), true); + Set grants = listRoleGrantsSanitized(); + writeRoleGrantsFile(grants); + } + + @Override + public synchronized Set listRoles() + { + return ImmutableSet.copyOf(readFile("roles", getRolesFile(), rolesCodec).orElse(ImmutableList.of())); + } + + @Override + public synchronized void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + Set existingRoles = listRoles(); + Set existingGrants = listRoleGrantsSanitized(); + Set modifiedGrants = new HashSet<>(existingGrants); + for (HivePrincipal grantee : grantees) { + for (String role : roles) { + checkArgument(existingRoles.contains(role), "Role does not exist: %s", role); + if (grantee.getType() == ROLE) { + checkArgument(existingRoles.contains(grantee.getName()), "Role does not exist: %s", grantee.getName()); + } + + RoleGrant grantWithAdminOption = new RoleGrant(grantee.toPrestoPrincipal(), role, true); + RoleGrant grantWithoutAdminOption = new RoleGrant(grantee.toPrestoPrincipal(), role, false); + + if (withAdminOption) { + modifiedGrants.remove(grantWithoutAdminOption); + modifiedGrants.add(grantWithAdminOption); + } + else { + modifiedGrants.remove(grantWithAdminOption); + modifiedGrants.add(grantWithoutAdminOption); + } + } + } + modifiedGrants = removeDuplicatedEntries(modifiedGrants); + if (!existingGrants.equals(modifiedGrants)) { + writeRoleGrantsFile(modifiedGrants); + } + } + + @Override + public synchronized void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + Set existingGrants = listRoleGrantsSanitized(); + Set modifiedGrants = new HashSet<>(existingGrants); + for (HivePrincipal grantee : grantees) { + for (String role : roles) { + RoleGrant grantWithAdminOption = new RoleGrant(grantee.toPrestoPrincipal(), role, true); + RoleGrant grantWithoutAdminOption = new RoleGrant(grantee.toPrestoPrincipal(), role, false); + + if (modifiedGrants.contains(grantWithAdminOption) || modifiedGrants.contains(grantWithoutAdminOption)) { + if (adminOptionFor) { + modifiedGrants.remove(grantWithAdminOption); + modifiedGrants.add(grantWithoutAdminOption); + } + else { + 
modifiedGrants.remove(grantWithAdminOption); + modifiedGrants.remove(grantWithoutAdminOption); + } + } + } + } + modifiedGrants = removeDuplicatedEntries(modifiedGrants); + if (!existingGrants.equals(modifiedGrants)) { + writeRoleGrantsFile(modifiedGrants); + } + } + + @Override + public synchronized Set listRoleGrants(HivePrincipal principal) + { + ImmutableSet.Builder result = ImmutableSet.builder(); + if (principal.getType() == USER) { + result.add(new RoleGrant(principal.toPrestoPrincipal(), PUBLIC_ROLE_NAME, false)); + if (ADMIN_USERS.contains(principal.getName())) { + result.add(new RoleGrant(principal.toPrestoPrincipal(), ADMIN_ROLE_NAME, true)); + } + } + result.addAll(listRoleGrantsSanitized().stream() + .filter(grant -> HivePrincipal.from(grant.getGrantee()).equals(principal)) + .collect(toSet())); + return result.build(); + } + + private synchronized Set listRoleGrantsSanitized() + { + Set grants = readRoleGrantsFile(); + Set existingRoles = listRoles(); + return removeDuplicatedEntries(removeNonExistingRoles(grants, existingRoles)); + } + + private Set removeDuplicatedEntries(Set grants) + { + Map map = new HashMap<>(); + for (RoleGrant grant : grants) { + RoleGranteeTuple tuple = new RoleGranteeTuple(grant.getRoleName(), HivePrincipal.from(grant.getGrantee())); + map.merge(tuple, grant, (first, second) -> first.isGrantable() ? first : second); + } + return ImmutableSet.copyOf(map.values()); + } + + private static Set removeNonExistingRoles(Set grants, Set existingRoles) + { + ImmutableSet.Builder result = ImmutableSet.builder(); + for (RoleGrant grant : grants) { + if (!existingRoles.contains(grant.getRoleName())) { + continue; + } + HivePrincipal grantee = HivePrincipal.from(grant.getGrantee()); + if (grantee.getType() == ROLE && !existingRoles.contains(grantee.getName())) { + continue; + } + result.add(grant); + } + return result.build(); + } + + private Set readRoleGrantsFile() + { + return ImmutableSet.copyOf(readFile("roleGrants", getRoleGrantsFile(), roleGrantsCodec).orElse(ImmutableList.of())); + } + + private void writeRoleGrantsFile(Set roleGrants) + { + writeFile("roleGrants", getRoleGrantsFile(), roleGrantsCodec, ImmutableList.copyOf(roleGrants), true); + } + + @Override + public synchronized Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + requireNonNull(identity, "identity is null"); + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + + Optional
tableReference = getTable(identity, databaseName, tableName); + if (!tableReference.isPresent()) { + return Optional.empty(); + } + Table table = tableReference.get(); + + Path tableMetadataDirectory = getTableMetadataDirectory(table); + + List> partitions = listPartitions(tableMetadataDirectory, table.getPartitionColumns()); + + List partitionNames = partitions.stream() + .map(partitionValues -> makePartitionName(table.getPartitionColumns(), ImmutableList.copyOf(partitionValues))) + .collect(toList()); + + return Optional.of(ImmutableList.copyOf(partitionNames)); + } + + private List> listPartitions(Path director, List partitionColumns) + { + if (partitionColumns.isEmpty()) { + return ImmutableList.of(); + } + + try { + String directoryPrefix = partitionColumns.get(0).getName() + '='; + + List> partitionValues = new ArrayList<>(); + for (FileStatus fileStatus : metadataFileSystem.listStatus(director)) { + if (!fileStatus.isDirectory()) { + continue; + } + if (!fileStatus.getPath().getName().startsWith(directoryPrefix)) { + continue; + } + + List> childPartitionValues; + if (partitionColumns.size() == 1) { + childPartitionValues = ImmutableList.of(new ArrayDeque<>()); + } + else { + childPartitionValues = listPartitions(fileStatus.getPath(), partitionColumns.subList(1, partitionColumns.size())); + } + + String value = unescapePathName(fileStatus.getPath().getName().substring(directoryPrefix.length())); + for (ArrayDeque childPartition : childPartitionValues) { + childPartition.addFirst(value); + partitionValues.add(childPartition); + } + } + return partitionValues; + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Error listing partition directories", e); + } + } + + @Override + public synchronized Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + requireNonNull(partitionValues, "partitionValues is null"); + + Optional
tableReference = getTable(identity, databaseName, tableName); + if (!tableReference.isPresent()) { + return Optional.empty(); + } + Table table = tableReference.get(); + + Path partitionDirectory = getPartitionMetadataDirectory(table, partitionValues); + return readSchemaFile("partition", partitionDirectory, partitionCodec) + .map(partitionMetadata -> partitionMetadata.toPartition(databaseName, tableName, partitionValues, partitionDirectory.toString())); + } + + @Override + public synchronized Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + // todo this should be more efficient by selectively walking the directory tree + return getPartitionNames(identity, databaseName, tableName).map(partitionNames -> partitionNames.stream() + .filter(partitionName -> partitionMatches(partitionName, parts)) + .collect(toList())); + } + + private static boolean partitionMatches(String partitionName, List parts) + { + List values = toPartitionValues(partitionName); + if (values.size() != parts.size()) { + return false; + } + for (int i = 0; i < values.size(); i++) { + String part = parts.get(i); + if (!part.isEmpty() && !values.get(i).equals(part)) { + return false; + } + } + return true; + } + + @Override + public synchronized Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + ImmutableMap.Builder> builder = ImmutableMap.builder(); + for (String partitionName : partitionNames) { + List partitionValues = toPartitionValues(partitionName); + builder.put(partitionName, getPartition(identity, databaseName, tableName, partitionValues)); + } + return builder.build(); + } + + @Override + public synchronized Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + Table table = getRequiredTable(databaseName, tableName); + Path permissionsDirectory = getPermissionsDirectory(table); + if (principal == null) { + return readAllPermissions(permissionsDirectory); + } + ImmutableSet.Builder result = ImmutableSet.builder(); + if (principal.getType() == USER && table.getOwner().equals(principal.getName())) { + result.add(new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.OWNERSHIP, true, principal, principal)); + } + result.addAll(readPermissionsFile(getPermissionsPath(permissionsDirectory, principal))); + return result.build(); + } + + @Override + public synchronized void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + setTablePrivileges(grantee, databaseName, tableName, privileges); + } + + @Override + public synchronized void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + Set currentPrivileges = listTablePrivileges(databaseName, tableName, grantee); + currentPrivileges.removeAll(privileges); + + setTablePrivileges(grantee, databaseName, tableName, currentPrivileges); + } + + @Override + public boolean isImpersonationEnabled() + { + return false; + } + + @Override + public long openTransaction(HiveIdentity identity) + { + return txnId++; + } + + @Override + public void commitTransaction(HiveIdentity identity, long transactionId) + { + } + + @Override + public void abortTransaction(HiveIdentity identity, long transactionId) + { + } + + @Override + public void sendTransactionHeartbeat(HiveIdentity identity, long transactionId) + { + /* doNothing */ + } + + @Override + public void acquireSharedReadLock(HiveIdentity identity, String queryId, 
long transactionId, List<SchemaTableName> fullTables, List<HivePartition> partitions)
+    {
+        /* doNothing */
+    }
+
+    @Override
+    public void acquireLock(HiveIdentity identity, String queryId, long transactionId, List<SchemaTableName> fullTables, List<HivePartition> partitions, DataOperationType operationType)
+    {
+        /* doNothing */
+    }
+
+    @Override
+    public String getValidWriteIds(HiveIdentity identity, List<SchemaTableName> tables, long currentTransactionId, boolean isVacuum)
+    {
+        // .
:::: + return format("%d$%s.%s:%d:9223372036854775807::", + currentTransactionId, + tables.get(0).getSchemaName(), + tables.get(0).getTableName(), + currentTransactionId - 1); + } + + @Override + public ShowLocksResponse showLocks(ShowLocksRequest rqst) + { + return new ShowLocksResponse(); + } + + @Override + public long getTableWriteId(String dbName, String tableName, long transactionId) + { + return transactionId; + } + + private synchronized void setTablePrivileges( + HivePrincipal grantee, + String databaseName, + String tableName, + Collection privileges) + { + requireNonNull(grantee, "grantee is null"); + requireNonNull(databaseName, "databaseName is null"); + requireNonNull(tableName, "tableName is null"); + requireNonNull(privileges, "privileges is null"); + + try { + Table table = getRequiredTable(databaseName, tableName); + + Path permissionsDirectory = getPermissionsDirectory(table); + + metadataFileSystem.mkdirs(permissionsDirectory); + if (!metadataFileSystem.isDirectory(permissionsDirectory)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not create permissions directory"); + } + + Path permissionFilePath = getPermissionsPath(permissionsDirectory, grantee); + List permissions = privileges.stream() + .map(PermissionMetadata::new) + .collect(toList()); + writeFile("permissions", permissionFilePath, permissionsCodec, permissions, true); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + private synchronized void deleteTablePrivileges(Table table) + { + try { + Path permissionsDirectory = getPermissionsDirectory(table); + metadataFileSystem.delete(permissionsDirectory, true); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not delete table permissions", e); + } + } + + private Path getDatabaseMetadataDirectory(String databaseName) + { + return new Path(catalogDirectory, databaseName); + } + + private Path getTableMetadataDirectory(Table table) + { + return getTableMetadataDirectory(table.getDatabaseName(), table.getTableName()); + } + + private Path getTableMetadataDirectory(String databaseName, String tableName) + { + return new Path(getDatabaseMetadataDirectory(databaseName), tableName); + } + + private Path getPartitionMetadataDirectory(Table table, List values) + { + String partitionName = makePartitionName(table.getPartitionColumns(), values); + return getPartitionMetadataDirectory(table, partitionName); + } + + private Path getPartitionMetadataDirectory(Table table, String partitionName) + { + Path tableMetadataDirectory = getTableMetadataDirectory(table); + return new Path(tableMetadataDirectory, partitionName); + } + + private Path getPermissionsDirectory(Table table) + { + return new Path(getTableMetadataDirectory(table), PRESTO_PERMISSIONS_DIRECTORY_NAME); + } + + private static Path getPermissionsPath(Path permissionsDirectory, HivePrincipal grantee) + { + return new Path(permissionsDirectory, grantee.getType().toString().toLowerCase(Locale.US) + "_" + grantee.getName()); + } + + private List getChildSchemaDirectories(Path metadataDirectory) + { + try { + if (!metadataFileSystem.isDirectory(metadataDirectory)) { + return ImmutableList.of(); + } + + ImmutableList.Builder childSchemaDirectories = ImmutableList.builder(); + for (FileStatus child : metadataFileSystem.listStatus(metadataDirectory)) { + if (!child.isDirectory()) { + continue; + } + Path childPath = child.getPath(); + if (childPath.getName().startsWith(".")) { + continue; 
+ } + if (metadataFileSystem.isFile(new Path(childPath, PRESTO_SCHEMA_FILE_NAME))) { + childSchemaDirectories.add(childPath); + } + } + return childSchemaDirectories.build(); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + private Path getRolesFile() + { + return new Path(catalogDirectory, ".roles"); + } + + private Path getRoleGrantsFile() + { + return new Path(catalogDirectory, ".roleGrants"); + } + + private Set readPermissionsFile(Path permissionFilePath) + { + return readFile("permissions", permissionFilePath, permissionsCodec).orElse(ImmutableList.of()).stream() + .map(PermissionMetadata::toHivePrivilegeInfo) + .collect(toImmutableSet()); + } + + private Set readAllPermissions(Path permissionsDirectory) + { + try { + return Arrays.stream(metadataFileSystem.listStatus(permissionsDirectory)) + .filter(FileStatus::isFile) + .filter(file -> !file.getPath().getName().startsWith(".")) + .flatMap(file -> readPermissionsFile(file.getPath()).stream()) + .collect(toImmutableSet()); + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + private void deleteMetadataDirectory(Path metadataDirectory) + { + try { + Path schemaPath = new Path(metadataDirectory, PRESTO_SCHEMA_FILE_NAME); + if (!metadataFileSystem.isFile(schemaPath)) { + // if there is no schema file, assume this is not a database, partition or table + return; + } + + if (!metadataFileSystem.delete(metadataDirectory, true)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not delete metadata directory"); + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + private Optional readSchemaFile(String type, Path metadataDirectory, JsonCodec codec) + { + Path schemaPath = new Path(metadataDirectory, PRESTO_SCHEMA_FILE_NAME); + return readFile(type + " schema", schemaPath, codec); + } + + private Optional readFile(String type, Path path, JsonCodec codec) + { + try { + if (!metadataFileSystem.isFile(path)) { + return Optional.empty(); + } + + try (FSDataInputStream inputStream = metadataFileSystem.open(path)) { + byte[] json = ByteStreams.toByteArray(inputStream); + return Optional.of(codec.fromJson(json)); + } + } + catch (Exception e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not read " + type, e); + } + } + + private void writeSchemaFile(String type, Path directory, JsonCodec codec, T value, boolean overwrite) + { + Path schemaPath = new Path(directory, PRESTO_SCHEMA_FILE_NAME); + writeFile(type + " schema", schemaPath, codec, value, overwrite); + } + + private void writeFile(String type, Path path, JsonCodec codec, T value, boolean overwrite) + { + try { + byte[] json = codec.toJsonBytes(value); + + if (!overwrite) { + if (metadataFileSystem.exists(path)) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, type + " file already exists"); + } + } + + metadataFileSystem.mkdirs(path.getParent()); + + // todo implement safer overwrite code + try (OutputStream outputStream = metadataFileSystem.create(path, overwrite)) { + outputStream.write(json); + } + } + catch (Exception e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not write " + type, e); + } + } + + private void deleteSchemaFile(String type, Path metadataDirectory) + { + try { + if (!metadataFileSystem.delete(new Path(metadataDirectory, PRESTO_SCHEMA_FILE_NAME), false)) { + throw new 
PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not delete " + type + " schema"); + } + } + catch (IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Could not delete " + type + " schema", e); + } + } + + private static boolean isChildDirectory(Path parentDirectory, Path childDirectory) + { + if (parentDirectory.equals(childDirectory)) { + return true; + } + if (childDirectory.isRoot()) { + return false; + } + return isChildDirectory(parentDirectory, childDirectory.getParent()); + } + + private static class RoleGranteeTuple + { + private final String role; + private final HivePrincipal grantee; + + private RoleGranteeTuple(String role, HivePrincipal grantee) + { + this.role = requireNonNull(role, "role is null"); + this.grantee = requireNonNull(grantee, "grantee is null"); + } + + public String getRole() + { + return role; + } + + public HivePrincipal getGrantee() + { + return grantee; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + RoleGranteeTuple that = (RoleGranteeTuple) o; + return Objects.equals(role, that.role) && + Objects.equals(grantee, that.grantee); + } + + @Override + public int hashCode() + { + return Objects.hash(role, grantee); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("role", role) + .add("grantee", grantee) + .toString(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastoreConfig.java new file mode 100644 index 00000000..5a005d79 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileHiveMetastoreConfig.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.file; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; + +import javax.validation.constraints.NotNull; + +public class FileHiveMetastoreConfig +{ + private String catalogDirectory; + private String metastoreUser = "presto"; + + @NotNull + public String getCatalogDirectory() + { + return catalogDirectory; + } + + @Config("hive.metastore.catalog.dir") + @ConfigDescription("Hive file-based metastore catalog directory") + public void setCatalogDirectory(String catalogDirectory) + { + this.catalogDirectory = catalogDirectory; + } + + @NotNull + public String getMetastoreUser() + { + return metastoreUser; + } + + @Config("hive.metastore.user") + @ConfigDescription("Hive file-based metastore username for file access") + public void setMetastoreUser(String metastoreUser) + { + this.metastoreUser = metastoreUser; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileMetastoreModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileMetastoreModule.java new file mode 100644 index 00000000..13954da3 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/FileMetastoreModule.java @@ -0,0 +1,38 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.file; + +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Scopes; +import io.prestosql.plugin.hive.ForCachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.CachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.HiveMetastore; + +import static io.airlift.configuration.ConfigBinder.configBinder; +import static org.weakref.jmx.guice.ExportBinder.newExporter; + +public class FileMetastoreModule + implements Module +{ + @Override + public void configure(Binder binder) + { + configBinder(binder).bindConfig(FileHiveMetastoreConfig.class); + binder.bind(HiveMetastore.class).annotatedWith(ForCachingHiveMetastore.class).to(FileHiveMetastore.class).in(Scopes.SINGLETON); + binder.bind(HiveMetastore.class).to(CachingHiveMetastore.class).in(Scopes.SINGLETON); + newExporter(binder).export(HiveMetastore.class) + .as(generator -> generator.generatedNameOf(CachingHiveMetastore.class)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PartitionMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PartitionMetadata.java new file mode 100644 index 00000000..dff25db9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PartitionMetadata.java @@ -0,0 +1,166 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.file; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBucketProperty; +import io.prestosql.plugin.hive.HiveStorageFormat; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil; +import org.apache.hadoop.hive.metastore.TableType; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public class PartitionMetadata +{ + private final List columns; + private final Map parameters; + + private final Optional storageFormat; + private final Optional bucketProperty; + private final Map serdeParameters; + + private final Optional externalLocation; + + private final Map columnStatistics; + + @JsonCreator + public PartitionMetadata( + @JsonProperty("columns") List columns, + @JsonProperty("parameters") Map parameters, + @JsonProperty("storageFormat") Optional storageFormat, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("serdeParameters") Map serdeParameters, + @JsonProperty("externalLocation") Optional externalLocation, + @JsonProperty("columnStatistics") Map columnStatistics) + { + this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null")); + this.parameters = ImmutableMap.copyOf(requireNonNull(parameters, "parameters is null")); + + this.storageFormat = requireNonNull(storageFormat, "storageFormat is null"); + this.bucketProperty = requireNonNull(bucketProperty, "bucketProperty is null"); + this.serdeParameters = requireNonNull(serdeParameters, "serdeParameters is null"); + + this.externalLocation = requireNonNull(externalLocation, "externalLocation is null"); + this.columnStatistics = ImmutableMap.copyOf(requireNonNull(columnStatistics, "columnStatistics is null")); + } + + public PartitionMetadata(Table table, PartitionWithStatistics partitionWithStatistics) + { + Partition partition = partitionWithStatistics.getPartition(); + PartitionStatistics statistics = partitionWithStatistics.getStatistics(); + + this.columns = partition.getColumns(); + this.parameters = ThriftMetastoreUtil.updateStatisticsParameters(partition.getParameters(), statistics.getBasicStatistics()); + + StorageFormat tableFormat = partition.getStorage().getStorageFormat(); + storageFormat = Arrays.stream(HiveStorageFormat.values()) + .filter(format -> tableFormat.equals(StorageFormat.fromHiveStorageFormat(format))) + .findFirst(); + + if (table.getTableType().equals(TableType.EXTERNAL_TABLE.name())) { + externalLocation = Optional.of(partition.getStorage().getLocation()); + } + else { + externalLocation = Optional.empty(); + } + + bucketProperty = partition.getStorage().getBucketProperty(); + serdeParameters = partition.getStorage().getSerdeParameters(); + columnStatistics = ImmutableMap.copyOf(statistics.getColumnStatistics()); + } + + @JsonProperty + public List getColumns() + 
{ + return columns; + } + + @JsonProperty + public Map getParameters() + { + return parameters; + } + + @JsonProperty + public Optional getStorageFormat() + { + return storageFormat; + } + + @JsonProperty + public Optional getBucketProperty() + { + return bucketProperty; + } + + @JsonProperty + public Map getSerdeParameters() + { + return serdeParameters; + } + + @JsonProperty + public Optional getExternalLocation() + { + return externalLocation; + } + + @JsonProperty + public Map getColumnStatistics() + { + return columnStatistics; + } + + public PartitionMetadata withParameters(Map parameters) + { + return new PartitionMetadata(columns, parameters, storageFormat, bucketProperty, serdeParameters, externalLocation, columnStatistics); + } + + public PartitionMetadata withColumnStatistics(Map columnStatistics) + { + return new PartitionMetadata(columns, parameters, storageFormat, bucketProperty, serdeParameters, externalLocation, columnStatistics); + } + + public Partition toPartition(String databaseName, String tableName, List values, String location) + { + return new Partition( + databaseName, + tableName, + values, + Storage.builder() + .setLocation(externalLocation.orElse(location)) + .setStorageFormat(storageFormat.map(StorageFormat::fromHiveStorageFormat).orElse(StorageFormat.VIEW_STORAGE_FORMAT)) + .setBucketProperty(bucketProperty) + .setSerdeParameters(serdeParameters) + .build(), + columns, + parameters); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PermissionMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PermissionMetadata.java new file mode 100644 index 00000000..30c1be7e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/PermissionMetadata.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.file; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; + +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.util.Objects.requireNonNull; + +public class PermissionMetadata +{ + private final HivePrivilegeInfo.HivePrivilege permission; + private final boolean grantOption; + + @JsonCreator + public PermissionMetadata( + @JsonProperty("permission") HivePrivilegeInfo.HivePrivilege permission, + @JsonProperty("grantOption") boolean grantOption) + { + this.permission = requireNonNull(permission, "permission is null"); + this.grantOption = grantOption; + } + + public PermissionMetadata(HivePrivilegeInfo privilegeInfo) + { + this.permission = privilegeInfo.getHivePrivilege(); + this.grantOption = privilegeInfo.isGrantOption(); + } + + @JsonProperty + public HivePrivilegeInfo.HivePrivilege getPermission() + { + return permission; + } + + @JsonProperty + public boolean isGrantOption() + { + return grantOption; + } + + public HivePrivilegeInfo toHivePrivilegeInfo() + { + return new HivePrivilegeInfo(permission, grantOption, new HivePrincipal(USER, "admin"), new HivePrincipal(USER, "admin")); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/TableMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/TableMetadata.java new file mode 100644 index 00000000..efc2aea5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/file/TableMetadata.java @@ -0,0 +1,283 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.file; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBucketProperty; +import io.prestosql.plugin.hive.HiveStorageFormat; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import org.apache.hadoop.hive.metastore.TableType; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class TableMetadata +{ + private final String owner; + private final String tableType; + private final List dataColumns; + private final List partitionColumns; + private final Map parameters; + + private final Optional storageFormat; + private final Optional bucketProperty; + private final Map serdeParameters; + + private final Optional externalLocation; + + private final Optional viewOriginalText; + private final Optional viewExpandedText; + + private final Map columnStatistics; + + @JsonCreator + public TableMetadata( + @JsonProperty("owner") String owner, + @JsonProperty("tableType") String tableType, + @JsonProperty("dataColumns") List dataColumns, + @JsonProperty("partitionColumns") List partitionColumns, + @JsonProperty("parameters") Map parameters, + @JsonProperty("storageFormat") Optional storageFormat, + @JsonProperty("bucketProperty") Optional bucketProperty, + @JsonProperty("serdeParameters") Map serdeParameters, + @JsonProperty("externalLocation") Optional externalLocation, + @JsonProperty("viewOriginalText") Optional viewOriginalText, + @JsonProperty("viewExpandedText") Optional viewExpandedText, + @JsonProperty("columnStatistics") Map columnStatistics) + { + this.owner = requireNonNull(owner, "owner is null"); + this.tableType = requireNonNull(tableType, "tableType is null"); + this.dataColumns = ImmutableList.copyOf(requireNonNull(dataColumns, "dataColumns is null")); + this.partitionColumns = ImmutableList.copyOf(requireNonNull(partitionColumns, "partitionColumns is null")); + this.parameters = ImmutableMap.copyOf(requireNonNull(parameters, "parameters is null")); + + this.storageFormat = requireNonNull(storageFormat, "storageFormat is null"); + this.bucketProperty = requireNonNull(bucketProperty, "bucketProperty is null"); + this.serdeParameters = requireNonNull(serdeParameters, "serdeParameters is null"); + this.externalLocation = requireNonNull(externalLocation, "externalLocation is null"); + if (tableType.equals(TableType.EXTERNAL_TABLE.name())) { + checkArgument(externalLocation.isPresent(), "External location is required for external tables"); + } + else { + checkArgument(!externalLocation.isPresent(), "External location is only allowed for external tables"); + } + + this.viewOriginalText = requireNonNull(viewOriginalText, "viewOriginalText is null"); + this.viewExpandedText = requireNonNull(viewExpandedText, "viewExpandedText is null"); + this.columnStatistics = ImmutableMap.copyOf(requireNonNull(columnStatistics, "columnStatistics is null")); + checkArgument(partitionColumns.isEmpty() || columnStatistics.isEmpty(), "column statistics cannot be set for 
partitioned table"); + } + + public TableMetadata(Table table) + { + this(table, ImmutableMap.of()); + } + + public TableMetadata(Table table, Map columnStatistics) + { + owner = table.getOwner(); + tableType = table.getTableType(); + dataColumns = table.getDataColumns(); + partitionColumns = table.getPartitionColumns(); + parameters = table.getParameters(); + + StorageFormat tableFormat = table.getStorage().getStorageFormat(); + storageFormat = Arrays.stream(HiveStorageFormat.values()) + .filter(format -> tableFormat.equals(StorageFormat.fromHiveStorageFormat(format))) + .findFirst(); + bucketProperty = table.getStorage().getBucketProperty(); + serdeParameters = table.getStorage().getSerdeParameters(); + + if (tableType.equals(TableType.EXTERNAL_TABLE.name())) { + externalLocation = Optional.of(table.getStorage().getLocation()); + } + else { + externalLocation = Optional.empty(); + } + + viewOriginalText = table.getViewOriginalText(); + viewExpandedText = table.getViewExpandedText(); + this.columnStatistics = ImmutableMap.copyOf(requireNonNull(columnStatistics, "columnStatistics is null")); + } + + @JsonProperty + public String getOwner() + { + return owner; + } + + @JsonProperty + public String getTableType() + { + return tableType; + } + + @JsonProperty + public List getDataColumns() + { + return dataColumns; + } + + @JsonProperty + public List getPartitionColumns() + { + return partitionColumns; + } + + public Optional getColumn(String name) + { + for (Column partitionColumn : partitionColumns) { + if (partitionColumn.getName().equals(name)) { + return Optional.of(partitionColumn); + } + } + for (Column dataColumn : dataColumns) { + if (dataColumn.getName().equals(name)) { + return Optional.of(dataColumn); + } + } + return Optional.empty(); + } + + @JsonProperty + public Map getParameters() + { + return parameters; + } + + @JsonProperty + public Optional getStorageFormat() + { + return storageFormat; + } + + @JsonProperty + public Optional getBucketProperty() + { + return bucketProperty; + } + + @JsonProperty + public Map getSerdeParameters() + { + return serdeParameters; + } + + @JsonProperty + public Optional getExternalLocation() + { + return externalLocation; + } + + @JsonProperty + public Optional getViewOriginalText() + { + return viewOriginalText; + } + + @JsonProperty + public Optional getViewExpandedText() + { + return viewExpandedText; + } + + @JsonProperty + public Map getColumnStatistics() + { + return columnStatistics; + } + + public TableMetadata withDataColumns(List dataColumns) + { + return new TableMetadata( + owner, + tableType, + dataColumns, + partitionColumns, + parameters, + storageFormat, + bucketProperty, + serdeParameters, + externalLocation, + viewOriginalText, + viewExpandedText, + columnStatistics); + } + + public TableMetadata withParameters(Map parameters) + { + return new TableMetadata( + owner, + tableType, + dataColumns, + partitionColumns, + parameters, + storageFormat, + bucketProperty, + serdeParameters, + externalLocation, + viewOriginalText, + viewExpandedText, + columnStatistics); + } + + public TableMetadata withColumnStatistics(Map columnStatistics) + { + return new TableMetadata( + owner, + tableType, + dataColumns, + partitionColumns, + parameters, + storageFormat, + bucketProperty, + serdeParameters, + externalLocation, + viewOriginalText, + viewExpandedText, + columnStatistics); + } + + public Table toTable(String databaseName, String tableName, String location) + { + return new Table( + databaseName, + tableName, + owner, + tableType, + 
Storage.builder() + .setLocation(externalLocation.orElse(location)) + .setStorageFormat(storageFormat.map(StorageFormat::fromHiveStorageFormat).orElse(StorageFormat.VIEW_STORAGE_FORMAT)) + .setBucketProperty(bucketProperty) + .setSerdeParameters(serdeParameters) + .build(), + dataColumns, + partitionColumns, + parameters, + viewOriginalText, + viewExpandedText); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueExpressionUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueExpressionUtil.java new file mode 100644 index 00000000..f307ec23 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueExpressionUtil.java @@ -0,0 +1,85 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.amazonaws.services.glue.model.GetPartitionsRequest; +import com.google.common.base.Joiner; +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.spi.PrestoException; + +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import static java.lang.String.format; + +public final class GlueExpressionUtil +{ + private static final Joiner JOINER = Joiner.on(" AND "); + private static final Set QUOTED_TYPES = ImmutableSet.of("string", "char", "varchar", "date", "timestamp", "binary", "varbinary"); + + private GlueExpressionUtil() {} + + /** + * Build an expression string used for partition filtering in {@link GetPartitionsRequest} + *
+     * Ex: partition keys: ['a', 'b']
+     *     partition values: ['1', '2']
+     *     expression: (a='1') AND (b='2')
+     *
+     * Partial specification example:
+     *      partition values: ['', '2']
+     *      expression: (b='2')
+     * 
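+     *
+     * A further illustrative case (hypothetical keys, assuming both are string-typed):
+     *      partition keys: ['ds', 'region']
+     *      partition values: ['2020-01-01', '']
+     *      expression: (ds='2020-01-01')
+     * The empty value is skipped and the string-typed key is quoted.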
+ * + * @param partitionKeys List of partition keys to filter on + * @param partitionValues Full or partial list of partition values to filter on. Keys without filter should be empty string. + */ + public static String buildGlueExpression(List partitionKeys, List partitionValues) + { + if (partitionValues == null || partitionValues.isEmpty()) { + return null; + } + + if (partitionKeys == null || partitionValues.size() != partitionKeys.size()) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Incorrect number of partition values: " + partitionValues); + } + + List predicates = new LinkedList<>(); + for (int i = 0; i < partitionValues.size(); i++) { + if (!Strings.isNullOrEmpty(partitionValues.get(i))) { + predicates.add(buildPredicate(partitionKeys.get(i), partitionValues.get(i))); + } + } + + return JOINER.join(predicates); + } + + private static String buildPredicate(Column partitionKey, String value) + { + if (isQuotedType(partitionKey.getType())) { + return format("(%s='%s')", partitionKey.getName(), value); + } + return format("(%s=%s)", partitionKey.getName(), value); + } + + private static boolean isQuotedType(HiveType type) + { + return QUOTED_TYPES.contains(type.getTypeSignature().getBase()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastore.java new file mode 100644 index 00000000..08d44697 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastore.java @@ -0,0 +1,898 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.amazonaws.AmazonServiceException; +import com.amazonaws.ClientConfiguration; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; +import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; +import com.amazonaws.regions.Region; +import com.amazonaws.regions.Regions; +import com.amazonaws.services.glue.AWSGlueAsync; +import com.amazonaws.services.glue.AWSGlueAsyncClientBuilder; +import com.amazonaws.services.glue.model.AlreadyExistsException; +import com.amazonaws.services.glue.model.BatchCreatePartitionRequest; +import com.amazonaws.services.glue.model.BatchCreatePartitionResult; +import com.amazonaws.services.glue.model.BatchGetPartitionRequest; +import com.amazonaws.services.glue.model.BatchGetPartitionResult; +import com.amazonaws.services.glue.model.CreateDatabaseRequest; +import com.amazonaws.services.glue.model.CreateTableRequest; +import com.amazonaws.services.glue.model.DatabaseInput; +import com.amazonaws.services.glue.model.DeleteDatabaseRequest; +import com.amazonaws.services.glue.model.DeletePartitionRequest; +import com.amazonaws.services.glue.model.DeleteTableRequest; +import com.amazonaws.services.glue.model.EntityNotFoundException; +import com.amazonaws.services.glue.model.ErrorDetail; +import com.amazonaws.services.glue.model.GetDatabaseRequest; +import com.amazonaws.services.glue.model.GetDatabaseResult; +import com.amazonaws.services.glue.model.GetDatabasesRequest; +import com.amazonaws.services.glue.model.GetDatabasesResult; +import com.amazonaws.services.glue.model.GetPartitionRequest; +import com.amazonaws.services.glue.model.GetPartitionResult; +import com.amazonaws.services.glue.model.GetPartitionsRequest; +import com.amazonaws.services.glue.model.GetPartitionsResult; +import com.amazonaws.services.glue.model.GetTableRequest; +import com.amazonaws.services.glue.model.GetTableResult; +import com.amazonaws.services.glue.model.GetTablesRequest; +import com.amazonaws.services.glue.model.GetTablesResult; +import com.amazonaws.services.glue.model.PartitionError; +import com.amazonaws.services.glue.model.PartitionInput; +import com.amazonaws.services.glue.model.PartitionValueList; +import com.amazonaws.services.glue.model.TableInput; +import com.amazonaws.services.glue.model.UpdateDatabaseRequest; +import com.amazonaws.services.glue.model.UpdatePartitionRequest; +import com.amazonaws.services.glue.model.UpdateTableRequest; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveUtil; +import io.prestosql.plugin.hive.HiveWriteUtils; +import io.prestosql.plugin.hive.PartitionNotFoundException; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import 
io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.glue.converter.GlueInputConverter; +import io.prestosql.plugin.hive.metastore.glue.converter.GlueToPrestoConverter; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ColumnNotFoundException; +import io.prestosql.spi.connector.SchemaAlreadyExistsException; +import io.prestosql.spi.connector.SchemaNotFoundException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableAlreadyExistsException; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.ConnectorIdentity; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.fs.Path; + +import javax.inject.Inject; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.function.Function; + +import static com.google.common.base.Strings.isNullOrEmpty; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_PARTITION_DROPPED_DURING_QUERY; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.makePartitionName; +import static io.prestosql.plugin.hive.metastore.glue.GlueExpressionUtil.buildGlueExpression; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics; +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.util.Objects.requireNonNull; +import static java.util.function.UnaryOperator.identity; +import static java.util.stream.Collectors.toList; +import static java.util.stream.Collectors.toMap; +import static org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE; +import static org.apache.hadoop.hive.metastore.TableType.VIRTUAL_VIEW; + +public class GlueHiveMetastore + implements HiveMetastore +{ + private static final Logger log = Logger.get(GlueHiveMetastore.class); + + private static final String PUBLIC_ROLE_NAME = "public"; + private static final String DEFAULT_METASTORE_USER = "presto"; + private static final String WILDCARD_EXPRESSION = ""; + private static final int BATCH_GET_PARTITION_MAX_PAGE_SIZE = 1000; + private static final int BATCH_CREATE_PARTITION_MAX_PAGE_SIZE = 100; + + private final HdfsEnvironment hdfsEnvironment; + private final HdfsContext hdfsContext; + private final AWSGlueAsync glueClient; + private final Optional defaultDir; + private final String catalogId; + + @Inject + public GlueHiveMetastore(HdfsEnvironment hdfsEnvironment, GlueHiveMetastoreConfig glueConfig) + { + this(hdfsEnvironment, glueConfig, createAsyncGlueClient(glueConfig)); + } + + public GlueHiveMetastore(HdfsEnvironment hdfsEnvironment, 
GlueHiveMetastoreConfig glueConfig, AWSGlueAsync glueClient) + { + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.hdfsContext = new HdfsContext(new ConnectorIdentity(DEFAULT_METASTORE_USER, Optional.empty(), Optional.empty())); + this.glueClient = requireNonNull(glueClient, "glueClient is null"); + this.defaultDir = glueConfig.getDefaultWarehouseDir(); + this.catalogId = glueConfig.getCatalogId().orElse(null); + } + + private static AWSGlueAsync createAsyncGlueClient(GlueHiveMetastoreConfig config) + { + ClientConfiguration clientConfig = new ClientConfiguration().withMaxConnections(config.getMaxGlueConnections()); + AWSGlueAsyncClientBuilder asyncGlueClientBuilder = AWSGlueAsyncClientBuilder.standard() + .withClientConfiguration(clientConfig); + + if (config.getGlueRegion().isPresent()) { + asyncGlueClientBuilder.setRegion(config.getGlueRegion().get()); + } + else if (config.getPinGlueClientToCurrentRegion()) { + Region currentRegion = Regions.getCurrentRegion(); + if (currentRegion != null) { + asyncGlueClientBuilder.setRegion(currentRegion.getName()); + } + } + + asyncGlueClientBuilder.setCredentials(getAwsCredentialsProvider(config)); + + return asyncGlueClientBuilder.build(); + } + + private static AWSCredentialsProvider getAwsCredentialsProvider(GlueHiveMetastoreConfig config) + { + if (config.getAwsAccessKey().isPresent() && config.getAwsSecretKey().isPresent()) { + return new AWSStaticCredentialsProvider( + new BasicAWSCredentials(config.getAwsAccessKey().get(), config.getAwsSecretKey().get())); + } + else if (config.getIamRole().isPresent()) { + return new STSAssumeRoleSessionCredentialsProvider + .Builder(config.getIamRole().get(), "presto-session") + .build(); + } + return DefaultAWSCredentialsProviderChain.getInstance(); + } + + @Override + public Optional getDatabase(String databaseName) + { + try { + GetDatabaseResult result = glueClient.getDatabase(new GetDatabaseRequest().withCatalogId(catalogId).withName(databaseName)); + return Optional.of(GlueToPrestoConverter.convertDatabase(result.getDatabase())); + } + catch (EntityNotFoundException e) { + return Optional.empty(); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public List getAllDatabases() + { + try { + List databaseNames = new ArrayList<>(); + String nextToken = null; + + do { + GetDatabasesResult result = glueClient.getDatabases(new GetDatabasesRequest().withCatalogId(catalogId).withNextToken(nextToken)); + nextToken = result.getNextToken(); + result.getDatabaseList().forEach(database -> databaseNames.add(database.getName())); + } + while (nextToken != null); + + return databaseNames; + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + try { + GetTableResult result = glueClient.getTable(new GetTableRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withName(tableName)); + return Optional.of(GlueToPrestoConverter.convertTable(result.getTable(), databaseName)); + } + catch (EntityNotFoundException e) { + return Optional.empty(); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + return ImmutableSet.of(); + } + + private Table getTableOrElseThrow(HiveIdentity identity, String databaseName, String tableName) + { + return getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + } + + @Override + public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + return new PartitionStatistics(getHiveBasicStatistics(table.getParameters()), ImmutableMap.of()); + } + + @Override + public Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + return partitions.stream().collect(toImmutableMap(partition -> makePartitionName(table, partition), this::getPartitionStatistics)); + } + + private PartitionStatistics getPartitionStatistics(Partition partition) + { + return new PartitionStatistics(getHiveBasicStatistics(partition.getParameters()), ImmutableMap.of()); + } + + @Override + public void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + Table table = getTableOrElseThrow(identity, databaseName, tableName); + PartitionStatistics currentStatistics = getTableStatistics(identity, table); + PartitionStatistics updatedStatistics = update.apply(currentStatistics); + if (!updatedStatistics.getColumnStatistics().isEmpty()) { + throw new PrestoException(NOT_SUPPORTED, "Glue metastore does not support column level statistics"); + } + + try { + TableInput tableInput = GlueInputConverter.convertTable(table); + tableInput.setParameters(ThriftMetastoreUtil.updateStatisticsParameters(table.getParameters(), updatedStatistics.getBasicStatistics())); + glueClient.updateTable(new UpdateTableRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableInput(tableInput)); + } + catch (EntityNotFoundException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + List partitionValues = toPartitionValues(partitionName); + Partition partition = getPartition(identity, databaseName, tableName, partitionValues) + .orElseThrow(() -> new PrestoException(HIVE_PARTITION_DROPPED_DURING_QUERY, "Statistics result does not contain entry for partition: " + partitionName)); + + PartitionStatistics currentStatistics = getPartitionStatistics(partition); + PartitionStatistics updatedStatistics = update.apply(currentStatistics); + if (!updatedStatistics.getColumnStatistics().isEmpty()) { + throw new PrestoException(NOT_SUPPORTED, "Glue metastore does not support column level statistics"); + } + + try { + PartitionInput partitionInput = GlueInputConverter.convertPartition(partition); + 
partitionInput.setParameters(ThriftMetastoreUtil.updateStatisticsParameters(partition.getParameters(), updatedStatistics.getBasicStatistics())); + glueClient.updatePartition(new UpdatePartitionRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableName(tableName) + .withPartitionValueList(partition.getValues()) + .withPartitionInput(partitionInput)); + } + catch (EntityNotFoundException e) { + throw new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), partitionValues); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + partNamesUpdateFunctionMap.entrySet().stream().forEach(e -> { + updatePartitionStatistics(identity, databaseName, tableName, e.getKey(), e.getValue()); + }); + } + + @Override + public Optional> getAllTables(String databaseName) + { + try { + List tableNames = new ArrayList<>(); + String nextToken = null; + + do { + GetTablesResult result = glueClient.getTables(new GetTablesRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withNextToken(nextToken)); + result.getTableList().forEach(table -> tableNames.add(table.getName())); + nextToken = result.getNextToken(); + } + while (nextToken != null); + + return Optional.of(tableNames); + } + catch (EntityNotFoundException e) { + // database does not exist + return Optional.empty(); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public Optional> getAllViews(String databaseName) + { + try { + List views = new ArrayList<>(); + String nextToken = null; + + do { + GetTablesResult result = glueClient.getTables(new GetTablesRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withNextToken(nextToken)); + result.getTableList().stream() + .filter(table -> VIRTUAL_VIEW.name().equals(table.getTableType())) + .forEach(table -> views.add(table.getName())); + nextToken = result.getNextToken(); + } + while (nextToken != null); + + return Optional.of(views); + } + catch (EntityNotFoundException e) { + // database does not exist + return Optional.empty(); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void createDatabase(HiveIdentity identity, Database database) + { + if (!database.getLocation().isPresent() && defaultDir.isPresent()) { + String databaseLocation = new Path(defaultDir.get(), database.getDatabaseName()).toString(); + database = Database.builder(database) + .setLocation(Optional.of(databaseLocation)) + .build(); + } + + try { + DatabaseInput databaseInput = GlueInputConverter.convertDatabase(database); + glueClient.createDatabase(new CreateDatabaseRequest().withCatalogId(catalogId).withDatabaseInput(databaseInput)); + } + catch (AlreadyExistsException e) { + throw new SchemaAlreadyExistsException(database.getDatabaseName()); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + + if (database.getLocation().isPresent()) { + HiveWriteUtils.createDirectory(hdfsContext, hdfsEnvironment, new Path(database.getLocation().get())); + } + } + + @Override + public void dropDatabase(HiveIdentity identity, String databaseName) + { + try { + glueClient.deleteDatabase(new 
DeleteDatabaseRequest().withCatalogId(catalogId).withName(databaseName)); + } + catch (EntityNotFoundException e) { + throw new SchemaNotFoundException(databaseName); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void renameDatabase(HiveIdentity identity, String databaseName, String newDatabaseName) + { + try { + Database database = getDatabase(databaseName).orElseThrow(() -> new SchemaNotFoundException(databaseName)); + DatabaseInput renamedDatabase = GlueInputConverter.convertDatabase(database).withName(newDatabaseName); + glueClient.updateDatabase(new UpdateDatabaseRequest() + .withCatalogId(catalogId) + .withName(databaseName) + .withDatabaseInput(renamedDatabase)); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void createTable(HiveIdentity identity, Table table, PrincipalPrivileges principalPrivileges) + { + try { + TableInput input = GlueInputConverter.convertTable(table); + glueClient.createTable(new CreateTableRequest() + .withCatalogId(catalogId) + .withDatabaseName(table.getDatabaseName()) + .withTableInput(input)); + } + catch (AlreadyExistsException e) { + throw new TableAlreadyExistsException(new SchemaTableName(table.getDatabaseName(), table.getTableName())); + } + catch (EntityNotFoundException e) { + throw new SchemaNotFoundException(table.getDatabaseName()); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + Table table = getTableOrElseThrow(identity, databaseName, tableName); + + try { + glueClient.deleteTable(new DeleteTableRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withName(tableName)); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + + String tableLocation = table.getStorage().getLocation(); + if (deleteData && isManagedTable(table) && !isNullOrEmpty(tableLocation)) { + deleteDir(hdfsContext, hdfsEnvironment, new Path(tableLocation), true); + } + } + + private static boolean isManagedTable(Table table) + { + return table.getTableType().equals(MANAGED_TABLE.name()); + } + + private static void deleteDir(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path, boolean recursive) + { + try { + hdfsEnvironment.getFileSystem(context, path).delete(path, recursive); + } + catch (Exception e) { + // don't fail if unable to delete path + log.warn(e, "Failed to delete path: " + path.toString()); + } + } + + @Override + public void replaceTable(HiveIdentity identity, String databaseName, String tableName, Table newTable, PrincipalPrivileges principalPrivileges) + { + try { + TableInput newTableInput = GlueInputConverter.convertTable(newTable); + glueClient.updateTable(new UpdateTableRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableInput(newTableInput)); + } + catch (EntityNotFoundException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName) + { + throw new 
PrestoException(NOT_SUPPORTED, "Table rename is not yet supported by Glue service"); + } + + @Override + public void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment) + { + throw new PrestoException(NOT_SUPPORTED, "Table comment is not yet supported by Glue service"); + } + + @Override + public void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment) + { + Table oldTable = getTableOrElseThrow(identity, databaseName, tableName); + Table newTable = Table.builder(oldTable) + .addDataColumn(new Column(columnName, columnType, Optional.ofNullable(columnComment))) + .build(); + replaceTable(identity, databaseName, tableName, newTable, null); + } + + @Override + public void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName) + { + Table oldTable = getTableOrElseThrow(identity, databaseName, tableName); + if (oldTable.getPartitionColumns().stream().anyMatch(c -> c.getName().equals(oldColumnName))) { + throw new PrestoException(NOT_SUPPORTED, "Renaming partition columns is not supported"); + } + + ImmutableList.Builder newDataColumns = ImmutableList.builder(); + for (Column column : oldTable.getDataColumns()) { + if (column.getName().equals(oldColumnName)) { + newDataColumns.add(new Column(newColumnName, column.getType(), column.getComment())); + } + else { + newDataColumns.add(column); + } + } + + Table newTable = Table.builder(oldTable) + .setDataColumns(newDataColumns.build()) + .build(); + replaceTable(identity, databaseName, tableName, newTable, null); + } + + @Override + public void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + MetastoreUtil.verifyCanDropColumn(this, identity, databaseName, tableName, columnName); + Table oldTable = getTableOrElseThrow(identity, databaseName, tableName); + + if (!oldTable.getColumn(columnName).isPresent()) { + SchemaTableName name = new SchemaTableName(databaseName, tableName); + throw new ColumnNotFoundException(name, columnName); + } + + ImmutableList.Builder newDataColumns = ImmutableList.builder(); + oldTable.getDataColumns().stream() + .filter(fieldSchema -> !fieldSchema.getName().equals(columnName)) + .forEach(newDataColumns::add); + + Table newTable = Table.builder(oldTable) + .setDataColumns(newDataColumns.build()) + .build(); + replaceTable(identity, databaseName, tableName, newTable, null); + } + + @Override + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + try { + GetPartitionResult result = glueClient.getPartition(new GetPartitionRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableName(tableName) + .withPartitionValues(partitionValues)); + return Optional.of(GlueToPrestoConverter.convertPartition(result.getPartition())); + } + catch (EntityNotFoundException e) { + return Optional.empty(); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + Table table = getTableOrElseThrow(identity, databaseName, tableName); + List partitions = getPartitions(databaseName, tableName, WILDCARD_EXPRESSION); + return Optional.of(buildPartitionNames(table.getPartitionColumns(), partitions)); + } + + /** + *
+     * Ex: Partition keys = ['a', 'b', 'c']
+     *     Valid partition values:
+     *     ['1','2','3'] or
+     *     ['', '2', '']
+     * 
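+     *
+     * Reading of the partial example above: parts ['', '2', ''] constrain only the
+     * second key, so the expression sent to Glue is (b='2') (quoted here on the
+     * assumption that 'b' is string-typed) and every value of 'a' and 'c' matches.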
+ * + * @param parts Full or partial list of partition values to filter on. Keys without filter will be empty strings. + * @return a list of partition names. + */ + @Override + public Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + Table table = getTableOrElseThrow(identity, databaseName, tableName); + String expression = buildGlueExpression(table.getPartitionColumns(), parts); + List partitions = getPartitions(databaseName, tableName, expression); + return Optional.of(buildPartitionNames(table.getPartitionColumns(), partitions)); + } + + private List getPartitions(String databaseName, String tableName, String expression) + { + try { + List partitions = new ArrayList<>(); + String nextToken = null; + + do { + GetPartitionsResult result = glueClient.getPartitions(new GetPartitionsRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableName(tableName) + .withExpression(expression) + .withNextToken(nextToken)); + result.getPartitions() + .forEach(partition -> partitions.add(GlueToPrestoConverter.convertPartition(partition))); + nextToken = result.getNextToken(); + } + while (nextToken != null); + + return partitions; + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + private static List buildPartitionNames(List partitionColumns, List partitions) + { + return partitions.stream() + .map(partition -> makePartitionName(partitionColumns, partition.getValues())) + .collect(toList()); + } + + /** + *
+     * Ex: Partition keys = ['a', 'b']
+     *     Partition names = ['a=1/b=2', 'a=2/b=2']
+     * 
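+     *
+     * Note on behaviour: every requested name appears as a key in the result map;
+     * partitions that Glue does not return are mapped to Optional.empty() rather
+     * than being omitted from the map.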
+ * + * @param partitionNames List of full partition names + * @return Mapping of partition name to partition object + */ + @Override + public Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + requireNonNull(partitionNames, "partitionNames is null"); + if (partitionNames.isEmpty()) { + return ImmutableMap.of(); + } + + List partitions = batchGetPartition(databaseName, tableName, partitionNames); + + Map> partitionNameToPartitionValuesMap = partitionNames.stream() + .collect(toMap(identity(), HiveUtil::toPartitionValues)); + Map, Partition> partitionValuesToPartitionMap = partitions.stream() + .collect(toMap(Partition::getValues, identity())); + + ImmutableMap.Builder> resultBuilder = ImmutableMap.builder(); + for (Entry> entry : partitionNameToPartitionValuesMap.entrySet()) { + Partition partition = partitionValuesToPartitionMap.get(entry.getValue()); + resultBuilder.put(entry.getKey(), Optional.ofNullable(partition)); + } + return resultBuilder.build(); + } + + private List batchGetPartition(String databaseName, String tableName, List partitionNames) + { + try { + List partitionValueLists = partitionNames.stream() + .map(partitionName -> new PartitionValueList().withValues(toPartitionValues(partitionName))).collect(toList()); + + List> batchedPartitionValueLists = Lists.partition(partitionValueLists, BATCH_GET_PARTITION_MAX_PAGE_SIZE); + List> batchGetPartitionFutures = new ArrayList<>(); + List result = new ArrayList<>(); + + for (List partitions : batchedPartitionValueLists) { + batchGetPartitionFutures.add(glueClient.batchGetPartitionAsync(new BatchGetPartitionRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableName(tableName) + .withPartitionsToGet(partitions))); + } + + for (Future future : batchGetPartitionFutures) { + future.get().getPartitions() + .forEach(partition -> result.add(GlueToPrestoConverter.convertPartition(partition))); + } + + return result; + } + catch (AmazonServiceException | InterruptedException | ExecutionException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + try { + List> batchedPartitions = Lists.partition(partitions, BATCH_CREATE_PARTITION_MAX_PAGE_SIZE); + List> futures = new ArrayList<>(); + + for (List partitionBatch : batchedPartitions) { + List partitionInputs = partitionBatch.stream().map(GlueInputConverter::convertPartition).collect(toList()); + futures.add(glueClient.batchCreatePartitionAsync(new BatchCreatePartitionRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableName(tableName) + .withPartitionInputList(partitionInputs))); + } + + for (Future future : futures) { + BatchCreatePartitionResult result = future.get(); + propagatePartitionErrorToPrestoException(databaseName, tableName, result.getErrors()); + } + } + catch (AmazonServiceException | InterruptedException | ExecutionException e) { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + private static void propagatePartitionErrorToPrestoException(String databaseName, String tableName, List partitionErrors) + { + if (partitionErrors != null && !partitionErrors.isEmpty()) { + ErrorDetail errorDetail = 
partitionErrors.get(0).getErrorDetail(); + String glueExceptionCode = errorDetail.getErrorCode(); + + switch (glueExceptionCode) { + case "AlreadyExistsException": + throw new PrestoException(ALREADY_EXISTS, errorDetail.getErrorMessage()); + case "EntityNotFoundException": + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName), errorDetail.getErrorMessage()); + default: + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, errorDetail.getErrorCode() + ": " + errorDetail.getErrorMessage()); + } + } + } + + @Override + public void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + Table table = getTableOrElseThrow(identity, databaseName, tableName); + Partition partition = getPartition(identity, databaseName, tableName, parts) + .orElseThrow(() -> new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), parts)); + + try { + glueClient.deletePartition(new DeletePartitionRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableName(tableName) + .withPartitionValues(parts)); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + + String partLocation = partition.getStorage().getLocation(); + if (deleteData && isManagedTable(table) && !isNullOrEmpty(partLocation)) { + deleteDir(hdfsContext, hdfsEnvironment, new Path(partLocation), true); + } + } + + @Override + public void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition) + { + try { + PartitionInput newPartition = GlueInputConverter.convertPartition(partition); + glueClient.updatePartition(new UpdatePartitionRequest() + .withCatalogId(catalogId) + .withDatabaseName(databaseName) + .withTableName(tableName) + .withPartitionInput(newPartition) + .withPartitionValueList(partition.getPartition().getValues())); + } + catch (EntityNotFoundException e) { + throw new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), partition.getPartition().getValues()); + } + catch (AmazonServiceException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + } + + @Override + public void createRole(String role, String grantor) + { + throw new PrestoException(NOT_SUPPORTED, "createRole is not supported by Glue"); + } + + @Override + public void dropRole(String role) + { + throw new PrestoException(NOT_SUPPORTED, "dropRole is not supported by Glue"); + } + + @Override + public Set listRoles() + { + return ImmutableSet.of(PUBLIC_ROLE_NAME); + } + + @Override + public void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + throw new PrestoException(NOT_SUPPORTED, "grantRoles is not supported by Glue"); + } + + @Override + public void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + throw new PrestoException(NOT_SUPPORTED, "revokeRoles is not supported by Glue"); + } + + @Override + public Set listRoleGrants(HivePrincipal principal) + { + if (principal.getType() == USER) { + return ImmutableSet.of(new RoleGrant(principal.toPrestoPrincipal(), PUBLIC_ROLE_NAME, false)); + } + return ImmutableSet.of(); + } + + @Override + public void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + throw new PrestoException(NOT_SUPPORTED, "grantTablePrivileges is not supported by Glue"); + } + + @Override + public void revokeTablePrivileges(String 
databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + throw new PrestoException(NOT_SUPPORTED, "revokeTablePrivileges is not supported by Glue"); + } + + @Override + public Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + throw new PrestoException(NOT_SUPPORTED, "listTablePrivileges is not supported by Glue"); + } + + @Override + public boolean isImpersonationEnabled() + { + throw new PrestoException(NOT_SUPPORTED, "isImpersonationEnabled is not supported by Glue"); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastoreConfig.java new file mode 100644 index 00000000..761ca711 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueHiveMetastoreConfig.java @@ -0,0 +1,140 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.glue; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import io.airlift.configuration.ConfigSecuritySensitive; + +import javax.validation.constraints.Min; + +import java.util.Optional; + +public class GlueHiveMetastoreConfig +{ + private Optional glueRegion = Optional.empty(); + private boolean pinGlueClientToCurrentRegion; + private int maxGlueConnections = 5; + private Optional defaultWarehouseDir = Optional.empty(); + private Optional iamRole = Optional.empty(); + private Optional awsAccessKey = Optional.empty(); + private Optional awsSecretKey = Optional.empty(); + private Optional catalogId = Optional.empty(); + + public Optional getGlueRegion() + { + return glueRegion; + } + + @Config("hive.metastore.glue.region") + @ConfigDescription("AWS Region for Glue Data Catalog") + public GlueHiveMetastoreConfig setGlueRegion(String region) + { + this.glueRegion = Optional.ofNullable(region); + return this; + } + + public boolean getPinGlueClientToCurrentRegion() + { + return pinGlueClientToCurrentRegion; + } + + @Config("hive.metastore.glue.pin-client-to-current-region") + @ConfigDescription("Should the Glue client be pinned to the current EC2 region") + public GlueHiveMetastoreConfig setPinGlueClientToCurrentRegion(boolean pinGlueClientToCurrentRegion) + { + this.pinGlueClientToCurrentRegion = pinGlueClientToCurrentRegion; + return this; + } + + @Min(1) + public int getMaxGlueConnections() + { + return maxGlueConnections; + } + + @Config("hive.metastore.glue.max-connections") + @ConfigDescription("Max number of concurrent connections to Glue") + public GlueHiveMetastoreConfig setMaxGlueConnections(int maxGlueConnections) + { + this.maxGlueConnections = maxGlueConnections; + return this; + } + + public Optional getDefaultWarehouseDir() + { + return defaultWarehouseDir; + } + + 
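+    // Illustrative value only (not taken from the source), e.g.
+    // hive.metastore.glue.default-warehouse-dir=s3://example-bucket/warehouse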
@Config("hive.metastore.glue.default-warehouse-dir") + @ConfigDescription("Hive Glue metastore default warehouse directory") + public GlueHiveMetastoreConfig setDefaultWarehouseDir(String defaultWarehouseDir) + { + this.defaultWarehouseDir = Optional.ofNullable(defaultWarehouseDir); + return this; + } + + public Optional getIamRole() + { + return iamRole; + } + + @Config("hive.metastore.glue.iam-role") + @ConfigDescription("ARN of an IAM role to assume when connecting to the Hive Glue metastore") + public GlueHiveMetastoreConfig setIamRole(String iamRole) + { + this.iamRole = Optional.ofNullable(iamRole); + return this; + } + + public Optional getAwsAccessKey() + { + return awsAccessKey; + } + + @Config("hive.metastore.glue.aws-access-key") + @ConfigDescription("Hive Glue metastore AWS access key") + public GlueHiveMetastoreConfig setAwsAccessKey(String awsAccessKey) + { + this.awsAccessKey = Optional.ofNullable(awsAccessKey); + return this; + } + + public Optional getAwsSecretKey() + { + return awsSecretKey; + } + + @Config("hive.metastore.glue.aws-secret-key") + @ConfigDescription("Hive Glue metastore AWS secret key") + @ConfigSecuritySensitive + public GlueHiveMetastoreConfig setAwsSecretKey(String awsSecretKey) + { + this.awsSecretKey = Optional.ofNullable(awsSecretKey); + return this; + } + + public Optional getCatalogId() + { + return catalogId; + } + + @Config("hive.metastore.glue.catalogid") + @ConfigDescription("Hive Glue metastore catalog id") + public GlueHiveMetastoreConfig setCatalogId(String catalogId) + { + this.catalogId = Optional.ofNullable(catalogId); + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueMetastoreModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueMetastoreModule.java new file mode 100644 index 00000000..7f383da8 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/GlueMetastoreModule.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.google.inject.Binder; +import com.google.inject.Scopes; +import com.google.inject.multibindings.Multibinder; +import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.prestosql.plugin.hive.ForRecordingHiveMetastore; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.RecordingHiveMetastore; +import io.prestosql.plugin.hive.metastore.WriteHiveMetastoreRecordingProcedure; +import io.prestosql.spi.procedure.Procedure; + +import static com.google.inject.multibindings.Multibinder.newSetBinder; +import static io.airlift.configuration.ConfigBinder.configBinder; +import static org.weakref.jmx.guice.ExportBinder.newExporter; + +public class GlueMetastoreModule + extends AbstractConfigurationAwareModule +{ + @Override + protected void setup(Binder binder) + { + configBinder(binder).bindConfig(GlueHiveMetastoreConfig.class); + + if (buildConfigObject(HiveConfig.class).getRecordingPath() != null) { + binder.bind(HiveMetastore.class) + .annotatedWith(ForRecordingHiveMetastore.class) + .to(GlueHiveMetastore.class) + .in(Scopes.SINGLETON); + binder.bind(GlueHiveMetastore.class).in(Scopes.SINGLETON); + newExporter(binder).export(GlueHiveMetastore.class).withGeneratedName(); + + binder.bind(HiveMetastore.class).to(RecordingHiveMetastore.class).in(Scopes.SINGLETON); + binder.bind(RecordingHiveMetastore.class).in(Scopes.SINGLETON); + newExporter(binder).export(RecordingHiveMetastore.class).withGeneratedName(); + + Multibinder procedures = newSetBinder(binder, Procedure.class); + procedures.addBinding().toProvider(WriteHiveMetastoreRecordingProcedure.class).in(Scopes.SINGLETON); + } + else { + binder.bind(HiveMetastore.class).to(GlueHiveMetastore.class).in(Scopes.SINGLETON); + newExporter(binder).export(HiveMetastore.class) + .as(generator -> generator.generatedNameOf(GlueHiveMetastore.class)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueInputConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueInputConverter.java new file mode 100644 index 00000000..df6647ed --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueInputConverter.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.glue.converter; + +import com.amazonaws.services.glue.model.DatabaseInput; +import com.amazonaws.services.glue.model.PartitionInput; +import com.amazonaws.services.glue.model.SerDeInfo; +import com.amazonaws.services.glue.model.StorageDescriptor; +import com.amazonaws.services.glue.model.TableInput; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; + +import java.util.List; + +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.updateStatisticsParameters; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static java.util.stream.Collectors.toList; + +public final class GlueInputConverter +{ + private GlueInputConverter() {} + + public static DatabaseInput convertDatabase(Database database) + { + DatabaseInput input = new DatabaseInput(); + input.setName(database.getDatabaseName()); + input.setParameters(database.getParameters()); + database.getComment().ifPresent(input::setDescription); + database.getLocation().ifPresent(input::setLocationUri); + return input; + } + + public static TableInput convertTable(Table table) + { + TableInput input = new TableInput(); + input.setName(table.getTableName()); + input.setOwner(table.getOwner()); + input.setTableType(table.getTableType()); + input.setStorageDescriptor(convertStorage(table.getStorage(), table.getDataColumns())); + input.setPartitionKeys(table.getPartitionColumns().stream().map(GlueInputConverter::convertColumn).collect(toList())); + input.setParameters(table.getParameters()); + table.getViewOriginalText().ifPresent(input::setViewOriginalText); + table.getViewExpandedText().ifPresent(input::setViewExpandedText); + return input; + } + + public static PartitionInput convertPartition(PartitionWithStatistics partitionWithStatistics) + { + PartitionInput input = convertPartition(partitionWithStatistics.getPartition()); + PartitionStatistics statistics = partitionWithStatistics.getStatistics(); + if (!statistics.getColumnStatistics().isEmpty()) { + throw new PrestoException(NOT_SUPPORTED, "Glue metastore does not support column level statistics"); + } + input.setParameters(updateStatisticsParameters(input.getParameters(), statistics.getBasicStatistics())); + return input; + } + + public static PartitionInput convertPartition(Partition partition) + { + PartitionInput input = new PartitionInput(); + input.setValues(partition.getValues()); + input.setStorageDescriptor(convertStorage(partition.getStorage(), partition.getColumns())); + input.setParameters(partition.getParameters()); + return input; + } + + private static StorageDescriptor convertStorage(Storage storage, List columns) + { + if (storage.isSkewed()) { + throw new IllegalArgumentException("Writing to skewed table/partition is not supported"); + } + SerDeInfo serdeInfo = new SerDeInfo() + .withSerializationLibrary(storage.getStorageFormat().getSerDeNullable()) + .withParameters(storage.getSerdeParameters()); + + StorageDescriptor sd = new StorageDescriptor(); + sd.setLocation(storage.getLocation()); + 
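+        // Editorial comment: the setters below copy column metadata, SerDe information and the
+        // input/output formats from the openLooKeng Storage model into the Glue StorageDescriptor;
+        // bucketing details are filled in afterwards only when a bucket property is present.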
sd.setColumns(columns.stream().map(GlueInputConverter::convertColumn).collect(toList())); + sd.setSerdeInfo(serdeInfo); + sd.setInputFormat(storage.getStorageFormat().getInputFormatNullable()); + sd.setOutputFormat(storage.getStorageFormat().getOutputFormatNullable()); + sd.setParameters(ImmutableMap.of()); + + if (storage.getBucketProperty().isPresent()) { + sd.setNumberOfBuckets(storage.getBucketProperty().get().getBucketCount()); + sd.setBucketColumns(storage.getBucketProperty().get().getBucketedBy()); + } + + return sd; + } + + private static com.amazonaws.services.glue.model.Column convertColumn(Column prestoColumn) + { + return new com.amazonaws.services.glue.model.Column() + .withName(prestoColumn.getName()) + .withType(prestoColumn.getType().toString()) + .withComment(prestoColumn.getComment().orElse(null)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueToPrestoConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueToPrestoConverter.java new file mode 100644 index 00000000..a18aff19 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/glue/converter/GlueToPrestoConverter.java @@ -0,0 +1,153 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.glue.converter; + +import com.amazonaws.services.glue.model.SerDeInfo; +import com.amazonaws.services.glue.model.StorageDescriptor; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBucketProperty; +import io.prestosql.plugin.hive.HiveBucketing; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.security.PrincipalType; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.firstNonNull; +import static com.google.common.base.Strings.nullToEmpty; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public final class GlueToPrestoConverter +{ + private static final String PUBLIC_OWNER = "PUBLIC"; + + private GlueToPrestoConverter() + { + } + + public static Database convertDatabase(com.amazonaws.services.glue.model.Database glueDb) + { + return Database.builder() + .setDatabaseName(glueDb.getName()) + .setLocation(Optional.ofNullable(glueDb.getLocationUri())) + .setComment(Optional.ofNullable(glueDb.getDescription())) + .setParameters(firstNonNull(glueDb.getParameters(), ImmutableMap.of())) + .setOwnerName(PUBLIC_OWNER) + .setOwnerType(PrincipalType.ROLE) + .build(); + } + + public static Table convertTable(com.amazonaws.services.glue.model.Table glueTable, String dbName) + { + requireNonNull(glueTable.getStorageDescriptor(), "Table StorageDescriptor is null"); + StorageDescriptor sd = glueTable.getStorageDescriptor(); + + Table.Builder tableBuilder = Table.builder() + .setDatabaseName(dbName) + .setTableName(glueTable.getName()) + .setOwner(nullToEmpty(glueTable.getOwner())) + .setTableType(glueTable.getTableType()) + .setDataColumns(sd.getColumns().stream() + .map(GlueToPrestoConverter::convertColumn) + .collect(toList())) + .setParameters(firstNonNull(glueTable.getParameters(), ImmutableMap.of())) + .setViewOriginalText(Optional.ofNullable(glueTable.getViewOriginalText())) + .setViewExpandedText(Optional.ofNullable(glueTable.getViewExpandedText())); + + if (glueTable.getPartitionKeys() != null) { + tableBuilder.setPartitionColumns(glueTable.getPartitionKeys().stream() + .map(GlueToPrestoConverter::convertColumn) + .collect(toList())); + } + else { + tableBuilder.setPartitionColumns(new ArrayList<>()); + } + + setStorageBuilder(sd, tableBuilder.getStorageBuilder()); + return tableBuilder.build(); + } + + private static void setStorageBuilder(StorageDescriptor sd, Storage.Builder storageBuilder) + { + requireNonNull(sd.getSerdeInfo(), "StorageDescriptor SerDeInfo is null"); + SerDeInfo serdeInfo = sd.getSerdeInfo(); + + Optional bucketProperty = Optional.empty(); + if (sd.getNumberOfBuckets() > 0) { + if (isNullOrEmpty(sd.getBucketColumns())) { + throw new 
PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table/partition metadata has 'numBuckets' set, but 'bucketCols' is not set"); + } + List sortedBy = ImmutableList.of(); + if (!isNullOrEmpty(sd.getSortColumns())) { + sortedBy = sd.getSortColumns().stream() + .map(column -> new SortingColumn( + column.getColumn(), + SortingColumn.Order.fromMetastoreApiOrder(column.getSortOrder(), "unknown"))) + .collect(toImmutableList()); + } + BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(sd.getParameters()); + bucketProperty = Optional.of(new HiveBucketProperty(sd.getBucketColumns(), bucketingVersion, sd.getNumberOfBuckets(), sortedBy)); + } + + storageBuilder.setStorageFormat(StorageFormat.createNullable(serdeInfo.getSerializationLibrary(), sd.getInputFormat(), sd.getOutputFormat())) + .setLocation(nullToEmpty(sd.getLocation())) + .setBucketProperty(bucketProperty) + .setSkewed(sd.getSkewedInfo() != null && !isNullOrEmpty(sd.getSkewedInfo().getSkewedColumnNames())) + .setSerdeParameters(firstNonNull(serdeInfo.getParameters(), ImmutableMap.of())) + .build(); + } + + private static Column convertColumn(com.amazonaws.services.glue.model.Column glueColumn) + { + return new Column(glueColumn.getName(), HiveType.valueOf(glueColumn.getType().toLowerCase(Locale.ENGLISH)), Optional.ofNullable(glueColumn.getComment())); + } + + public static Partition convertPartition(com.amazonaws.services.glue.model.Partition gluePartition) + { + requireNonNull(gluePartition.getStorageDescriptor(), "Partition StorageDescriptor is null"); + StorageDescriptor sd = gluePartition.getStorageDescriptor(); + + Partition.Builder partitionBuilder = Partition.builder() + .setDatabaseName(gluePartition.getDatabaseName()) + .setTableName(gluePartition.getTableName()) + .setValues(gluePartition.getValues()) + .setColumns(sd.getColumns().stream() + .map(GlueToPrestoConverter::convertColumn) + .collect(toList())) + .setParameters(firstNonNull(gluePartition.getParameters(), ImmutableMap.of())); + + setStorageBuilder(sd, partitionBuilder.getStorageBuilder()); + return partitionBuilder.build(); + } + + private static boolean isNullOrEmpty(List list) + { + return list == null || list.isEmpty(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java new file mode 100644 index 00000000..b8bd673a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/BridgingHiveMetastore.java @@ -0,0 +1,486 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveUtil; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.MetastoreUtil; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaNotFoundException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; + +import javax.inject.Inject; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveMetadata.TABLE_COMMENT; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.toMetastoreApiTable; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static java.util.Objects.requireNonNull; +import static java.util.function.UnaryOperator.identity; + +public class BridgingHiveMetastore + implements HiveMetastore +{ + private final ThriftMetastore delegate; + + @Inject + public BridgingHiveMetastore(ThriftMetastore delegate) + { + this.delegate = delegate; + } + + @Override + public Optional getDatabase(String databaseName) + { + return delegate.getDatabase(databaseName).map(ThriftMetastoreUtil::fromMetastoreApiDatabase); + } + + @Override + public List getAllDatabases() + { + return delegate.getAllDatabases(); + } + + @Override + public Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + return delegate.getTable(identity, databaseName, tableName).map(table -> { + if (ThriftMetastoreUtil.isAvroTableWithSchemaSet(table) || ThriftMetastoreUtil.isCsvTable(table)) { + return ThriftMetastoreUtil.fromMetastoreApiTable(table, delegate.getFields(identity, databaseName, tableName).get()); + } + return ThriftMetastoreUtil.fromMetastoreApiTable(table); + }); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + return delegate.getSupportedColumnStatistics(type); + } + + @Override + public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + return delegate.getTableStatistics(identity, toMetastoreApiTable(table)); + } + + @Override + public Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + return delegate.getPartitionStatistics(identity, + toMetastoreApiTable(table), + partitions.stream() + .map(ThriftMetastoreUtil::toMetastoreApiPartition) + .collect(toImmutableList())); + } + + @Override + public void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + delegate.updateTableStatistics(identity, databaseName, tableName, update); + } + + @Override + public void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + delegate.updatePartitionStatistics(identity, databaseName, tableName, partitionName, update); + } + + @Override + public void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + delegate.updatePartitionsStatistics(identity, databaseName, tableName, partNamesUpdateFunctionMap); + } + + @Override + public Optional> getAllTables(String databaseName) + { + return delegate.getAllTables(databaseName); + } + + @Override + public Optional> getAllViews(String databaseName) + { + return delegate.getAllViews(databaseName); + } + + @Override + public void createDatabase(HiveIdentity identity, Database database) + { + delegate.createDatabase(identity, ThriftMetastoreUtil.toMetastoreApiDatabase(database)); + } + + @Override + public void dropDatabase(HiveIdentity identity, String databaseName) + { + delegate.dropDatabase(identity, databaseName); + } + + @Override + public void renameDatabase(HiveIdentity identity, String databaseName, String newDatabaseName) + { + org.apache.hadoop.hive.metastore.api.Database database = delegate.getDatabase(databaseName) + .orElseThrow(() -> new SchemaNotFoundException(databaseName)); + database.setName(newDatabaseName); + delegate.alterDatabase(identity, databaseName, database); + + delegate.getDatabase(databaseName).ifPresent(newDatabase -> { + if (newDatabase.getName().equals(databaseName)) { + throw new PrestoException(NOT_SUPPORTED, "Hive metastore does not support renaming schemas"); + } + }); + } + + @Override + public void createTable(HiveIdentity identity, Table table, PrincipalPrivileges principalPrivileges) + { + delegate.createTable(identity, ThriftMetastoreUtil.toMetastoreApiTable(table, principalPrivileges)); + } + + @Override + public void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + delegate.dropTable(identity, databaseName, tableName, deleteData); + } + + @Override + public void replaceTable(HiveIdentity identity, String databaseName, String tableName, Table newTable, PrincipalPrivileges principalPrivileges) + { + 
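+        // Editorial comment: replaceTable is delegated as an alterTable call on the Thrift
+        // metastore, after converting the table and its principal privileges to the metastore
+        // API representation.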
alterTable(identity, databaseName, tableName, ThriftMetastoreUtil.toMetastoreApiTable(newTable, principalPrivileges)); + } + + @Override + public void renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName) + { + Optional source = delegate.getTable(identity, databaseName, tableName); + if (!source.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + org.apache.hadoop.hive.metastore.api.Table table = source.get(); + table.setDbName(newDatabaseName); + table.setTableName(newTableName); + alterTable(identity, databaseName, tableName, table); + } + + @Override + public void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment) + { + Optional source = delegate.getTable(identity, databaseName, tableName); + if (!source.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + org.apache.hadoop.hive.metastore.api.Table table = source.get(); + + Map parameters = table.getParameters().entrySet().stream() + .filter(entry -> !entry.getKey().equals(TABLE_COMMENT)) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + comment.ifPresent(value -> parameters.put(TABLE_COMMENT, comment.get())); + + table.setParameters(parameters); + alterTable(identity, databaseName, tableName, table); + } + + @Override + public void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment) + { + Optional source = delegate.getTable(identity, databaseName, tableName); + if (!source.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + org.apache.hadoop.hive.metastore.api.Table table = source.get(); + table.getSd().getCols().add( + new FieldSchema(columnName, columnType.getHiveTypeName().toString(), columnComment)); + alterTable(identity, databaseName, tableName, table); + } + + @Override + public void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName) + { + Optional source = delegate.getTable(identity, databaseName, tableName); + if (!source.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + org.apache.hadoop.hive.metastore.api.Table table = source.get(); + for (FieldSchema fieldSchema : table.getPartitionKeys()) { + if (fieldSchema.getName().equals(oldColumnName)) { + throw new PrestoException(NOT_SUPPORTED, "Renaming partition columns is not supported"); + } + } + for (FieldSchema fieldSchema : table.getSd().getCols()) { + if (fieldSchema.getName().equals(oldColumnName)) { + fieldSchema.setName(newColumnName); + } + } + alterTable(identity, databaseName, tableName, table); + } + + @Override + public void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + MetastoreUtil.verifyCanDropColumn(this, identity, databaseName, tableName, columnName); + org.apache.hadoop.hive.metastore.api.Table table = delegate.getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + table.getSd().getCols().removeIf(fieldSchema -> fieldSchema.getName().equals(columnName)); + alterTable(identity, databaseName, tableName, table); + } + + private void alterTable(HiveIdentity identity, String databaseName, String tableName, org.apache.hadoop.hive.metastore.api.Table 
table) + { + delegate.alterTable(identity, databaseName, tableName, table); + } + + @Override + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + return delegate.getPartition(identity, databaseName, tableName, partitionValues).map(ThriftMetastoreUtil::fromMetastoreApiPartition); + } + + @Override + public Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + return delegate.getPartitionNames(identity, databaseName, tableName); + } + + @Override + public Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + return delegate.getPartitionNamesByParts(identity, databaseName, tableName, parts); + } + + @Override + public Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + requireNonNull(partitionNames, "partitionNames is null"); + if (partitionNames.isEmpty()) { + return ImmutableMap.of(); + } + + Function fromMetastoreApiPartition = ThriftMetastoreUtil::fromMetastoreApiPartition; + boolean isAvroTableWithSchemaSet = delegate.getTable(identity, databaseName, tableName) + .map(ThriftMetastoreUtil::isAvroTableWithSchemaSet) + .orElse(false); + if (isAvroTableWithSchemaSet) { + List schema = delegate.getFields(identity, databaseName, tableName).get(); + fromMetastoreApiPartition = partition -> ThriftMetastoreUtil.fromMetastoreApiPartition(partition, schema); + } + + Map> partitionNameToPartitionValuesMap = partitionNames.stream() + .collect(Collectors.toMap(identity(), HiveUtil::toPartitionValues)); + Map, Partition> partitionValuesToPartitionMap = delegate.getPartitionsByNames(identity, databaseName, tableName, partitionNames).stream() + .map(fromMetastoreApiPartition) + .collect(Collectors.toMap(Partition::getValues, identity())); + ImmutableMap.Builder> resultBuilder = ImmutableMap.builder(); + for (Map.Entry> entry : partitionNameToPartitionValuesMap.entrySet()) { + Partition partition = partitionValuesToPartitionMap.get(entry.getValue()); + resultBuilder.put(entry.getKey(), Optional.ofNullable(partition)); + } + return resultBuilder.build(); + } + + @Override + public void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + delegate.addPartitions(identity, databaseName, tableName, partitions); + } + + @Override + public void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + delegate.dropPartition(identity, databaseName, tableName, parts, deleteData); + } + + @Override + public void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition) + { + delegate.alterPartition(identity, databaseName, tableName, partition); + } + + @Override + public void createRole(String role, String grantor) + { + delegate.createRole(role, grantor); + } + + @Override + public void dropRole(String role) + { + delegate.dropRole(role); + } + + @Override + public Set listRoles() + { + return delegate.listRoles(); + } + + @Override + public void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + delegate.grantRoles(roles, grantees, withAdminOption, grantor); + } + + @Override + public void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + delegate.revokeRoles(roles, grantees, adminOptionFor, grantor); + } + + @Override + public Set 
listRoleGrants(HivePrincipal principal) + { + return delegate.listRoleGrants(principal); + } + + @Override + public void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + delegate.grantTablePrivileges(databaseName, tableName, grantee, privileges); + } + + @Override + public void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + delegate.revokeTablePrivileges(databaseName, tableName, grantee, privileges); + } + + @Override + public Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + return delegate.listTablePrivileges(databaseName, tableName, principal); + } + + @Override + public Optional getConfigValue(String name) + { + return delegate.getConfigValue(name); + } + + @Override + public long openTransaction(HiveIdentity identity) + { + return delegate.openTransaction(identity); + } + + @Override + public void commitTransaction(HiveIdentity identity, long transactionId) + { + delegate.commitTransaction(identity, transactionId); + } + + @Override + public void abortTransaction(HiveIdentity identity, long transactionId) + { + delegate.abortTransaction(identity, transactionId); + } + + @Override + public void sendTransactionHeartbeat(HiveIdentity identity, long transactionId) + { + delegate.sendTransactionHeartbeat(identity, transactionId); + } + + @Override + public void acquireSharedReadLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions) + { + delegate.acquireSharedReadLock(identity, queryId, transactionId, fullTables, partitions); + } + + @Override + public void acquireLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions, DataOperationType operationType) + { + delegate.acquireLock(identity, queryId, transactionId, fullTables, partitions, operationType); + } + + @Override + public String getValidWriteIds(HiveIdentity identity, List tables, long currentTransactionId, boolean isVacuum) + { + return delegate.getValidWriteIds(identity, tables, currentTransactionId, isVacuum); + } + + @Override + public long getTableWriteId(String dbName, String tableName, long transactionId) + { + return delegate.getTableWriteId(dbName, tableName, transactionId); + } + + @Override + public ShowLocksResponse showLocks(ShowLocksRequest rqst) + { + return delegate.showLocks(rqst); + } + + /** + * list all privileges of principal to the column + * + * @param databaseName + * @param tableName + * @param columnName + * @param principal + * @return HivePrivilegeInfo set + */ + @Override + public Set listColumnPrivileges(String databaseName, String tableName, String columnName, + HivePrincipal principal) + { + return delegate.listColumnPrivileges(databaseName, tableName, columnName, principal); + } + + /** + * list all privileges of principal to the db + * + * @param databaseName + * @param principal + * @return HivePrivilegeInfo set + */ + @Override + public Set listSchemaPrivileges(String databaseName, + String tableName, + HivePrincipal principal) + { + return delegate.listSchemaPrivileges(databaseName, tableName, principal); + } + + @Override + public boolean isImpersonationEnabled() + { + return delegate.isImpersonationEnabled(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/MetastoreLocator.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/MetastoreLocator.java new file mode 100644 index 00000000..6e091c78 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/MetastoreLocator.java @@ -0,0 +1,25 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.thrift; + +import org.apache.thrift.TException; + +public interface MetastoreLocator +{ + /** + * Create a connected {@link ThriftMetastoreClient} + */ + ThriftMetastoreClient createMetastoreClient() + throws TException; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreConfig.java new file mode 100644 index 00000000..f58c368f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreConfig.java @@ -0,0 +1,71 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import io.prestosql.spi.function.Mandatory; + +import javax.validation.constraints.NotNull; + +import java.net.URI; +import java.util.List; + +import static com.google.common.collect.Iterables.transform; + +public class StaticMetastoreConfig +{ + private static final Splitter SPLITTER = Splitter.on(',').trimResults().omitEmptyStrings(); + + private List metastoreUris; + private String metastoreUsername; + + @NotNull + public List getMetastoreUris() + { + return metastoreUris; + } + + @Mandatory(name = "hive.metastore.uri", + description = "The URI(s) of the Hive metastore to connect to using the Thrift protocol. If multiple URIs are provided, the first URI is used by default and the rest of the URIs are fallback metastores. This property is required. 
Example: thrift://192.0.2.3:9083 or thrift://192.0.2.3:9083,thrift://192.0.2.4:9083", + defaultValue = "thrift://host:port", + required = true) + @Config("hive.metastore.uri") + @ConfigDescription("Hive metastore URIs (comma separated)") + public StaticMetastoreConfig setMetastoreUris(String uris) + { + if (uris == null) { + this.metastoreUris = null; + return this; + } + + this.metastoreUris = ImmutableList.copyOf(transform(SPLITTER.split(uris), URI::create)); + return this; + } + + public String getMetastoreUsername() + { + return metastoreUsername; + } + + @Config("hive.metastore.username") + @ConfigDescription("Optional username for accessing the Hive metastore") + public StaticMetastoreConfig setMetastoreUsername(String metastoreUsername) + { + this.metastoreUsername = metastoreUsername; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreLocator.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreLocator.java new file mode 100644 index 00000000..ae52fc20 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/StaticMetastoreLocator.java @@ -0,0 +1,96 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.net.HostAndPort; +import io.prestosql.plugin.hive.metastore.MetastoreClientFactory; +import org.apache.thrift.TException; + +import javax.inject.Inject; + +import java.net.URI; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.isNullOrEmpty; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class StaticMetastoreLocator + implements MetastoreLocator +{ + private final List addresses; + private final MetastoreClientFactory clientFactory; + private final String metastoreUsername; + private AtomicInteger nextIndex = new AtomicInteger(0); + + @Inject + public StaticMetastoreLocator(StaticMetastoreConfig config, MetastoreClientFactory clientFactory) + { + this(config.getMetastoreUris(), config.getMetastoreUsername(), clientFactory); + } + + public StaticMetastoreLocator(List metastoreUris, String metastoreUsername, MetastoreClientFactory clientFactory) + { + requireNonNull(metastoreUris, "metastoreUris is null"); + checkArgument(!metastoreUris.isEmpty(), "metastoreUris must specify at least one URI"); + this.addresses = metastoreUris.stream() + .map(StaticMetastoreLocator::checkMetastoreUri) + .map(uri -> HostAndPort.fromParts(uri.getHost(), uri.getPort())) + .collect(toList()); + Collections.shuffle(addresses); + this.metastoreUsername = metastoreUsername; + this.clientFactory = requireNonNull(clientFactory, "clientFactory is null"); + } + + /** + * Create a metastore client connected to the Hive metastore. + *

+ * Connects to all metastores in round-roubin order + */ + @Override + public ThriftMetastoreClient createMetastoreClient() + throws TException + { + TException lastException = null; + for (int i = 0; i < addresses.size(); i++) { + int next = nextIndex.updateAndGet((current -> (current + 1) % addresses.size())); + HostAndPort metastore = addresses.get(next); + try { + ThriftMetastoreClient client = clientFactory.create(metastore); + if (!isNullOrEmpty(metastoreUsername)) { + client.setUGI(metastoreUsername); + } + return client; + } + catch (TException e) { + lastException = e; + } + } + throw new TException("Failed connecting to Hive metastore: " + addresses, lastException); + } + + private static URI checkMetastoreUri(URI uri) + { + requireNonNull(uri, "metastoreUri is null"); + String scheme = uri.getScheme(); + checkArgument(!isNullOrEmpty(scheme), "metastoreUri scheme is missing: %s", uri); + checkArgument(scheme.equals("thrift"), "metastoreUri scheme must be thrift: %s", uri); + checkArgument(uri.getHost() != null, "metastoreUri host is missing: %s", uri); + checkArgument(uri.getPort() != -1, "metastoreUri port is missing: %s", uri); + return uri; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftConstants.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftConstants.java new file mode 100644 index 00000000..ee3e0bb0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftConstants.java @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class ThriftConstants +{ + /** + * White list for MetastoreClientFactory implementation's class name + */ + public static final List WHITE_LIST_FOR_METASTORECLIENTFACTORY_CLASS = Collections.unmodifiableList(new ArrayList() { + { + this.add("io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClientFactory"); + this.add(ThriftMetastoreClientFactory.class.getName()); + } + }); + + /** + * White list for ThriftMetastore implementation's class name + */ + public static final List WHITE_LIST_FOR_THRIFTMETASTORE_CLASS = Collections.unmodifiableList(new ArrayList() { + { + this.add("io.prestosql.plugin.hive.metastore.thrift.InMemoryThriftMetastore"); + this.add(ThriftHiveMetastore.class.getName()); + } + }); + + private ThriftConstants() + { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastore.java new file mode 100644 index 00000000..d8b9950d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastore.java @@ -0,0 +1,1897 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import io.airlift.log.Logger; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveViewNotSupportedException; +import io.prestosql.plugin.hive.PartitionNotFoundException; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.util.RetryDriver; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaAlreadyExistsException; +import io.prestosql.spi.connector.SchemaNotFoundException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableAlreadyExistsException; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.metastore.LockComponentBuilder; +import org.apache.hadoop.hive.metastore.LockRequestBuilder; +import org.apache.hadoop.hive.metastore.api.AlreadyExistsException; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.ConfigValSecurityException; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; +import org.apache.hadoop.hive.metastore.api.HiveObjectRef; +import org.apache.hadoop.hive.metastore.api.InvalidInputException; +import org.apache.hadoop.hive.metastore.api.InvalidObjectException; +import org.apache.hadoop.hive.metastore.api.InvalidOperationException; +import org.apache.hadoop.hive.metastore.api.LockComponent; +import org.apache.hadoop.hive.metastore.api.LockRequest; +import org.apache.hadoop.hive.metastore.api.LockResponse; +import org.apache.hadoop.hive.metastore.api.LockState; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.NoSuchLockException; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.NoSuchTxnException; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.PrincipalType; +import org.apache.hadoop.hive.metastore.api.PrivilegeBag; +import org.apache.hadoop.hive.metastore.api.PrivilegeGrantInfo; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.api.TxnAbortedException; +import org.apache.hadoop.hive.metastore.api.UnknownDBException; +import org.apache.hadoop.hive.metastore.api.UnknownTableException; +import 
org.apache.thrift.TException; +import org.weakref.jmx.Flatten; +import org.weakref.jmx.Managed; + +import javax.annotation.concurrent.ThreadSafe; +import javax.inject.Inject; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Throwables.propagateIfPossible; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static com.google.common.base.Verify.verifyNotNull; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.collect.Sets.difference; +import static io.prestosql.plugin.hive.HiveUtil.PRESTO_VIEW_FLAG; +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.lang.String.format; +import static java.lang.System.nanoTime; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toSet; +import static org.apache.hadoop.hive.common.FileUtils.makePartName; +import static org.apache.hadoop.hive.metastore.api.HiveObjectType.TABLE; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.HIVE_FILTER_FIELD_PARAMS; + +@ThreadSafe +public class ThriftHiveMetastore + implements ThriftMetastore +{ + private static final Logger log = Logger.get(ThriftHiveMetastore.class); + + private final ThriftMetastoreStats stats = new ThriftMetastoreStats(); + private final MetastoreLocator clientProvider; + private final double backoffScaleFactor; + private final Duration minBackoffDelay; + private final Duration maxBackoffDelay; + private final Duration maxRetryTime; + private final Duration maxWaitForLock; + private final int maxRetries; + private final boolean isRoleNameCaseSensitive; + private final boolean impersonationEnabled; + + private final AtomicInteger chosenGetTableAlternative = new AtomicInteger(Integer.MAX_VALUE); + private volatile boolean metastoreKnownToSupportTableParamEqualsPredicate; + private volatile boolean metastoreKnownToSupportTableParamLikePredicate; + + @Inject + public ThriftHiveMetastore(MetastoreLocator metastoreLocator, ThriftHiveMetastoreConfig thriftConfig) + { + this.clientProvider = requireNonNull(metastoreLocator, "metastoreLocator is null"); + this.backoffScaleFactor = thriftConfig.getBackoffScaleFactor(); + this.minBackoffDelay = thriftConfig.getMinBackoffDelay(); + this.maxBackoffDelay = thriftConfig.getMaxBackoffDelay(); + this.maxRetryTime = thriftConfig.getMaxRetryTime(); + this.maxRetries = thriftConfig.getMaxRetries(); + this.maxWaitForLock = thriftConfig.getMaxWaitForTransactionLock(); + this.isRoleNameCaseSensitive = thriftConfig.isRoleNameCaseSensitive(); + this.impersonationEnabled = thriftConfig.isImpersonationEnabled(); + } + + @Managed + @Flatten + public ThriftMetastoreStats getStats() + { + return stats; + } + + @Override + public List 
getAllDatabases() + { + try { + return retry() + .stopOnIllegalExceptions() + .run("getAllDatabases", stats.getGetAllDatabases().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + return client.getAllDatabases(); + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Optional getDatabase(String databaseName) + { + try { + return retry() + .stopOn(NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("getDatabase", stats.getGetDatabase().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + return Optional.of(client.getDatabase(databaseName)); + } + })); + } + catch (NoSuchObjectException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Optional> getAllTables(String databaseName) + { + Callable> getAllTables = stats.getGetAllTables().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + return client.getAllTables(databaseName); + } + }); + + Callable getDatabase = stats.getGetDatabase().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + client.getDatabase(databaseName); + return null; + } + }); + + try { + return retry() + .stopOn(NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("getAllTables", () -> { + List tables = getAllTables.call(); + if (tables.isEmpty()) { + // Check to see if the database exists + getDatabase.call(); + } + return Optional.of(tables); + }); + } + catch (NoSuchObjectException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Optional

getTable(HiveIdentity identity, String databaseName, String tableName) + { + try { + return retry() + .stopOn(NoSuchObjectException.class, HiveViewNotSupportedException.class) + .stopOnIllegalExceptions() + .run("getTable", stats.getGetTable().wrap(() -> { + Table table = getTableFromMetastore(identity, databaseName, tableName); + // we need delete view type check + return Optional.of(table); + })); + } + catch (NoSuchObjectException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private Table getTableFromMetastore(HiveIdentity identity, String databaseName, String tableName) + throws TException + { + return alternativeCall( + () -> createMetastoreClient(identity), + chosenGetTableAlternative, + client -> client.getTable(databaseName, tableName), + client -> client.getTableWithCapabilities(databaseName, tableName)); + } + + private Table getTableFromMetastore(String databaseName, String tableName) + throws TException + { + return alternativeCall( + () -> createMetastoreClient(), + chosenGetTableAlternative, + client -> client.getTable(databaseName, tableName), + client -> client.getTableWithCapabilities(databaseName, tableName)); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + return ThriftMetastoreUtil.getSupportedColumnStatistics(type); + } + + private static boolean isPrestoView(Table table) + { + return "true".equals(table.getParameters().get(PRESTO_VIEW_FLAG)); + } + + @Override + public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + List dataColumns = table.getSd().getCols().stream() + .map(FieldSchema::getName) + .collect(toImmutableList()); + HiveBasicStatistics basicStatistics = ThriftMetastoreUtil.getHiveBasicStatistics(table.getParameters()); + Map columnStatistics = getTableColumnStatistics(identity, table.getDbName(), table.getTableName(), + dataColumns, basicStatistics.getRowCount()); + return new PartitionStatistics(basicStatistics, columnStatistics); + } + + private Map getTableColumnStatistics(HiveIdentity identity, String databaseName, String tableName, List columns, OptionalLong rowCount) + { + try { + return retry() + .stopOn(NoSuchObjectException.class, HiveViewNotSupportedException.class) + .stopOnIllegalExceptions() + .run("getTableColumnStatistics", stats.getGetTableColumnStatistics().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + return groupStatisticsByColumn(client.getTableColumnStatistics(databaseName, tableName, columns), rowCount); + } + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + List dataColumns = table.getSd().getCols().stream() + .map(FieldSchema::getName) + .collect(toImmutableList()); + List partitionColumns = table.getPartitionKeys().stream() + .map(FieldSchema::getName) + .collect(toImmutableList()); + + Map partitionBasicStatistics = partitions.stream() + .collect(toImmutableMap( + partition -> makePartName(partitionColumns, partition.getValues()), + partition -> ThriftMetastoreUtil.getHiveBasicStatistics(partition.getParameters()))); + Map partitionRowCounts = 
partitionBasicStatistics.entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, entry -> entry.getValue().getRowCount())); + Map> partitionColumnStatistics = getPartitionColumnStatistics( + identity, + table.getDbName(), + table.getTableName(), + partitionBasicStatistics.keySet(), + dataColumns, + partitionRowCounts); + ImmutableMap.Builder result = ImmutableMap.builder(); + for (String partitionName : partitionBasicStatistics.keySet()) { + HiveBasicStatistics basicStatistics = partitionBasicStatistics.get(partitionName); + Map columnStatistics = partitionColumnStatistics.getOrDefault(partitionName, ImmutableMap.of()); + result.put(partitionName, new PartitionStatistics(basicStatistics, columnStatistics)); + } + + return result.build(); + } + + @Override + public Optional> getFields(HiveIdentity identity, String databaseName, String tableName) + { + try { + return retry() + .stopOn(MetaException.class, UnknownTableException.class, UnknownDBException.class) + .stopOnIllegalExceptions() + .run("getFields", stats.getGetFields().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient()) { + return Optional.of(ImmutableList.copyOf(client.getFields(databaseName, tableName))); + } + })); + } + catch (NoSuchObjectException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private Map> getPartitionColumnStatistics( + HiveIdentity identity, + String databaseName, + String tableName, + Set partitionNames, + List columnNames, + Map partitionRowCounts) + { + return getMetastorePartitionColumnStatistics(identity, databaseName, tableName, partitionNames, columnNames).entrySet().stream() + .filter(entry -> !entry.getValue().isEmpty()) + .collect(toImmutableMap( + Map.Entry::getKey, + entry -> groupStatisticsByColumn(entry.getValue(), partitionRowCounts.getOrDefault(entry.getKey(), OptionalLong.empty())))); + } + + private Map> getMetastorePartitionColumnStatistics(HiveIdentity identity, String databaseName, String tableName, Set partitionNames, List columnNames) + { + try { + return retry() + .stopOn(NoSuchObjectException.class, HiveViewNotSupportedException.class) + .stopOnIllegalExceptions() + .run("getPartitionColumnStatistics", stats.getGetPartitionColumnStatistics().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + return client.getPartitionColumnStatistics(databaseName, tableName, ImmutableList.copyOf(partitionNames), columnNames); + } + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private Map groupStatisticsByColumn(List statistics, OptionalLong rowCount) + { + return statistics.stream() + .collect(toImmutableMap(ColumnStatisticsObj::getColName, statisticsObj -> ThriftMetastoreUtil.fromMetastoreApiColumnStatistics(statisticsObj, rowCount))); + } + + @Override + public void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + Table originalTable = getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + PartitionStatistics currentStatistics = getTableStatistics(identity, originalTable); + PartitionStatistics 
updatedStatistics = update.apply(currentStatistics); + + Table modifiedTable = originalTable.deepCopy(); + HiveBasicStatistics basicStatistics = updatedStatistics.getBasicStatistics(); + modifiedTable.setParameters(ThriftMetastoreUtil.updateStatisticsParameters(modifiedTable.getParameters(), basicStatistics)); + alterTable(identity, databaseName, tableName, modifiedTable); + + io.prestosql.plugin.hive.metastore.Table table = ThriftMetastoreUtil.fromMetastoreApiTable(modifiedTable); + OptionalLong rowCount = basicStatistics.getRowCount(); + List metastoreColumnStatistics = updatedStatistics.getColumnStatistics().entrySet().stream() + .map(entry -> ThriftMetastoreUtil.createMetastoreColumnStatistics(entry.getKey(), table.getColumn(entry.getKey()).get().getType(), entry.getValue(), rowCount)) + .collect(toImmutableList()); + if (!metastoreColumnStatistics.isEmpty()) { + setTableColumnStatistics(identity, databaseName, tableName, metastoreColumnStatistics); + } + Set removedColumnStatistics = difference(currentStatistics.getColumnStatistics().keySet(), updatedStatistics.getColumnStatistics().keySet()); + removedColumnStatistics.forEach(column -> deleteTableColumnStatistics(identity, databaseName, tableName, column)); + } + + private void setTableColumnStatistics(HiveIdentity identity, String databaseName, String tableName, List statistics) + { + try { + retry() + .stopOn(NoSuchObjectException.class, InvalidObjectException.class, MetaException.class, InvalidInputException.class) + .stopOnIllegalExceptions() + .run("setTableColumnStatistics", stats.getCreateDatabase().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.setTableColumnStatistics(databaseName, tableName, statistics); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private void deleteTableColumnStatistics(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + try { + retry() + .stopOn(NoSuchObjectException.class, InvalidObjectException.class, MetaException.class, InvalidInputException.class) + .stopOnIllegalExceptions() + .run("deleteTableColumnStatistics", stats.getCreateDatabase().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.deleteTableColumnStatistics(databaseName, tableName, columnName); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + List partitions = getPartitionsByNames(identity, databaseName, tableName, ImmutableList.of(partitionName)); + if (partitions.size() != 1) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Metastore returned multiple partitions for name: " + partitionName); + } + + Table table = getTable(identity, databaseName, tableName) + .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName))); + PartitionStatistics currentStatistics = requireNonNull( + 
getPartitionStatistics(identity, table, partitions).get(partitionName), "getPartitionStatistics() returned null"); + PartitionStatistics updatedStatistics = update.apply(currentStatistics); + + Partition originalPartition = getOnlyElement(partitions); + Partition modifiedPartition = originalPartition.deepCopy(); + HiveBasicStatistics basicStatistics = updatedStatistics.getBasicStatistics(); + modifiedPartition.setParameters(ThriftMetastoreUtil.updateStatisticsParameters(modifiedPartition.getParameters(), basicStatistics)); + alterPartitionWithoutStatistics(identity, databaseName, tableName, modifiedPartition); + + updatePartitionColumnStatistics(identity, modifiedPartition, databaseName, tableName, partitionName, basicStatistics, currentStatistics, updatedStatistics); + } + + @Override + public synchronized void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + ImmutableList.Builder modifiedPartitionBuilder = ImmutableList.builder(); + ImmutableMap.Builder partitionInfoMapBuilder = ImmutableMap.builder(); + Optional
table = getTable(identity, databaseName, tableName); + + List partitions = getPartitionsByNames(identity, databaseName, tableName, partNamesUpdateFunctionMap.keySet().stream().collect(Collectors.toList())); + Map partitionsStatistics = getPartitionStatistics(identity, table.get(), partitions); + + if (partitions.size() != partitionsStatistics.size() || partitions.size() != partNamesUpdateFunctionMap.size()) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Metastore returned multiple partitions"); + } + List partColumns = table.get().getPartitionKeys().stream() + .map(FieldSchema::getName) + .collect(toImmutableList()); + for (int index = 0; index < partitions.size(); index++) { + String partitionName = makePartName(partColumns, partitions.get(index).getValues()); + PartitionStatistics currentStatistics = requireNonNull(partitionsStatistics.get(partitionName), + "getPartitionStatistics() returned null"); + + PartitionStatistics updatedStatistics = partNamesUpdateFunctionMap.get(partitionName).apply(currentStatistics); + + Partition originalPartition = partitions.get(index); + Partition modifiedPartition = originalPartition.deepCopy(); + HiveBasicStatistics basicStatistics = updatedStatistics.getBasicStatistics(); + modifiedPartition.setParameters(ThriftMetastoreUtil.updateStatisticsParameters(modifiedPartition.getParameters(), basicStatistics)); + originalPartition.setParameters(ThriftMetastoreUtil.updateStatisticsParameters(originalPartition.getParameters(), basicStatistics)); + + modifiedPartitionBuilder.add(modifiedPartition); + partitionInfoMapBuilder.put(partitionName, new PartitionInfo(basicStatistics, currentStatistics, originalPartition, updatedStatistics)); + } + alterPartitionsWithoutStatistics(databaseName, tableName, modifiedPartitionBuilder.build()); + + ImmutableMap partitionInfoMap = partitionInfoMapBuilder.build(); + partitionInfoMap.forEach((partName, partInfo) -> + updatePartitionColumnStatistics(identity, partInfo.modifiedPartition, databaseName, tableName, partName, + partInfo.basicStatistics, partInfo.currentStatistics, partInfo.updatedStatistics)); + } + + private void updatePartitionColumnStatistics(HiveIdentity identity, Partition modifiedPartition, String databaseName, + String tableName, String partitionName, HiveBasicStatistics basicStatistics, + PartitionStatistics currentStatistics, PartitionStatistics updatedStatistics) + { + Map columns = modifiedPartition.getSd().getCols().stream() + .collect(toImmutableMap(FieldSchema::getName, schema -> HiveType.valueOf(schema.getType()))); + setPartitionColumnStatistics(identity, databaseName, tableName, partitionName, columns, updatedStatistics.getColumnStatistics(), basicStatistics.getRowCount()); + + Set removedStatistics = difference(currentStatistics.getColumnStatistics().keySet(), updatedStatistics.getColumnStatistics().keySet()); + removedStatistics.forEach(column -> deletePartitionColumnStatistics(identity, databaseName, tableName, partitionName, column)); + } + + private class PartitionInfo + { + private final HiveBasicStatistics basicStatistics; + private final PartitionStatistics currentStatistics; + private final Partition modifiedPartition; + private final PartitionStatistics updatedStatistics; + + PartitionInfo(HiveBasicStatistics basicStatistics, PartitionStatistics currentStatistics, + Partition modifiedPartition, PartitionStatistics updatedStatistics) + { + this.basicStatistics = basicStatistics; + this.currentStatistics = currentStatistics; + this.modifiedPartition = modifiedPartition; + 
this.updatedStatistics = updatedStatistics; + } + } + + private void setPartitionColumnStatistics( + HiveIdentity identity, + String databaseName, + String tableName, + String partitionName, + Map columns, + Map columnStatistics, + OptionalLong rowCount) + { + List metastoreColumnStatistics = columnStatistics.entrySet().stream() + .filter(entry -> columns.containsKey(entry.getKey())) + .map(entry -> ThriftMetastoreUtil.createMetastoreColumnStatistics(entry.getKey(), columns.get(entry.getKey()), entry.getValue(), rowCount)) + .collect(toImmutableList()); + if (!metastoreColumnStatistics.isEmpty()) { + setPartitionColumnStatistics(identity, databaseName, tableName, partitionName, metastoreColumnStatistics); + } + } + + private void setPartitionColumnStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, List statistics) + { + try { + retry() + .stopOn(NoSuchObjectException.class, InvalidObjectException.class, MetaException.class, InvalidInputException.class) + .stopOnIllegalExceptions() + .run("setPartitionColumnStatistics", stats.getCreateDatabase().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.setPartitionColumnStatistics(databaseName, tableName, partitionName, statistics); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private void deletePartitionColumnStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, String columnName) + { + try { + retry() + .stopOn(NoSuchObjectException.class, InvalidObjectException.class, MetaException.class, InvalidInputException.class) + .stopOnIllegalExceptions() + .run("deletePartitionColumnStatistics", stats.getCreateDatabase().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.deletePartitionColumnStatistics(databaseName, tableName, partitionName, columnName); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void createRole(String role, String grantor) + { + try { + retry() + .stopOn(MetaException.class) + .stopOnIllegalExceptions() + .run("createRole", stats.getCreateRole().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + client.createRole(isRoleNameCaseSensitive ? role : role.toLowerCase(Locale.ENGLISH), + grantor); + return null; + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void dropRole(String role) + { + try { + retry() + .stopOn(MetaException.class) + .stopOnIllegalExceptions() + .run("dropRole", stats.getDropRole().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + client.dropRole(isRoleNameCaseSensitive ? 
role : role.toLowerCase(Locale.ENGLISH)); + return null; + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Set listRoles() + { + try { + return retry() + .stopOn(MetaException.class) + .stopOnIllegalExceptions() + .run("listRoles", stats.getListRoles().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + List roles = new ArrayList<>(); + for (String role : client.getRoleNames()) { + roles.add(isRoleNameCaseSensitive ? role : role.toLowerCase(Locale.ENGLISH)); + } + return ImmutableSet.copyOf(roles); + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + for (HivePrincipal grantee : grantees) { + for (String role : roles) { + grantRole( + role, + grantee.getName(), ThriftMetastoreUtil.fromPrestoPrincipalType(grantee.getType()), + grantor.getName(), ThriftMetastoreUtil.fromPrestoPrincipalType(grantor.getType()), + withAdminOption); + } + } + } + + private void grantRole(String role, String granteeName, PrincipalType granteeType, String grantorName, PrincipalType grantorType, boolean grantOption) + { + try { + retry() + .stopOn(MetaException.class) + .stopOnIllegalExceptions() + .run("grantRole", stats.getGrantRole().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + client.grantRole(isRoleNameCaseSensitive ? role : role.toLowerCase(Locale.ENGLISH), + (granteeType == PrincipalType.ROLE && isRoleNameCaseSensitive) ? granteeName : granteeName.toLowerCase(Locale.ENGLISH), + granteeType, + (grantorType == PrincipalType.ROLE && isRoleNameCaseSensitive) ? grantorName : grantorName.toLowerCase(Locale.ENGLISH), + grantorType, + grantOption); + return null; + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + for (HivePrincipal grantee : grantees) { + for (String role : roles) { + revokeRole( + role, + grantee.getName(), ThriftMetastoreUtil.fromPrestoPrincipalType(grantee.getType()), + adminOptionFor); + } + } + } + + private void revokeRole(String role, String granteeName, PrincipalType granteeType, boolean grantOption) + { + try { + retry() + .stopOn(MetaException.class) + .stopOnIllegalExceptions() + .run("revokeRole", stats.getRevokeRole().wrap(() -> { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + client.revokeRole(isRoleNameCaseSensitive ? role : role.toLowerCase(Locale.ENGLISH), + (granteeType == PrincipalType.ROLE && isRoleNameCaseSensitive) ? 
granteeName : granteeName.toLowerCase(Locale.ENGLISH), + granteeType, + grantOption); + return null; + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Set listRoleGrants(HivePrincipal sourcePrincipal) + { + try { + return retry() + .stopOn(MetaException.class) + .stopOnIllegalExceptions() + .run("listRoleGrants", stats.getListRoleGrants().wrap(() -> { + HivePrincipal principal = ThriftMetastoreUtil.applyRoleNameCaseSensitive(sourcePrincipal, isRoleNameCaseSensitive); + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + return ThriftMetastoreUtil.fromRolePrincipalGrants(client.listRoleGrants(principal.getName(), + ThriftMetastoreUtil.fromPrestoPrincipalType(principal.getType())), + isRoleNameCaseSensitive); + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Optional> getAllViews(String databaseName) + { + try { + return retry() + .stopOn(UnknownDBException.class) + .stopOnIllegalExceptions() + .run("getAllViews", stats.getGetAllViews().wrap(() -> { + return Optional.of(getPrestoViews(databaseName)); + })); + } + catch (UnknownDBException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private List getPrestoViews(String databaseName) + throws TException + { + /* + * Thrift call `get_table_names_by_filter` may be translated by Metastore to a SQL query against Metastore database. + * Hive 2.3 on some databases uses CLOB for table parameter value column and some databases disallow `=` predicate over + * CLOB values. At the same time, they allow `LIKE` predicates over them. 
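+ *
+ * A minimal illustration of the two filter forms built below, assuming the usual constant values
+ * (HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__", PRESTO_VIEW_FLAG = "presto_view"):
+ *
+ *   equals form: hive_filter_field_params__presto_view = "true"     (preferred, may be rejected for CLOB columns)
+ *   LIKE form:   hive_filter_field_params__presto_view LIKE "true"  (fallback accepted by those databases)
+ *
+ * The equals form is tried first; on failure the LIKE form is attempted, and whichever succeeds is
+ * remembered via the metastoreKnownToSupportTableParam*Predicate flags so later calls skip the probe.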
+ */ + String filterWithEquals = HIVE_FILTER_FIELD_PARAMS + PRESTO_VIEW_FLAG + " = \"true\""; + String filterWithLike = HIVE_FILTER_FIELD_PARAMS + PRESTO_VIEW_FLAG + " LIKE \"true\""; + + if (metastoreKnownToSupportTableParamEqualsPredicate) { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + return client.getTableNamesByFilter(databaseName, filterWithEquals); + } + } + if (metastoreKnownToSupportTableParamLikePredicate) { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + return client.getTableNamesByFilter(databaseName, filterWithLike); + } + } + + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + List views = client.getTableNamesByFilter(databaseName, filterWithEquals); + metastoreKnownToSupportTableParamEqualsPredicate = true; + return views; + } + catch (TException | RuntimeException firstException) { + try (ThriftMetastoreClient client = clientProvider.createMetastoreClient()) { + List views = client.getTableNamesByFilter(databaseName, filterWithLike); + metastoreKnownToSupportTableParamLikePredicate = true; + return views; + } + catch (TException | RuntimeException secondException) { + if (firstException != secondException) { + firstException.addSuppressed(secondException); + } + } + throw firstException; + } + } + + @Override + public void createDatabase(HiveIdentity identity, Database database) + { + try { + retry() + .stopOn(AlreadyExistsException.class, InvalidObjectException.class, MetaException.class) + .stopOnIllegalExceptions() + .run("createDatabase", stats.getCreateDatabase().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.createDatabase(database); + } + return null; + })); + } + catch (AlreadyExistsException e) { + throw new SchemaAlreadyExistsException(database.getName()); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void dropDatabase(HiveIdentity identity, String databaseName) + { + try { + retry() + .stopOn(NoSuchObjectException.class, InvalidOperationException.class) + .stopOnIllegalExceptions() + .run("dropDatabase", stats.getDropDatabase().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.dropDatabase(databaseName, true, false); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new SchemaNotFoundException(databaseName); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void alterDatabase(HiveIdentity identity, String databaseName, Database database) + { + try { + retry() + .stopOn(NoSuchObjectException.class, MetaException.class) + .stopOnIllegalExceptions() + .run("alterDatabase", stats.getAlterDatabase().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.alterDatabase(databaseName, database); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new SchemaNotFoundException(databaseName); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void createTable(HiveIdentity identity, Table table) + { + try { + retry() + .stopOn(AlreadyExistsException.class, InvalidObjectException.class, MetaException.class, 
NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("createTable", stats.getCreateTable().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.createTable(table); + } + return null; + })); + } + catch (AlreadyExistsException e) { + throw new TableAlreadyExistsException(new SchemaTableName(table.getDbName(), table.getTableName())); + } + catch (NoSuchObjectException e) { + throw new SchemaNotFoundException(table.getDbName()); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + try { + retry() + .stopOn(NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("dropTable", stats.getDropTable().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.dropTable(databaseName, tableName, deleteData); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void alterTable(HiveIdentity identity, String databaseName, String tableName, Table table) + { + try { + retry() + .stopOn(InvalidOperationException.class, MetaException.class) + .stopOnIllegalExceptions() + .run("alterTable", stats.getAlterTable().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + Optional
source = getTable(identity, databaseName, tableName); + if (!source.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + client.alterTable(databaseName, tableName, table); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + try { + return retry() + .stopOn(NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("getPartitionNames", stats.getGetPartitionNames().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + return Optional.of(client.getPartitionNames(databaseName, tableName)); + } + })); + } + catch (NoSuchObjectException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + try { + return retry() + .stopOn(NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("getPartitionNamesByParts", stats.getGetPartitionNamesPs().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + return Optional.of(client.getPartitionNamesFiltered(databaseName, tableName, parts)); + } + })); + } + catch (NoSuchObjectException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitionsWithStatistics) + { + List partitions = partitionsWithStatistics.stream() + .map(ThriftMetastoreUtil::toMetastoreApiPartition) + .collect(toImmutableList()); + addPartitionsWithoutStatistics(identity, databaseName, tableName, partitions); + for (PartitionWithStatistics partitionWithStatistics : partitionsWithStatistics) { + storePartitionColumnStatistics(identity, databaseName, tableName, partitionWithStatistics.getPartitionName(), partitionWithStatistics); + } + } + + private void addPartitionsWithoutStatistics(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + if (partitions.isEmpty()) { + return; + } + try { + retry() + .stopOn(AlreadyExistsException.class, InvalidObjectException.class, MetaException.class, NoSuchObjectException.class, PrestoException.class) + .stopOnIllegalExceptions() + .run("addPartitions", stats.getAddPartitions().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + int partitionsAdded = client.addPartitions(partitions); + if (partitionsAdded != partitions.size()) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, + format("Hive metastore only added %s of %s partitions", partitionsAdded, partitions.size())); + } + return null; + } + })); + } + catch (AlreadyExistsException e) { + throw new PrestoException(ALREADY_EXISTS, format("One or more partitions already exist for table '%s.%s'", databaseName, tableName), e); + } + catch (NoSuchObjectException e) { + throw new TableNotFoundException(new 
SchemaTableName(databaseName, tableName)); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + try { + retry() + .stopOn(NoSuchObjectException.class, MetaException.class) + .stopOnIllegalExceptions() + .run("dropPartition", stats.getDropPartition().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.dropPartition(databaseName, tableName, parts, deleteData); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), parts); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partitionWithStatistics) + { + alterPartitionWithoutStatistics(identity, databaseName, tableName, ThriftMetastoreUtil.toMetastoreApiPartition(partitionWithStatistics)); + storePartitionColumnStatistics(identity, databaseName, tableName, partitionWithStatistics.getPartitionName(), partitionWithStatistics); + dropExtraColumnStatisticsAfterAlterPartition(identity, databaseName, tableName, partitionWithStatistics); + } + + private void alterPartitionWithoutStatistics(HiveIdentity identity, String databaseName, String tableName, Partition partition) + { + try { + retry() + .stopOn(NoSuchObjectException.class, MetaException.class) + .stopOnIllegalExceptions() + .run("alterPartition", stats.getAlterPartition().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + client.alterPartition(databaseName, tableName, partition); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), partition.getValues()); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private void alterPartitionsWithoutStatistics(String databaseName, String tableName, List partitions) + { + try { + retry() + .stopOn(NoSuchObjectException.class, MetaException.class) + .stopOnIllegalExceptions() + .run("alterPartition", stats.getAlterPartition().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient()) { + client.alterPartitions(databaseName, tableName, partitions); + } + return null; + })); + } + catch (NoSuchObjectException e) { + throw new PartitionNotFoundException(new SchemaTableName(databaseName, tableName), null); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private void storePartitionColumnStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, PartitionWithStatistics partitionWithStatistics) + { + PartitionStatistics statistics = partitionWithStatistics.getStatistics(); + Map columnStatistics = statistics.getColumnStatistics(); + if (columnStatistics.isEmpty()) { + return; + } + Map columnTypes = partitionWithStatistics.getPartition().getColumns().stream() + .collect(toImmutableMap(Column::getName, Column::getType)); + 
setPartitionColumnStatistics(identity, databaseName, tableName, partitionName, columnTypes, columnStatistics, statistics.getBasicStatistics().getRowCount()); + } + + /* + * After altering a partition metastore preserves all column statistics for that partition. + * + * The old statistics are supposed to be replaced by storing the new partition statistics. + * + * In case when the new statistics are not present for some columns, or if the table schema has changed + * if is needed to explicitly remove the statistics from the metastore for that columns. + */ + private void dropExtraColumnStatisticsAfterAlterPartition( + HiveIdentity identity, + String databaseName, + String tableName, + PartitionWithStatistics partitionWithStatistics) + { + List dataColumns = partitionWithStatistics.getPartition().getColumns().stream() + .map(Column::getName) + .collect(toImmutableList()); + + Set columnsWithMissingStatistics = new HashSet<>(dataColumns); + columnsWithMissingStatistics.removeAll(partitionWithStatistics.getStatistics().getColumnStatistics().keySet()); + + // In case new partition had the statistics computed for all the columns, the storePartitionColumnStatistics + // call in the alterPartition will just overwrite the old statistics. There is no need to explicitly remove anything. + if (columnsWithMissingStatistics.isEmpty()) { + return; + } + + // check if statistics for the columnsWithMissingStatistics are actually stored in the metastore + // when trying to remove any missing statistics the metastore throws NoSuchObjectException + String partitionName = partitionWithStatistics.getPartitionName(); + List statisticsToBeRemoved = getMetastorePartitionColumnStatistics( + identity, + databaseName, + tableName, + ImmutableSet.of(partitionName), + ImmutableList.copyOf(columnsWithMissingStatistics)) + .getOrDefault(partitionName, ImmutableList.of()); + + for (ColumnStatisticsObj statistics : statisticsToBeRemoved) { + deletePartitionColumnStatistics(identity, databaseName, tableName, partitionName, statistics.getColName()); + } + } + + @Override + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + requireNonNull(partitionValues, "partitionValues is null"); + try { + return retry() + .stopOn(NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("getPartition", stats.getGetPartition().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + return Optional.of(client.getPartition(databaseName, tableName, partitionValues)); + } + })); + } + catch (NoSuchObjectException e) { + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public List getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + requireNonNull(partitionNames, "partitionNames is null"); + checkArgument(!Iterables.isEmpty(partitionNames), "partitionNames is empty"); + + try { + return retry() + .stopOn(NoSuchObjectException.class) + .stopOnIllegalExceptions() + .run("getPartitionsByNames", stats.getGetPartitionsByNames().wrap(() -> { + try (ThriftMetastoreClient client = createMetastoreClient(identity)) { + return client.getPartitionsByNames(databaseName, tableName, partitionNames); + } + })); + } + catch (NoSuchObjectException e) { + // assume none of the partitions in the batch are available + return ImmutableList.of(); + } + catch 
(TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void grantTablePrivileges(String databaseName, String tableName, HivePrincipal sourceGrantee, Set privileges) + { + Set requestedPrivileges = privileges.stream() + .map(ThriftMetastoreUtil::toMetastoreApiPrivilegeGrantInfo) + .collect(Collectors.toSet()); + checkArgument(!containsAllPrivilege(requestedPrivileges), "\"ALL\" not supported in PrivilegeGrantInfo.privilege"); + HivePrincipal grantee = ThriftMetastoreUtil.applyRoleNameCaseSensitive(sourceGrantee, isRoleNameCaseSensitive); + + try { + retry() + .stopOnIllegalExceptions() + .run("grantTablePrivileges", stats.getGrantTablePrivileges().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + Set existingPrivileges = listTablePrivileges(databaseName, tableName, grantee); + + Set privilegesToGrant = new HashSet<>(requestedPrivileges); + Iterator iterator = privilegesToGrant.iterator(); + while (iterator.hasNext()) { + HivePrivilegeInfo requestedPrivilege = getOnlyElement(ThriftMetastoreUtil.parsePrivilege(iterator.next(), Optional.empty())); + + for (HivePrivilegeInfo existingPrivilege : existingPrivileges) { + if ((requestedPrivilege.isContainedIn(existingPrivilege))) { + iterator.remove(); + } + else if (existingPrivilege.isContainedIn(requestedPrivilege)) { + throw new PrestoException(NOT_SUPPORTED, format( + "Granting %s WITH GRANT OPTION is not supported while %s possesses %s", + requestedPrivilege.getHivePrivilege().name(), + grantee, + requestedPrivilege.getHivePrivilege().name())); + } + } + } + + if (privilegesToGrant.isEmpty()) { + return null; + } + + metastoreClient.grantPrivileges(buildPrivilegeBag(databaseName, tableName, grantee, privilegesToGrant)); + } + return null; + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal sourceGrantee, Set privileges) + { + Set requestedPrivileges = privileges.stream() + .map(ThriftMetastoreUtil::toMetastoreApiPrivilegeGrantInfo) + .collect(Collectors.toSet()); + checkArgument(!containsAllPrivilege(requestedPrivileges), "\"ALL\" not supported in PrivilegeGrantInfo.privilege"); + HivePrincipal grantee = ThriftMetastoreUtil.applyRoleNameCaseSensitive(sourceGrantee, isRoleNameCaseSensitive); + + try { + retry() + .stopOnIllegalExceptions() + .run("revokeTablePrivileges", stats.getRevokeTablePrivileges().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + Set existingHivePrivileges = listTablePrivileges(databaseName, tableName, grantee).stream() + .map(HivePrivilegeInfo::getHivePrivilege) + .collect(toSet()); + + Set privilegesToRevoke = requestedPrivileges.stream() + .filter(privilegeGrantInfo -> existingHivePrivileges.contains(getOnlyElement(ThriftMetastoreUtil.parsePrivilege(privilegeGrantInfo, Optional.empty())).getHivePrivilege())) + .collect(toSet()); + + if (privilegesToRevoke.isEmpty()) { + return null; + } + + metastoreClient.revokePrivileges(buildPrivilegeBag(databaseName, tableName, grantee, privilegesToRevoke)); + } + return null; + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + 
public Set listTablePrivileges(String databaseName, String tableName, HivePrincipal sourcePrincipal) + { + try { + return retry() + .stopOnIllegalExceptions() + .run("listTablePrivileges", stats.getListTablePrivileges().wrap(() -> { + Table table = getTableFromMetastore(databaseName, tableName); + try (ThriftMetastoreClient client = createMetastoreClient()) { + ImmutableSet.Builder privileges = ImmutableSet.builder(); + List hiveObjectPrivilegeList; + HivePrincipal principal = ThriftMetastoreUtil.applyRoleNameCaseSensitive(sourcePrincipal, isRoleNameCaseSensitive); + // principal can be null when we want to list all privileges for admins + if (principal == null) { + hiveObjectPrivilegeList = client.listPrivileges( + null, + null, + new HiveObjectRef(TABLE, databaseName, tableName, null, null)); + } + else { + if (principal.getType() == USER && table.getOwner().equals(principal.getName())) { + privileges.add(new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.OWNERSHIP, true, principal, principal)); + } + hiveObjectPrivilegeList = client.listPrivileges( + principal.getName(), + ThriftMetastoreUtil.fromPrestoPrincipalType(principal.getType()), + new HiveObjectRef(TABLE, databaseName, tableName, null, null)); + } + for (HiveObjectPrivilege hiveObjectPrivilege : hiveObjectPrivilegeList) { + HivePrincipal grantee = new HivePrincipal(ThriftMetastoreUtil.fromMetastoreApiPrincipalType(hiveObjectPrivilege.getPrincipalType()), hiveObjectPrivilege.getPrincipalName()); + + privileges.addAll(ThriftMetastoreUtil.parsePrivilege(hiveObjectPrivilege.getGrantInfo(), + Optional.of(ThriftMetastoreUtil.applyRoleNameCaseSensitive(grantee, isRoleNameCaseSensitive)))); + } + return privileges.build(); + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public boolean isImpersonationEnabled() + { + return impersonationEnabled; + } + + @Override + public long openTransaction(HiveIdentity identity) + { + checkArgument(!identity.getUsername().map(String::isEmpty).orElse(true), "User should be provided to open transaction"); + try { + return retry() + .stopOnIllegalExceptions() + .run("openTransaction", stats.getOpenTransaction().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + return metastoreClient.openTransaction(identity.getUsername().get()); + } + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void commitTransaction(HiveIdentity identity, long transactionId) + { + try { + retry() + .stopOnIllegalExceptions() + .stopOn(TxnAbortedException.class) + .run("commitTransaction", stats.getCommitTransaction().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + metastoreClient.commitTransaction(transactionId); + } + return null; + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void abortTransaction(HiveIdentity identity, long transactionId) + { + try { + retry() + .stopOnIllegalExceptions() + .run("abortTransaction", stats.getCommitTransaction().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + metastoreClient.abortTransaction(transactionId); + } + return null; + })); 
+ } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void sendTransactionHeartbeat(HiveIdentity identity, long transactionId) + { + try { + retry() + .stopOnIllegalExceptions() + .run("sendTransactionHeartbeat", (() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + metastoreClient.sendTransactionHeartbeat(transactionId); + } + return null; + })); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void acquireSharedReadLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions) + { + DataOperationType operationType = DataOperationType.SELECT; + acquireLock(identity, queryId, transactionId, fullTables, partitions, operationType); + } + + @Override + public void acquireLock(HiveIdentity identity, + String queryId, + long transactionId, + List fullTables, + List partitions, + DataOperationType operationType) + { + checkArgument(!identity.getUsername().map(String::isEmpty).orElse(true), "User should be provided to acquire locks"); + requireNonNull(queryId, "queryId is null"); + + if (fullTables.isEmpty() && partitions.isEmpty()) { + return; + } + + LockRequestBuilder request = new LockRequestBuilder(queryId) + .setTransactionId(transactionId) + .setUser(identity.getUsername().get()); + + for (SchemaTableName table : fullTables) { + request.addLockComponent(createLockComponent(table, Optional.empty(), operationType)); + } + + for (HivePartition partition : partitions) { + request.addLockComponent(createLockComponent(partition.getTableName(), Optional.of(partition.getPartitionId()), operationType)); + } + + LockRequest lockRequest = request.build(); + try { + LockResponse response = retry() + .stopOn(NoSuchTxnException.class, TxnAbortedException.class, MetaException.class) + .run("acquireLock", stats.getAcquireLock().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + return metastoreClient.acquireLock(lockRequest); + } + })); + + long waitStart = nanoTime(); + while (response.getState() == LockState.WAITING) { + long lockId = response.getLockid(); + + if (Duration.nanosSince(waitStart).compareTo(maxWaitForLock) > 0) { + // timed out + throw new PrestoException(HiveErrorCode.HIVE_TABLE_LOCK_NOT_ACQUIRED, format("Timed out waiting for lock %d in hive transaction %s for query %s", lockId, transactionId, queryId)); + } + + log.debug("Waiting for lock %d in hive transaction %s for query %s", lockId, transactionId, queryId); + + response = retry() + .stopOn(NoSuchTxnException.class, NoSuchLockException.class, TxnAbortedException.class, MetaException.class) + .run("checkLock", stats.getCheckLock().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + return metastoreClient.checkLock(lockId); + } + })); + } + + if (response.getState() != LockState.ACQUIRED) { + throw new PrestoException(HiveErrorCode.HIVE_TABLE_LOCK_NOT_ACQUIRED, "Could not acquire lock. 
Lock in state " + response.getState()); + } + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private static LockComponent createLockComponent(SchemaTableName table, + Optional partitionName, + DataOperationType operationType) + { + requireNonNull(table, "table is null"); + requireNonNull(partitionName, "partitionName is null"); + + LockComponentBuilder builder = new LockComponentBuilder(); + builder.setOperationType(operationType); + switch (operationType) { + case SELECT: + case INSERT: + builder.setShared(); + break; + case DELETE: + case UPDATE: + builder.setSemiShared(); + break; + default: + throw new PrestoException(HiveErrorCode.HIVE_UNKNOWN_ERROR, "Unexpected operationType to aquireLock " + operationType); + } + + builder.setDbName(table.getSchemaName()); + builder.setTableName(table.getTableName()); + requireNonNull(partitionName, "partitionName is null").ifPresent(builder::setPartitionName); + + // acquire locks is called only for TransactionalTable + builder.setIsTransactional(true); + return builder.build(); + } + + @Override + public String getValidWriteIds(HiveIdentity identity, List tables, long currentTransactionId, boolean isVacuum) + { + try { + return retry() + .stopOnIllegalExceptions() + .run("getValidWriteIds", stats.getValidWriteIds().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + return metastoreClient.getValidWriteIds( + tables.stream() + .map(table -> format("%s.%s", table.getSchemaName(), table.getTableName())) + .collect(toImmutableList()), + currentTransactionId, + isVacuum); + } + })); + } + catch (TException e) { + // When calling Hive metastore < 3, the call fails with + // Required field 'open_txns' is unset! Struct:GetOpenTxnsResponse(txn_high_water_mark:4, open_txns:null, min_open_txn:4, abortedBits:null) + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Failed to open transaction. 
Transactional tables support require Hive metastore version at least 3.0", e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public long getTableWriteId(String dbName, String tableName, long transactionId) + { + try { + return retry() + .stopOnIllegalExceptions() + .run("getTableWriteId", stats.getTableWriteId().wrap(() -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + return metastoreClient.getTableWriteId(dbName, tableName, transactionId); + } + })); + } + catch (TException e) { + if (e.getMessage().contains("Invalid method name")) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Transactional tables support require Hive metastore version at least 3.0"); + } + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public ShowLocksResponse showLocks(ShowLocksRequest rqst) + { + try { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + return metastoreClient.showLocks(rqst); + } + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public Optional getConfigValue(String name) + { + try { + return retry() + .stopOnIllegalExceptions() + .run("getConfigValueFromServer", () -> { + try (ThriftMetastoreClient metastoreClient = clientProvider.createMetastoreClient()) { + return Optional.ofNullable(metastoreClient.get_config_value(name, null)); + } + }); + } + catch (ConfigValSecurityException e) { + log.debug(e, "Could not fetch value for config '%s' from Hive", name); + return Optional.empty(); + } + catch (TException e) { + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, e); + } + catch (Exception e) { + throw propagate(e); + } + } + + private PrivilegeBag buildPrivilegeBag( + String databaseName, + String tableName, + HivePrincipal grantee, + Set privilegeGrantInfos) + { + ImmutableList.Builder privilegeBagBuilder = ImmutableList.builder(); + for (PrivilegeGrantInfo privilegeGrantInfo : privilegeGrantInfos) { + privilegeBagBuilder.add( + new HiveObjectPrivilege( + new HiveObjectRef(TABLE, databaseName, tableName, null, null), + grantee.getName(), + ThriftMetastoreUtil.fromPrestoPrincipalType(grantee.getType()), + privilegeGrantInfo, + "SQL")); + } + return new PrivilegeBag(privilegeBagBuilder.build()); + } + + private boolean containsAllPrivilege(Set requestedPrivileges) + { + return requestedPrivileges.stream() + .anyMatch(privilege -> privilege.getPrivilege().equalsIgnoreCase("all")); + } + + @SafeVarargs + private final T alternativeCall( + ClientSupplier clientSupplier, + AtomicInteger chosenAlternative, + Call... 
alternatives) + throws TException + { + checkArgument(alternatives.length > 0, "No alternatives"); + int chosen = chosenAlternative.get(); + checkArgument(chosen == Integer.MAX_VALUE || (0 <= chosen && chosen < alternatives.length), "Bad chosen alternative value: %s", chosen); + + if (chosen != Integer.MAX_VALUE) { + try (ThriftMetastoreClient client = clientSupplier.createMetastoreClient()) { + return alternatives[chosen].callOn(client); + } + } + + Exception firstException = null; + for (int i = 0; i < alternatives.length; i++) { + int position = i; + try (ThriftMetastoreClient client = clientSupplier.createMetastoreClient()) { + T result = alternatives[i].callOn(client); + chosenAlternative.updateAndGet(currentChosen -> Math.min(currentChosen, position)); + return result; + } + catch (TException | RuntimeException exception) { + if (firstException == null) { + firstException = exception; + } + else if (firstException != exception) { + firstException.addSuppressed(exception); + } + } + } + + verifyNotNull(firstException); + propagateIfPossible(firstException, TException.class); + throw propagate(firstException); + } + + private ThriftMetastoreClient createMetastoreClient() throws TException + { + return clientProvider.createMetastoreClient(); + } + + private ThriftMetastoreClient createMetastoreClient(HiveIdentity identity) + throws TException + { + ThriftMetastoreClient client = createMetastoreClient(); + if (!impersonationEnabled) { + return client; + } + + setMetastoreUserOrClose(client, identity.getUsername() + .orElseThrow(() -> new IllegalStateException("End-user name should exist when metastore impersonation is enabled"))); + return client; + } + + private static void setMetastoreUserOrClose(ThriftMetastoreClient client, String username) + throws TException + { + try { + client.setUGI(username); + } + catch (Throwable t) { + // close client and suppress any error from close + try (Closeable ignored = client) { + throw t; + } + catch (IOException e) { + // impossible; will be suppressed + } + } + } + + protected RetryDriver retry() + { + return RetryDriver.retry() + .exponentialBackoff(minBackoffDelay, maxBackoffDelay, maxRetryTime, backoffScaleFactor) + .maxAttempts(maxRetries + 1) + .stopOn(PrestoException.class); + } + + protected static RuntimeException propagate(Throwable throwable) + { + if (throwable instanceof InterruptedException) { + Thread.currentThread().interrupt(); + } + throwIfUnchecked(throwable); + throw new RuntimeException(throwable); + } + + @FunctionalInterface + private interface ClientSupplier + { + ThriftMetastoreClient createMetastoreClient() + throws TException; + } + + @FunctionalInterface + private interface Call + { + T callOn(ThriftMetastoreClient client) + throws TException; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreClient.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreClient.java new file mode 100644 index 00000000..52d524ef --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreClient.java @@ -0,0 +1,557 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.collect.ImmutableList; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.util.LoggingInvocationHandler; +import org.apache.hadoop.hive.common.ValidTxnList; +import org.apache.hadoop.hive.common.ValidTxnWriteIdList; +import org.apache.hadoop.hive.metastore.api.AbortTxnRequest; +import org.apache.hadoop.hive.metastore.api.AllocateTableWriteIdsRequest; +import org.apache.hadoop.hive.metastore.api.AllocateTableWriteIdsResponse; +import org.apache.hadoop.hive.metastore.api.CheckLockRequest; +import org.apache.hadoop.hive.metastore.api.ClientCapabilities; +import org.apache.hadoop.hive.metastore.api.ClientCapability; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.CommitTxnRequest; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.GetRoleGrantsForPrincipalRequest; +import org.apache.hadoop.hive.metastore.api.GetRoleGrantsForPrincipalResponse; +import org.apache.hadoop.hive.metastore.api.GetTableRequest; +import org.apache.hadoop.hive.metastore.api.GetValidWriteIdsRequest; +import org.apache.hadoop.hive.metastore.api.GrantRevokeRoleRequest; +import org.apache.hadoop.hive.metastore.api.GrantRevokeRoleResponse; +import org.apache.hadoop.hive.metastore.api.GrantRevokeType; +import org.apache.hadoop.hive.metastore.api.HeartbeatTxnRangeRequest; +import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; +import org.apache.hadoop.hive.metastore.api.HiveObjectRef; +import org.apache.hadoop.hive.metastore.api.LockRequest; +import org.apache.hadoop.hive.metastore.api.LockResponse; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.OpenTxnRequest; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.PartitionsStatsRequest; +import org.apache.hadoop.hive.metastore.api.PrincipalType; +import org.apache.hadoop.hive.metastore.api.PrivilegeBag; +import org.apache.hadoop.hive.metastore.api.Role; +import org.apache.hadoop.hive.metastore.api.RolePrincipalGrant; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.api.TableStatsRequest; +import org.apache.hadoop.hive.metastore.api.TableValidWriteIds; +import org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore; +import org.apache.hadoop.hive.metastore.txn.TxnUtils; +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.transport.TTransport; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static 
com.google.common.reflect.Reflection.newProxy; +import static java.util.Objects.requireNonNull; + +public class ThriftHiveMetastoreClient + implements ThriftMetastoreClient +{ + private static final Logger log = Logger.get(ThriftHiveMetastoreClient.class); + + private static final LoggingInvocationHandler.ParameterNamesProvider PARAMETER_NAMES_PROVIDER = new LoggingInvocationHandler.AirliftParameterNamesProvider(ThriftHiveMetastore.Iface.class, ThriftHiveMetastore.Client.class); + + private final TTransport transport; + private final ThriftHiveMetastore.Iface client; + private final String hostname; + + public ThriftHiveMetastoreClient(TTransport transport, String hostname) + { + this.transport = requireNonNull(transport, "transport is null"); + ThriftHiveMetastore.Client client = new ThriftHiveMetastore.Client(new TBinaryProtocol(transport)); + if (log.isDebugEnabled()) { + this.client = newProxy(ThriftHiveMetastore.Iface.class, new LoggingInvocationHandler(client, PARAMETER_NAMES_PROVIDER, log::debug)); + } + else { + this.client = client; + } + this.hostname = requireNonNull(hostname, "hostname is null"); + } + + public ThriftHiveMetastore.Iface getClient() + { + return client; + } + + @Override + public void close() + { + transport.close(); + } + + @Override + public List getAllDatabases() + throws TException + { + return client.get_all_databases(); + } + + @Override + public Database getDatabase(String dbName) + throws TException + { + return client.get_database(dbName); + } + + @Override + public List getAllTables(String databaseName) + throws TException + { + return client.get_all_tables(databaseName); + } + + @Override + public List getTableNamesByFilter(String databaseName, String filter) + throws TException + { + return client.get_table_names_by_filter(databaseName, filter, (short) -1); + } + + @Override + public void createDatabase(Database database) + throws TException + { + client.create_database(database); + } + + @Override + public void dropDatabase(String databaseName, boolean deleteData, boolean cascade) + throws TException + { + client.drop_database(databaseName, deleteData, cascade); + } + + @Override + public void alterDatabase(String databaseName, Database database) + throws TException + { + client.alter_database(databaseName, database); + } + + @Override + public void createTable(Table table) + throws TException + { + client.create_table(table); + } + + @Override + public void dropTable(String databaseName, String name, boolean deleteData) + throws TException + { + client.drop_table(databaseName, name, deleteData); + } + + @Override + public void alterTable(String databaseName, String tableName, Table newTable) + throws TException + { + client.alter_table(databaseName, tableName, newTable); + } + + @Override + public Table getTable(String databaseName, String tableName) + throws TException + { + return client.get_table(databaseName, tableName); + } + + @Override + public Table getTableWithCapabilities(String databaseName, String tableName) + throws TException + { + GetTableRequest request = new GetTableRequest(); + request.setDbName(databaseName); + request.setTblName(tableName); + request.setCapabilities(new ClientCapabilities(ImmutableList.of(ClientCapability.INSERT_ONLY_TABLES))); + return client.get_table_req(request).getTable(); + } + + @Override + public List getFields(String databaseName, String tableName) + throws TException + { + return client.get_fields(databaseName, tableName); + } + + @Override + public List getTableColumnStatistics(String databaseName, 
String tableName, List columnNames) + throws TException + { + TableStatsRequest tableStatsRequest = new TableStatsRequest(databaseName, tableName, columnNames); + return client.get_table_statistics_req(tableStatsRequest).getTableStats(); + } + + @Override + public void setTableColumnStatistics(String databaseName, String tableName, List statistics) + throws TException + { + ColumnStatisticsDesc statisticsDescription = new ColumnStatisticsDesc(true, databaseName, tableName); + ColumnStatistics request = new ColumnStatistics(statisticsDescription, statistics); + client.update_table_column_statistics(request); + } + + @Override + public void deleteTableColumnStatistics(String databaseName, String tableName, String columnName) + throws TException + { + client.delete_table_column_statistics(databaseName, tableName, columnName); + } + + @Override + public Map> getPartitionColumnStatistics(String databaseName, String tableName, List partitionNames, List columnNames) + throws TException + { + PartitionsStatsRequest partitionsStatsRequest = new PartitionsStatsRequest(databaseName, tableName, columnNames, partitionNames); + return client.get_partitions_statistics_req(partitionsStatsRequest).getPartStats(); + } + + @Override + public void setPartitionColumnStatistics(String databaseName, String tableName, String partitionName, List statistics) + throws TException + { + ColumnStatisticsDesc statisticsDescription = new ColumnStatisticsDesc(false, databaseName, tableName); + statisticsDescription.setPartName(partitionName); + ColumnStatistics request = new ColumnStatistics(statisticsDescription, statistics); + client.update_partition_column_statistics(request); + } + + @Override + public void deletePartitionColumnStatistics(String databaseName, String tableName, String partitionName, String columnName) + throws TException + { + client.delete_partition_column_statistics(databaseName, tableName, partitionName, columnName); + } + + @Override + public List getPartitionNames(String databaseName, String tableName) + throws TException + { + return client.get_partition_names(databaseName, tableName, (short) -1); + } + + @Override + public List getPartitionNamesFiltered(String databaseName, String tableName, List partitionValues) + throws TException + { + return client.get_partition_names_ps(databaseName, tableName, partitionValues, (short) -1); + } + + @Override + public int addPartitions(List newPartitions) + throws TException + { + return client.add_partitions(newPartitions); + } + + @Override + public boolean dropPartition(String databaseName, String tableName, List partitionValues, boolean deleteData) + throws TException + { + return client.drop_partition(databaseName, tableName, partitionValues, deleteData); + } + + @Override + public void alterPartition(String databaseName, String tableName, Partition partition) + throws TException + { + client.alter_partition(databaseName, tableName, partition); + } + + @Override + public void alterPartitions(String databaseName, String tableName, List partitions) + throws TException + { + client.alter_partitions(databaseName, tableName, partitions); + } + + @Override + public Partition getPartition(String databaseName, String tableName, List partitionValues) + throws TException + { + return client.get_partition(databaseName, tableName, partitionValues); + } + + @Override + public List getPartitionsByNames(String databaseName, String tableName, List partitionNames) + throws TException + { + return client.get_partitions_by_names(databaseName, tableName, partitionNames); + 
} + + @Override + public List listRoles(String principalName, PrincipalType principalType) + throws TException + { + return client.list_roles(principalName, principalType); + } + + @Override + public List listPrivileges(String principalName, PrincipalType principalType, HiveObjectRef hiveObjectRef) + throws TException + { + return client.list_privileges(principalName, principalType, hiveObjectRef); + } + + @Override + public List getRoleNames() + throws TException + { + return client.get_role_names(); + } + + @Override + public void createRole(String roleName, String grantor) + throws TException + { + Role role = new Role(roleName, 0, grantor); + client.create_role(role); + } + + @Override + public void dropRole(String role) + throws TException + { + client.drop_role(role); + } + + @Override + public boolean grantPrivileges(PrivilegeBag privilegeBag) + throws TException + { + return client.grant_privileges(privilegeBag); + } + + @Override + public boolean revokePrivileges(PrivilegeBag privilegeBag) + throws TException + { + return client.revoke_privileges(privilegeBag); + } + + @Override + public void grantRole(String role, String granteeName, PrincipalType granteeType, String grantorName, PrincipalType grantorType, boolean grantOption) + throws TException + { + List grants = listRoleGrants(granteeName, granteeType); + for (RolePrincipalGrant grant : grants) { + if (grant.getRoleName().equals(role)) { + if (grant.isGrantOption() == grantOption) { + return; + } + if (!grant.isGrantOption() && grantOption) { + revokeRole(role, granteeName, granteeType, false); + break; + } + } + } + createGrant(role, granteeName, granteeType, grantorName, grantorType, grantOption); + } + + private void createGrant(String role, String granteeName, PrincipalType granteeType, String grantorName, PrincipalType grantorType, boolean grantOption) + throws TException + { + GrantRevokeRoleRequest request = new GrantRevokeRoleRequest(); + request.setRequestType(GrantRevokeType.GRANT); + request.setRoleName(role); + request.setPrincipalName(granteeName); + request.setPrincipalType(granteeType); + request.setGrantor(grantorName); + request.setGrantorType(grantorType); + request.setGrantOption(grantOption); + GrantRevokeRoleResponse response = client.grant_revoke_role(request); + if (!response.isSetSuccess()) { + throw new MetaException("GrantRevokeResponse missing success field"); + } + } + + @Override + public void revokeRole(String role, String granteeName, PrincipalType granteeType, boolean grantOption) + throws TException + { + List grants = listRoleGrants(granteeName, granteeType); + RolePrincipalGrant currentGrant = null; + for (RolePrincipalGrant grant : grants) { + if (grant.getRoleName().equals(role)) { + currentGrant = grant; + break; + } + } + + if (currentGrant == null) { + return; + } + + if (!currentGrant.isGrantOption() && grantOption) { + return; + } + + removeGrant(role, granteeName, granteeType, grantOption); + } + + private void removeGrant(String role, String granteeName, PrincipalType granteeType, boolean grantOption) + throws TException + { + GrantRevokeRoleRequest request = new GrantRevokeRoleRequest(); + request.setRequestType(GrantRevokeType.REVOKE); + request.setRoleName(role); + request.setPrincipalName(granteeName); + request.setPrincipalType(granteeType); + request.setGrantOption(grantOption); + GrantRevokeRoleResponse response = client.grant_revoke_role(request); + if (!response.isSetSuccess()) { + throw new MetaException("GrantRevokeResponse missing success field"); + } + } + + @Override + 
public List<RolePrincipalGrant> listRoleGrants(String principalName, PrincipalType principalType)
+            throws TException
+    {
+        GetRoleGrantsForPrincipalRequest request = new GetRoleGrantsForPrincipalRequest(principalName, principalType);
+        GetRoleGrantsForPrincipalResponse resp = client.get_role_grants_for_principal(request);
+        return ImmutableList.copyOf(resp.getPrincipalGrants());
+    }
+
+    @Override
+    public void setUGI(String userName)
+            throws TException
+    {
+        client.set_ugi(userName, new ArrayList<>());
+    }
+
+    @Override
+    public long openTransaction(String user)
+            throws TException
+    {
+        OpenTxnRequest request = new OpenTxnRequest(1, user, hostname);
+        return client.open_txns(request).getTxn_ids().get(0);
+    }
+
+    @Override
+    public void commitTransaction(long transactionId)
+            throws TException
+    {
+        client.commit_txn(new CommitTxnRequest(transactionId));
+    }
+
+    @Override
+    public void abortTransaction(long transactionId)
+            throws TException
+    {
+        client.abort_txn(new AbortTxnRequest(transactionId));
+    }
+
+    @Override
+    public void sendTransactionHeartbeat(long transactionId)
+            throws TException
+    {
+        HeartbeatTxnRangeRequest rqst = new HeartbeatTxnRangeRequest(transactionId, transactionId);
+        client.heartbeat_txn_range(rqst);
+    }
+
+    @Override
+    public LockResponse acquireLock(LockRequest lockRequest)
+            throws TException
+    {
+        return client.lock(lockRequest);
+    }
+
+    @Override
+    public LockResponse checkLock(long lockId)
+            throws TException
+    {
+        return client.check_lock(new CheckLockRequest(lockId));
+    }
+
+    @Override
+    public String getValidWriteIds(List<String> tableList, long currentTransactionId, boolean isVacuum)
+            throws TException
+    {
+        // Pass currentTxn as 0L to get the recent snapshot of valid transactions in Hive
+        // Do not pass currentTransactionId instead, as it will break Hive's listing of delta directories if major compaction
+        // deletes delta directories for valid transactions that existed at the time the transaction was opened
+        ValidTxnList validTransactions = TxnUtils.createValidReadTxnList(client.get_open_txns(), 0L);
+        GetValidWriteIdsRequest request = new GetValidWriteIdsRequest(tableList, validTransactions.toString());
+        List<TableValidWriteIds> tblValidWriteIds = client.get_valid_write_ids(request).getTblValidWriteIds();
+        if (isVacuum) {
+            return createValidTxnWriteIdListForVacuum(currentTransactionId, tblValidWriteIds).toString();
+        }
+        return TxnUtils.createValidTxnWriteIdList(
+                currentTransactionId,
+                tblValidWriteIds)
+                .toString();
+    }
+
+    /**
+     * Creates the validTxnWriteIdList which consists of all valid commits less than minOpenWriteId
+     */
+    private static ValidTxnWriteIdList createValidTxnWriteIdListForVacuum(Long currentTxnId, List<TableValidWriteIds> validIds)
+    {
+        ValidTxnWriteIdList validTxnWriteIdList = new ValidTxnWriteIdList(currentTxnId);
+        for (TableValidWriteIds tableWriteIds : validIds) {
+            validTxnWriteIdList.addTableValidWriteIdList(TxnUtils.createValidCompactWriteIdList(tableWriteIds));
+        }
+        return validTxnWriteIdList;
+    }
+
+    @Override
+    public ShowLocksResponse showLocks(ShowLocksRequest rqst)
+            throws TException
+    {
+        return client.show_locks(rqst);
+    }
+
+    @Override
+    public String get_config_value(String name, String defaultValue)
+            throws TException
+    {
+        return client.get_config_value(name, defaultValue);
+    }
+
+    @Override
+    public String getDelegationToken(String userName)
+            throws TException
+    {
+        return client.get_delegation_token(userName, userName);
+    }
+
+    @Override
+    public long getTableWriteId(String dbName, String tableName, long transactionId)
+            throws TException
+    {
+
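+        // Ask the metastore to allocate a write ID for this table under the given transaction,
+        // then return the write ID that was assigned to that transaction.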
AllocateTableWriteIdsRequest allocateTableWriteIdsRequest = new AllocateTableWriteIdsRequest(); + allocateTableWriteIdsRequest.setDbName(dbName); + allocateTableWriteIdsRequest.setTableName(tableName); + allocateTableWriteIdsRequest.addToTxnIds(transactionId); + + final AllocateTableWriteIdsResponse allocateTableWriteIdsResponse = client.allocate_table_write_ids(allocateTableWriteIdsRequest); + long txnToWriteId = allocateTableWriteIdsResponse.getTxnToWriteIds().stream().filter(e -> e.getTxnId() == transactionId).collect(toImmutableList()).get(0).getWriteId(); + return txnToWriteId; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreConfig.java new file mode 100644 index 00000000..a0974001 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftHiveMetastoreConfig.java @@ -0,0 +1,142 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.thrift; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.util.RetryDriver; + +import javax.validation.constraints.Min; +import javax.validation.constraints.NotNull; + +import java.util.concurrent.TimeUnit; + +public class ThriftHiveMetastoreConfig +{ + private int maxRetries = RetryDriver.DEFAULT_MAX_ATTEMPTS - 1; + private double backoffScaleFactor = RetryDriver.DEFAULT_SCALE_FACTOR; + private Duration minBackoffDelay = RetryDriver.DEFAULT_SLEEP_TIME; + private Duration maxBackoffDelay = RetryDriver.DEFAULT_SLEEP_TIME; + private Duration maxRetryTime = RetryDriver.DEFAULT_MAX_RETRY_TIME; + private Duration maxWaitForTransactionLock = new Duration(10, TimeUnit.MINUTES); + private boolean isRoleNameCaseSensitive; + private boolean impersonationEnabled; + + @Min(0) + public int getMaxRetries() + { + return maxRetries; + } + + @Config("hive.metastore.thrift.client.max-retries") + @ConfigDescription("Maximum number of retry attempts for metastore requests") + public ThriftHiveMetastoreConfig setMaxRetries(int maxRetries) + { + this.maxRetries = maxRetries; + return this; + } + + public double getBackoffScaleFactor() + { + return backoffScaleFactor; + } + + @Config("hive.metastore.thrift.client.backoff-scale-factor") + @ConfigDescription("Scale factor for metastore request retry delay") + public ThriftHiveMetastoreConfig setBackoffScaleFactor(double backoffScaleFactor) + { + this.backoffScaleFactor = backoffScaleFactor; + return this; + } + + @NotNull + public Duration getMaxRetryTime() + { + return maxRetryTime; + } + + @Config("hive.metastore.thrift.client.max-retry-time") + @ConfigDescription("Total time limit for a metastore request to be retried") + public ThriftHiveMetastoreConfig 
setMaxRetryTime(Duration maxRetryTime) + { + this.maxRetryTime = maxRetryTime; + return this; + } + + public Duration getMinBackoffDelay() + { + return minBackoffDelay; + } + + @Config("hive.metastore.thrift.client.min-backoff-delay") + @ConfigDescription("Minimum delay between metastore request retries") + public ThriftHiveMetastoreConfig setMinBackoffDelay(Duration minBackoffDelay) + { + this.minBackoffDelay = minBackoffDelay; + return this; + } + + public Duration getMaxBackoffDelay() + { + return maxBackoffDelay; + } + + @Config("hive.metastore.thrift.client.max-backoff-delay") + @ConfigDescription("Maximum delay between metastore request retries") + public ThriftHiveMetastoreConfig setMaxBackoffDelay(Duration maxBackoffDelay) + { + this.maxBackoffDelay = maxBackoffDelay; + return this; + } + + public Duration getMaxWaitForTransactionLock() + { + return maxWaitForTransactionLock; + } + + @Config("hive.metastore.thrift.txn-lock-max-wait") + @ConfigDescription("Maximum time to wait to acquire hive transaction lock") + public ThriftHiveMetastoreConfig setMaxWaitForTransactionLock(Duration maxWaitForTransactionLock) + { + this.maxWaitForTransactionLock = maxWaitForTransactionLock; + return this; + } + + @Config("hive.metastore.thrift.is-role-name-case-sensitive") + @ConfigDescription("whether the role name to be Case-Sensitive or not, default value false.") + public ThriftHiveMetastoreConfig setRoleNameCaseSensitive(boolean isRoleNameCaseSensitive) + { + this.isRoleNameCaseSensitive = isRoleNameCaseSensitive; + return this; + } + + public boolean isRoleNameCaseSensitive() + { + return isRoleNameCaseSensitive; + } + + public boolean isImpersonationEnabled() + { + return impersonationEnabled; + } + + @Config("hive.metastore.thrift.impersonation.enabled") + @ConfigDescription("Should end user be impersonated when communicating with metastore") + public ThriftHiveMetastoreConfig setImpersonationEnabled(boolean impersonationEnabled) + { + this.impersonationEnabled = impersonationEnabled; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastore.java new file mode 100644 index 00000000..1b6fd8d0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastore.java @@ -0,0 +1,188 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.metastore.api.DataOperationType; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.apache.hadoop.hive.metastore.api.Table; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; + +public interface ThriftMetastore +{ + void createDatabase(HiveIdentity identity, Database database); + + void dropDatabase(HiveIdentity identity, String databaseName); + + void alterDatabase(HiveIdentity identity, String databaseName, Database database); + + void createTable(HiveIdentity identity, Table table); + + void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData); + + void alterTable(HiveIdentity identity, String databaseName, String tableName, Table table); + + List getAllDatabases(); + + Optional> getAllTables(String databaseName); + + Optional> getAllViews(String databaseName); + + Optional getDatabase(String databaseName); + + void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions); + + void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData); + + void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition); + + Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName); + + Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts); + + Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues); + + List getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames); + + Optional
getTable(HiveIdentity identity, String databaseName, String tableName); + + Set getSupportedColumnStatistics(Type type); + + PartitionStatistics getTableStatistics(HiveIdentity identity, Table table); + + Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions); + + void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update); + + void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update); + + void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap); + + void createRole(String role, String grantor); + + void dropRole(String role); + + Set listRoles(); + + void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor); + + void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor); + + Set listRoleGrants(HivePrincipal principal); + + void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges); + + void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges); + + Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal); + + boolean isImpersonationEnabled(); + + default Optional> getFields(HiveIdentity identity, String databaseName, String tableName) + { + Optional
table = getTable(identity, databaseName, tableName); + if (!table.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + + if (table.get().getSd() == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table is missing storage descriptor"); + } + + return Optional.of(table.get().getSd().getCols()); + } + + default long openTransaction(HiveIdentity identity) + { + throw new UnsupportedOperationException(); + } + + default void commitTransaction(HiveIdentity identity, long transactionId) + { + throw new UnsupportedOperationException(); + } + + default void abortTransaction(HiveIdentity identity, long transactionId) + { + throw new UnsupportedOperationException(); + } + + default void sendTransactionHeartbeat(HiveIdentity identity, long transactionId) + { + throw new UnsupportedOperationException(); + } + + default void acquireSharedReadLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions) + { + throw new UnsupportedOperationException(); + } + + default void acquireLock(HiveIdentity identity, String queryId, long transactionId, List fullTables, List partitions, DataOperationType operationType) + { + throw new UnsupportedOperationException(); + } + + default String getValidWriteIds(HiveIdentity identity, List tables, long currentTransactionId, boolean isVacuum) + { + throw new UnsupportedOperationException(); + } + + default ShowLocksResponse showLocks(ShowLocksRequest rqst) + { + throw new UnsupportedOperationException(); + } + + default long getTableWriteId(String dbName, String tableName, long transactionId) + { + throw new UnsupportedOperationException(); + } + + default Optional getConfigValue(String name) + { + return Optional.empty(); + } + + default Set listColumnPrivileges(String databaseName, String tableName, String columnName, + HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + default Set listSchemaPrivileges(String databaseName, String tableName, + HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreApiStats.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreApiStats.java new file mode 100644 index 00000000..0ea7d883 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreApiStats.java @@ -0,0 +1,97 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import io.airlift.stats.CounterStat; +import io.airlift.stats.TimeStat; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.thrift.TBase; +import org.apache.thrift.TException; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +import javax.annotation.concurrent.ThreadSafe; + +import java.util.concurrent.Callable; + +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +@ThreadSafe +public class ThriftMetastoreApiStats +{ + private final TimeStat time = new TimeStat(MILLISECONDS); + private final CounterStat totalFailures = new CounterStat(); + private final CounterStat metastoreExceptions = new CounterStat(); + private final CounterStat thriftExceptions = new CounterStat(); + + public Callable wrap(Callable callable) + { + return () -> { + try (TimeStat.BlockTimer ignored = time.time()) { + return callable.call(); + } + catch (Exception e) { + if (e instanceof MetaException) { + metastoreExceptions.update(1); + // Need to throw here instead of falling through due to JDK-8059299 + totalFailures.update(1); + throw e; + } + + if (e instanceof TException) { + if (e instanceof TBase) { + // This exception is an API response and not a server error + throw e; + } + + thriftExceptions.update(1); + // Need to throw here instead of falling through due to JDK-8059299 + totalFailures.update(1); + throw e; + } + + totalFailures.update(1); + throw e; + } + }; + } + + @Managed + @Nested + public TimeStat getTime() + { + return time; + } + + @Managed + @Nested + public CounterStat getTotalFailures() + { + return totalFailures; + } + + @Managed + @Nested + public CounterStat getThriftExceptions() + { + return thriftExceptions; + } + + @Managed + @Nested + public CounterStat getMetastoreExceptions() + { + return metastoreExceptions; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClient.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClient.java new file mode 100644 index 00000000..2e444d5d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClient.java @@ -0,0 +1,189 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; +import org.apache.hadoop.hive.metastore.api.HiveObjectRef; +import org.apache.hadoop.hive.metastore.api.LockRequest; +import org.apache.hadoop.hive.metastore.api.LockResponse; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.PrincipalType; +import org.apache.hadoop.hive.metastore.api.PrivilegeBag; +import org.apache.hadoop.hive.metastore.api.Role; +import org.apache.hadoop.hive.metastore.api.RolePrincipalGrant; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.thrift.TException; + +import java.io.Closeable; +import java.util.List; +import java.util.Map; + +public interface ThriftMetastoreClient + extends Closeable +{ + @Override + void close(); + + List getAllDatabases() + throws TException; + + Database getDatabase(String databaseName) + throws TException; + + List getAllTables(String databaseName) + throws TException; + + List getTableNamesByFilter(String databaseName, String filter) + throws TException; + + void createDatabase(Database database) + throws TException; + + void dropDatabase(String databaseName, boolean deleteData, boolean cascade) + throws TException; + + void alterDatabase(String databaseName, Database database) + throws TException; + + void createTable(Table table) + throws TException; + + void dropTable(String databaseName, String name, boolean deleteData) + throws TException; + + void alterTable(String databaseName, String tableName, Table newTable) + throws TException; + + Table getTable(String databaseName, String tableName) + throws TException; + + Table getTableWithCapabilities(String databaseName, String tableName) + throws TException; + + List getFields(String databaseName, String tableName) + throws TException; + + List getTableColumnStatistics(String databaseName, String tableName, List columnNames) + throws TException; + + void setTableColumnStatistics(String databaseName, String tableName, List statistics) + throws TException; + + void deleteTableColumnStatistics(String databaseName, String tableName, String columnName) + throws TException; + + Map> getPartitionColumnStatistics(String databaseName, String tableName, List partitionNames, List columnNames) + throws TException; + + void setPartitionColumnStatistics(String databaseName, String tableName, String partitionName, List statistics) + throws TException; + + void deletePartitionColumnStatistics(String databaseName, String tableName, String partitionName, String columnName) + throws TException; + + List getPartitionNames(String databaseName, String tableName) + throws TException; + + List getPartitionNamesFiltered(String databaseName, String tableName, List partitionValues) + throws TException; + + int addPartitions(List newPartitions) + throws TException; + + boolean dropPartition(String databaseName, String tableName, List partitionValues, boolean deleteData) + throws TException; + + void alterPartition(String databaseName, String tableName, Partition partition) + throws TException; + + void alterPartitions(String databaseName, String tableName, List partitions) + throws TException; + + Partition 
getPartition(String databaseName, String tableName, List partitionValues) + throws TException; + + List getPartitionsByNames(String databaseName, String tableName, List partitionNames) + throws TException; + + List listRoles(String principalName, PrincipalType principalType) + throws TException; + + List listPrivileges(String principalName, PrincipalType principalType, HiveObjectRef hiveObjectRef) + throws TException; + + List getRoleNames() + throws TException; + + void createRole(String role, String grantor) + throws TException; + + void dropRole(String role) + throws TException; + + boolean grantPrivileges(PrivilegeBag privilegeBag) + throws TException; + + boolean revokePrivileges(PrivilegeBag privilegeBag) + throws TException; + + void grantRole(String role, String granteeName, PrincipalType granteeType, String grantorName, PrincipalType grantorType, boolean grantOption) + throws TException; + + void revokeRole(String role, String granteeName, PrincipalType granteeType, boolean grantOption) + throws TException; + + List listRoleGrants(String name, PrincipalType principalType) + throws TException; + + void setUGI(String userName) + throws TException; + + long openTransaction(String user) + throws TException; + + void commitTransaction(long transactionId) + throws TException; + + void abortTransaction(long transactionId) + throws TException; + + void sendTransactionHeartbeat(long transactionId) + throws TException; + + LockResponse acquireLock(LockRequest lockRequest) + throws TException; + + LockResponse checkLock(long lockId) + throws TException; + + String getValidWriteIds(List tableList, long currentTransactionId, boolean isVacuum) + throws TException; + + String get_config_value(String name, String defaultValue) + throws TException; + + String getDelegationToken(String userName) + throws TException; + + long getTableWriteId(String dbName, String tableName, long transactionId) + throws TException; + + ShowLocksResponse showLocks(ShowLocksRequest rqst) + throws TException; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClientFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClientFactory.java new file mode 100644 index 00000000..7998caa1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreClientFactory.java @@ -0,0 +1,103 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.net.HostAndPort; +import io.airlift.units.Duration; +import io.hetu.core.common.util.SslSocketUtil; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.authentication.HiveMetastoreAuthentication; +import io.prestosql.plugin.hive.metastore.MetastoreClientFactory; +import io.prestosql.spi.NodeManager; +import io.prestosql.spi.PrestoException; +import org.apache.thrift.transport.TTransportException; + +import javax.inject.Inject; +import javax.net.ssl.SSLContext; + +import java.security.GeneralSecurityException; +import java.util.Optional; + +import static io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +public class ThriftMetastoreClientFactory + implements MetastoreClientFactory +{ + private Optional sslContext; + private Optional socksProxy; + private int timeoutMillis; + private HiveMetastoreAuthentication metastoreAuthentication; + private String hostname; + + public ThriftMetastoreClientFactory( + Optional sslContext, + Optional socksProxy, + Duration timeout, + HiveMetastoreAuthentication metastoreAuthentication, + String hostname) + { + init(sslContext, socksProxy, timeout, metastoreAuthentication, hostname); + } + + @Inject + public ThriftMetastoreClientFactory(HiveConfig config, HiveMetastoreAuthentication metastoreAuthentication, NodeManager nodeManager) + { + try { + init(SslSocketUtil.buildSslContext(config.isTlsEnabled()), + Optional.ofNullable(config.getMetastoreSocksProxy()), + config.getMetastoreTimeout(), + metastoreAuthentication, + nodeManager.getCurrentNode().getHost()); + } + catch (GeneralSecurityException e) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, e.getMessage()); + } + } + + public ThriftMetastoreClientFactory(HiveConfig config, HiveMetastoreAuthentication metastoreAuthentication) + { + try { + init(SslSocketUtil.buildSslContext(config.isTlsEnabled()), + Optional.ofNullable(config.getMetastoreSocksProxy()), + config.getMetastoreTimeout(), + metastoreAuthentication, + "localhost"); + } + catch (GeneralSecurityException e) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, e.getMessage()); + } + } + + public ThriftMetastoreClient create(HostAndPort address) + throws TTransportException + { + return new ThriftHiveMetastoreClient(Transport.create(address, sslContext, socksProxy, timeoutMillis, metastoreAuthentication), hostname); + } + + private void init( + Optional sslContext, + Optional socksProxy, + Duration timeout, + HiveMetastoreAuthentication metastoreAuthentication, + String hostname) + { + this.sslContext = requireNonNull(sslContext, "sslContext is null"); + this.socksProxy = requireNonNull(socksProxy, "socksProxy is null"); + this.timeoutMillis = toIntExact(timeout.toMillis()); + this.metastoreAuthentication = requireNonNull(metastoreAuthentication, "metastoreAuthentication is null"); + this.hostname = requireNonNull(hostname, "hostname is null"); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreModule.java new file mode 100644 index 00000000..5ff862bd --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreModule.java @@ -0,0 
+1,120 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.inject.Binder; +import com.google.inject.Scopes; +import com.google.inject.multibindings.Multibinder; +import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.ForCachingHiveMetastore; +import io.prestosql.plugin.hive.ForRecordingHiveMetastore; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.metastore.CachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.MetastoreClientFactory; +import io.prestosql.plugin.hive.metastore.MetastoreConfig; +import io.prestosql.plugin.hive.metastore.RecordingHiveMetastore; +import io.prestosql.plugin.hive.metastore.WriteHiveMetastoreRecordingProcedure; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.procedure.Procedure; + +import static com.google.inject.multibindings.Multibinder.newSetBinder; +import static io.airlift.configuration.ConfigBinder.configBinder; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftConstants.WHITE_LIST_FOR_METASTORECLIENTFACTORY_CLASS; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftConstants.WHITE_LIST_FOR_THRIFTMETASTORE_CLASS; +import static org.weakref.jmx.guice.ExportBinder.newExporter; + +public class ThriftMetastoreModule + extends AbstractConfigurationAwareModule +{ + private static final Logger log = Logger.get(ThriftMetastoreModule.class); + + @Override + protected void setup(Binder binder) + { + MetastoreConfig config = this.buildConfigObject(MetastoreConfig.class); + try { + // bind MetastoreClientFactory + if (config.getMetastoreClientFactoryImp().isEmpty()) { + log.info("Binding default implementation of MetastoreClientFactory."); + binder.bind(ThriftMetastoreClientFactory.class).in(Scopes.SINGLETON); + binder.bind(MetastoreClientFactory.class).to(ThriftMetastoreClientFactory.class).in(Scopes.SINGLETON); + } + else { + if (!WHITE_LIST_FOR_METASTORECLIENTFACTORY_CLASS.contains(config.getMetastoreClientFactoryImp().trim())) { + throw new PrestoException(HiveErrorCode.HIVE_FILE_NOT_FOUND, "Found illegal class when binding MetastoreClientFactory."); + } + log.info("Binding MetastoreClientFactory.class to %s", config.getMetastoreClientFactoryImp().trim()); + binder.bind(MetastoreClientFactory.class) + .to((Class) Class.forName(config.getMetastoreClientFactoryImp().trim())) + .in(Scopes.SINGLETON); + } + + // bind MetastoreLocator + binder.bind(MetastoreLocator.class).to(StaticMetastoreLocator.class).in(Scopes.SINGLETON); + + // bind ThriftMetastore + if (config.getThriftMetastoreImp().isEmpty()) { + log.info("Binding default implementation of ThriftMetastore."); + binder.bind(ThriftMetastore.class).to(ThriftHiveMetastore.class).in(Scopes.SINGLETON); + } + else { + if 
(!WHITE_LIST_FOR_THRIFTMETASTORE_CLASS.contains(config.getThriftMetastoreImp().trim())) { + throw new PrestoException(HiveErrorCode.HIVE_FILE_NOT_FOUND, "Found illegal class when binding ThriftMetastore."); + } + log.info("Binding ThriftMetastore.class to %s", config.getThriftMetastoreImp().trim()); + binder.bind(ThriftMetastore.class) + .to((Class) Class.forName(config.getThriftMetastoreImp().trim())) + .in(Scopes.SINGLETON); + } + } + catch (ClassNotFoundException e) { + log.error("Failed to bind classes which specified in MetaStore configuration. error: %s", e.getLocalizedMessage()); + throw new PrestoException(HiveErrorCode.HIVE_METASTORE_ERROR, "Failed to bind classes which specified in MetaStore configuration"); + } + + configBinder(binder).bindConfig(ThriftHiveMetastoreConfig.class); + configBinder(binder).bindConfig(StaticMetastoreConfig.class); + + if (buildConfigObject(HiveConfig.class).getRecordingPath() != null) { + binder.bind(HiveMetastore.class) + .annotatedWith(ForRecordingHiveMetastore.class) + .to(BridgingHiveMetastore.class) + .in(Scopes.SINGLETON); + binder.bind(HiveMetastore.class) + .annotatedWith(ForCachingHiveMetastore.class) + .to(RecordingHiveMetastore.class) + .in(Scopes.SINGLETON); + binder.bind(RecordingHiveMetastore.class).in(Scopes.SINGLETON); + newExporter(binder).export(RecordingHiveMetastore.class).withGeneratedName(); + + Multibinder procedures = newSetBinder(binder, Procedure.class); + procedures.addBinding().toProvider(WriteHiveMetastoreRecordingProcedure.class).in(Scopes.SINGLETON); + } + else { + binder.bind(HiveMetastore.class) + .annotatedWith(ForCachingHiveMetastore.class) + .to(BridgingHiveMetastore.class) + .in(Scopes.SINGLETON); + } + + binder.bind(HiveMetastore.class).to(CachingHiveMetastore.class).in(Scopes.SINGLETON); + newExporter(binder).export(ThriftMetastore.class) + .as(generator -> generator.generatedNameOf(ThriftHiveMetastore.class)); + newExporter(binder).export(HiveMetastore.class) + .as(generator -> generator.generatedNameOf(CachingHiveMetastore.class)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreStats.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreStats.java new file mode 100644 index 00000000..9355384f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreStats.java @@ -0,0 +1,317 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +public class ThriftMetastoreStats +{ + private final ThriftMetastoreApiStats getAllDatabases = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getDatabase = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getAllTables = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getAllViews = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getTable = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getFields = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getTableColumnStatistics = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getPartitionColumnStatistics = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getPartitionNames = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getPartitionNamesPs = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getPartition = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats getPartitionsByNames = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats createDatabase = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats dropDatabase = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats alterDatabase = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats createTable = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats dropTable = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats alterTable = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats addPartitions = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats dropPartition = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats alterPartition = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats listTablePrivileges = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats grantTablePrivileges = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats revokeTablePrivileges = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats listRoles = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats grantRole = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats revokeRole = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats listRoleGrants = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats createRole = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats dropRole = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats openTransaction = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats commitTransaction = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats rollbackTransaction = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats acquireLock = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats checkLock = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats validWriteIds = new ThriftMetastoreApiStats(); + private final ThriftMetastoreApiStats tableWriteId = new ThriftMetastoreApiStats(); + + @Managed + @Nested + public ThriftMetastoreApiStats getGetAllDatabases() + { + return getAllDatabases; + } + + @Managed + @Nested + 
public ThriftMetastoreApiStats getGetDatabase() + { + return getDatabase; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetAllTables() + { + return getAllTables; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetAllViews() + { + return getAllViews; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetTable() + { + return getTable; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetFields() + { + return getFields; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetTableColumnStatistics() + { + return getTableColumnStatistics; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetPartitionColumnStatistics() + { + return getPartitionColumnStatistics; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetPartitionNames() + { + return getPartitionNames; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetPartitionNamesPs() + { + return getPartitionNamesPs; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetPartition() + { + return getPartition; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGetPartitionsByNames() + { + return getPartitionsByNames; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getCreateDatabase() + { + return createDatabase; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getDropDatabase() + { + return dropDatabase; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getAlterDatabase() + { + return alterDatabase; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getCreateTable() + { + return createTable; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getDropTable() + { + return dropTable; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getAlterTable() + { + return alterTable; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getAddPartitions() + { + return addPartitions; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getDropPartition() + { + return dropPartition; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getAlterPartition() + { + return alterPartition; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGrantTablePrivileges() + { + return grantTablePrivileges; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getRevokeTablePrivileges() + { + return revokeTablePrivileges; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getListTablePrivileges() + { + return listTablePrivileges; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getListRoles() + { + return listRoles; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getGrantRole() + { + return grantRole; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getRevokeRole() + { + return revokeRole; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getListRoleGrants() + { + return listRoleGrants; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getCreateRole() + { + return createRole; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getDropRole() + { + return dropRole; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getOpenTransaction() + { + return openTransaction; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getCommitTransaction() + { + return commitTransaction; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getRollbackTransaction() + { + return rollbackTransaction; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getAcquireLock() + { + return 
acquireLock; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getCheckLock() + { + return checkLock; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getValidWriteIds() + { + return validWriteIds; + } + + @Managed + @Nested + public ThriftMetastoreApiStats getTableWriteId() + { + return tableWriteId; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java new file mode 100644 index 00000000..573aea3c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/ThriftMetastoreUtil.java @@ -0,0 +1,1000 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Streams; +import com.google.common.primitives.Longs; +import com.google.common.primitives.Shorts; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveBucketProperty; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.security.ConnectorIdentity; +import io.prestosql.spi.security.PrestoPrincipal; +import io.prestosql.spi.security.PrincipalType; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.security.SelectedRole; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.MapType; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; +import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Date; +import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; +import org.apache.hadoop.hive.metastore.api.Decimal; 
+import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; +import org.apache.hadoop.hive.metastore.api.Order; +import org.apache.hadoop.hive.metastore.api.PrincipalPrivilegeSet; +import org.apache.hadoop.hive.metastore.api.PrivilegeGrantInfo; +import org.apache.hadoop.hive.metastore.api.RolePrincipalGrant; +import org.apache.hadoop.hive.metastore.api.SerDeInfo; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import javax.annotation.Nullable; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.time.LocalDate; +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; +import java.util.Queue; +import java.util.Set; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Stream; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.emptyToNull; +import static com.google.common.base.Strings.nullToEmpty; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.prestosql.plugin.hive.HiveMetadata.AVRO_SCHEMA_URL_KEY; +import static io.prestosql.plugin.hive.HiveStorageFormat.AVRO; +import static io.prestosql.plugin.hive.HiveStorageFormat.CSV; +import static io.prestosql.spi.security.PrincipalType.ROLE; +import static io.prestosql.spi.security.PrincipalType.USER; +import static io.prestosql.spi.statistics.ColumnStatisticType.MAX_VALUE; +import static io.prestosql.spi.statistics.ColumnStatisticType.MAX_VALUE_SIZE_IN_BYTES; +import static io.prestosql.spi.statistics.ColumnStatisticType.MIN_VALUE; +import static io.prestosql.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; +import static io.prestosql.spi.statistics.ColumnStatisticType.NUMBER_OF_NON_NULL_VALUES; +import static io.prestosql.spi.statistics.ColumnStatisticType.NUMBER_OF_TRUE_VALUES; +import static io.prestosql.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.Chars.isCharType; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static java.lang.Math.round; +import static java.lang.String.format; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static 
org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.binaryStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.booleanStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.dateStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.decimalStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.doubleStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.longStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.stringStats; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE; + +public final class ThriftMetastoreUtil +{ + private static final String PUBLIC_ROLE_NAME = "public"; + private static final String ADMIN_ROLE_NAME = "admin"; + private static final String NUM_FILES = "numFiles"; + private static final String NUM_ROWS = "numRows"; + private static final String RAW_DATA_SIZE = "rawDataSize"; + private static final String TOTAL_SIZE = "totalSize"; + private static final Set STATS_PROPERTIES = ImmutableSet.of(NUM_FILES, NUM_ROWS, RAW_DATA_SIZE, TOTAL_SIZE); + + private ThriftMetastoreUtil() {} + + public static org.apache.hadoop.hive.metastore.api.Database toMetastoreApiDatabase(Database database) + { + org.apache.hadoop.hive.metastore.api.Database result = new org.apache.hadoop.hive.metastore.api.Database(); + result.setName(database.getDatabaseName()); + database.getLocation().ifPresent(result::setLocationUri); + result.setOwnerName(database.getOwnerName()); + result.setOwnerType(toMetastoreApiPrincipalType(database.getOwnerType())); + database.getComment().ifPresent(result::setDescription); + result.setParameters(database.getParameters()); + return result; + } + + public static org.apache.hadoop.hive.metastore.api.Table toMetastoreApiTable(Table table, PrincipalPrivileges privileges) + { + org.apache.hadoop.hive.metastore.api.Table result = toMetastoreApiTable(table); + result.setPrivileges(toMetastoreApiPrincipalPrivilegeSet(privileges)); + return result; + } + + static org.apache.hadoop.hive.metastore.api.Table toMetastoreApiTable(Table table) + { + org.apache.hadoop.hive.metastore.api.Table result = new org.apache.hadoop.hive.metastore.api.Table(); + result.setDbName(table.getDatabaseName()); + result.setTableName(table.getTableName()); + result.setOwner(table.getOwner()); + result.setTableType(table.getTableType()); + result.setParameters(table.getParameters()); + result.setPartitionKeys(table.getPartitionColumns().stream().map(ThriftMetastoreUtil::toMetastoreApiFieldSchema).collect(toList())); + result.setSd(makeStorageDescriptor(table.getTableName(), table.getDataColumns(), table.getStorage())); + result.setViewOriginalText(table.getViewOriginalText().orElse(null)); + result.setViewExpandedText(table.getViewExpandedText().orElse(null)); + return result; + } + + private static PrincipalPrivilegeSet toMetastoreApiPrincipalPrivilegeSet(PrincipalPrivileges privileges) + { + ImmutableMap.Builder> userPrivileges = ImmutableMap.builder(); + for (Map.Entry> entry : privileges.getUserPrivileges().asMap().entrySet()) { + userPrivileges.put(entry.getKey(), entry.getValue().stream() + .map(ThriftMetastoreUtil::toMetastoreApiPrivilegeGrantInfo) + .collect(toList())); + } + + ImmutableMap.Builder> rolePrivileges = ImmutableMap.builder(); + for (Map.Entry> entry : privileges.getRolePrivileges().asMap().entrySet()) { + rolePrivileges.put(entry.getKey(), entry.getValue().stream() + 
.map(ThriftMetastoreUtil::toMetastoreApiPrivilegeGrantInfo) + .collect(toList())); + } + + return new PrincipalPrivilegeSet(userPrivileges.build(), ImmutableMap.of(), rolePrivileges.build()); + } + + public static PrivilegeGrantInfo toMetastoreApiPrivilegeGrantInfo(HivePrivilegeInfo privilegeInfo) + { + return new PrivilegeGrantInfo( + privilegeInfo.getHivePrivilege().name().toLowerCase(Locale.ENGLISH), + 0, + privilegeInfo.getGrantor().getName(), + fromPrestoPrincipalType(privilegeInfo.getGrantor().getType()), + privilegeInfo.isGrantOption()); + } + + private static org.apache.hadoop.hive.metastore.api.PrincipalType toMetastoreApiPrincipalType(PrincipalType principalType) + { + switch (principalType) { + case USER: + return org.apache.hadoop.hive.metastore.api.PrincipalType.USER; + case ROLE: + return org.apache.hadoop.hive.metastore.api.PrincipalType.ROLE; + default: + throw new IllegalArgumentException("Unsupported principal type: " + principalType); + } + } + + public static Stream listApplicableRoles(HivePrincipal principal, Function> listRoleGrants) + { + Queue queue = new ArrayDeque<>(); + queue.add(principal); + Queue output = new ArrayDeque<>(); + Set seenRoles = new HashSet<>(); + return Streams.stream(new AbstractIterator() + { + @Override + protected RoleGrant computeNext() + { + if (!output.isEmpty()) { + return output.remove(); + } + if (queue.isEmpty()) { + return endOfData(); + } + + while (!queue.isEmpty()) { + Set grants = listRoleGrants.apply(queue.remove()); + if (!grants.isEmpty()) { + for (RoleGrant grant : grants) { + if (seenRoles.add(grant)) { + output.add(grant); + queue.add(new HivePrincipal(ROLE, grant.getRoleName())); + } + } + break; + } + } + if (output.isEmpty()) { + return endOfData(); + } + return output.remove(); + } + }); + } + + public static boolean isRoleApplicable(SemiTransactionalHiveMetastore metastore, HivePrincipal principal, String role) + { + if (principal.getType() == ROLE && principal.getName().equals(role)) { + return true; + } + return listApplicableRoles(metastore, principal) + .anyMatch(role::equals); + } + + public static Stream listApplicableRoles(SemiTransactionalHiveMetastore metastore, HivePrincipal principal) + { + return listApplicableRoles(principal, metastore::listRoleGrants) + .map(RoleGrant::getRoleName); + } + + public static Stream listEnabledPrincipals(SemiTransactionalHiveMetastore metastore, ConnectorIdentity identity) + { + return Stream.concat( + Stream.of(new HivePrincipal(USER, identity.getUser())), + listEnabledRoles(identity, metastore::listRoleGrants) + .map(role -> new HivePrincipal(ROLE, role))); + } + + public static Stream listEnabledTablePrivileges(SemiTransactionalHiveMetastore metastore, String databaseName, String tableName, ConnectorIdentity identity) + { + return listTablePrivileges(metastore, databaseName, tableName, listEnabledPrincipals(metastore, identity)); + } + + public static Stream listApplicableTablePrivileges(SemiTransactionalHiveMetastore metastore, String databaseName, String tableName, String user) + { + HivePrincipal userPrincipal = new HivePrincipal(USER, user); + Stream principals = Stream.concat( + Stream.of(userPrincipal), + listApplicableRoles(metastore, userPrincipal) + .map(role -> new HivePrincipal(ROLE, role))); + return listTablePrivileges(metastore, databaseName, tableName, principals); + } + + private static Stream listTablePrivileges(SemiTransactionalHiveMetastore metastore, String databaseName, String tableName, Stream principals) + { + return principals.flatMap(principal -> 
metastore.listTablePrivileges(databaseName, tableName, principal).stream()); + } + + public static boolean isRoleEnabled(ConnectorIdentity identity, Function> listRoleGrants, String role) + { + if (role.equals(PUBLIC_ROLE_NAME)) { + return true; + } + + if (identity.getRole().isPresent() && identity.getRole().get().getType() == SelectedRole.Type.NONE) { + return false; + } + + HivePrincipal principal; + if (!identity.getRole().isPresent() || identity.getRole().get().getType() == SelectedRole.Type.ALL) { + principal = new HivePrincipal(USER, identity.getUser()); + } + else { + principal = new HivePrincipal(ROLE, identity.getRole().get().getRole().get()); + } + + if (principal.getType() == ROLE && principal.getName().equals(role)) { + return true; + } + + if (role.equals(ADMIN_ROLE_NAME)) { + // The admin role must be enabled explicitly, and so it should checked above + return false; + } + + // all the above code could be removed and method semantic would remain the same, however it would be more expensive for some negative cases (see above) + return listEnabledRoles(identity, listRoleGrants) + .anyMatch(role::equals); + } + + public static Stream listEnabledRoles(ConnectorIdentity identity, Function> listRoleGrants) + { + Optional role = identity.getRole(); + if (role.isPresent() && role.get().getType() == SelectedRole.Type.NONE) { + return Stream.of(PUBLIC_ROLE_NAME); + } + HivePrincipal principal; + if (!role.isPresent() || role.get().getType() == SelectedRole.Type.ALL) { + principal = new HivePrincipal(USER, identity.getUser()); + } + else { + principal = new HivePrincipal(ROLE, role.get().getRole().get()); + } + + Stream roles = Stream.of(PUBLIC_ROLE_NAME); + + if (principal.getType() == ROLE) { + roles = Stream.concat(roles, Stream.of(principal.getName())); + } + + return Stream.concat( + roles, + listApplicableRoles(principal, listRoleGrants) + .map(RoleGrant::getRoleName) + // The admin role must be enabled explicitly. If it is, it was added above. 
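+                        // All other transitively granted roles are enabled, so only "admin" is dropped by the filter below.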
+ .filter(Predicate.isEqual(ADMIN_ROLE_NAME).negate())); + } + + public static org.apache.hadoop.hive.metastore.api.Partition toMetastoreApiPartition(PartitionWithStatistics partitionWithStatistics) + { + org.apache.hadoop.hive.metastore.api.Partition partition = toMetastoreApiPartition(partitionWithStatistics.getPartition()); + partition.setParameters(updateStatisticsParameters(partition.getParameters(), partitionWithStatistics.getStatistics().getBasicStatistics())); + return partition; + } + + public static org.apache.hadoop.hive.metastore.api.Partition toMetastoreApiPartition(Partition partition) + { + org.apache.hadoop.hive.metastore.api.Partition result = new org.apache.hadoop.hive.metastore.api.Partition(); + result.setDbName(partition.getDatabaseName()); + result.setTableName(partition.getTableName()); + result.setValues(partition.getValues()); + result.setSd(makeStorageDescriptor(partition.getTableName(), partition.getColumns(), partition.getStorage())); + result.setParameters(partition.getParameters()); + return result; + } + + public static Database fromMetastoreApiDatabase(org.apache.hadoop.hive.metastore.api.Database database) + { + String ownerName = "PUBLIC"; + PrincipalType ownerType = ROLE; + if (database.getOwnerName() != null) { + ownerName = database.getOwnerName(); + ownerType = fromMetastoreApiPrincipalType(database.getOwnerType()); + } + + Map parameters = database.getParameters(); + if (parameters == null) { + parameters = ImmutableMap.of(); + } + + return Database.builder() + .setDatabaseName(database.getName()) + .setLocation(Optional.ofNullable(database.getLocationUri())) + .setOwnerName(ownerName) + .setOwnerType(ownerType) + .setComment(Optional.ofNullable(database.getDescription())) + .setParameters(parameters) + .build(); + } + + public static Table fromMetastoreApiTable(org.apache.hadoop.hive.metastore.api.Table table) + { + StorageDescriptor storageDescriptor = table.getSd(); + if (storageDescriptor == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table is missing storage descriptor"); + } + return fromMetastoreApiTable(table, storageDescriptor.getCols()); + } + + public static Table fromMetastoreApiTable(org.apache.hadoop.hive.metastore.api.Table table, List schema) + { + StorageDescriptor storageDescriptor = table.getSd(); + if (storageDescriptor == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table is missing storage descriptor"); + } + + Table.Builder tableBuilder = Table.builder() + .setDatabaseName(table.getDbName()) + .setTableName(table.getTableName()) + .setOwner(nullToEmpty(table.getOwner())) + .setTableType(table.getTableType()) + .setDataColumns(schema.stream() + .map(ThriftMetastoreUtil::fromMetastoreApiFieldSchema) + .collect(toList())) + .setPartitionColumns(table.getPartitionKeys().stream() + .map(ThriftMetastoreUtil::fromMetastoreApiFieldSchema) + .collect(toList())) + .setParameters(table.getParameters() == null ? 
ImmutableMap.of() : table.getParameters()) + .setViewOriginalText(Optional.ofNullable(emptyToNull(table.getViewOriginalText()))) + .setViewExpandedText(Optional.ofNullable(emptyToNull(table.getViewExpandedText()))); + + fromMetastoreApiStorageDescriptor(table.getParameters(), storageDescriptor, tableBuilder.getStorageBuilder(), table.getTableName()); + + return tableBuilder.build(); + } + + public static boolean isAvroTableWithSchemaSet(org.apache.hadoop.hive.metastore.api.Table table) + { + if (table.getParameters() == null) { + return false; + } + SerDeInfo serdeInfo = getSerdeInfo(table); + + return serdeInfo.getSerializationLib() != null && + (table.getParameters().get(AVRO_SCHEMA_URL_KEY) != null || + (serdeInfo.getParameters() != null && serdeInfo.getParameters().get(AVRO_SCHEMA_URL_KEY) != null)) && + serdeInfo.getSerializationLib().equals(AVRO.getSerDe()); + } + + public static boolean isCsvTable(org.apache.hadoop.hive.metastore.api.Table table) + { + return CSV.getSerDe().equals(getSerdeInfo(table).getSerializationLib()); + } + + private static SerDeInfo getSerdeInfo(org.apache.hadoop.hive.metastore.api.Table table) + { + StorageDescriptor storageDescriptor = table.getSd(); + if (storageDescriptor == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table does not contain a storage descriptor: " + table); + } + SerDeInfo serdeInfo = storageDescriptor.getSerdeInfo(); + if (serdeInfo == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table storage descriptor is missing SerDe info"); + } + + return serdeInfo; + } + + public static Partition fromMetastoreApiPartition(org.apache.hadoop.hive.metastore.api.Partition partition) + { + StorageDescriptor storageDescriptor = partition.getSd(); + if (storageDescriptor == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Partition does not contain a storage descriptor: " + partition); + } + + return fromMetastoreApiPartition(partition, storageDescriptor.getCols()); + } + + public static Partition fromMetastoreApiPartition(org.apache.hadoop.hive.metastore.api.Partition partition, List schema) + { + StorageDescriptor storageDescriptor = partition.getSd(); + if (storageDescriptor == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Partition does not contain a storage descriptor: " + partition); + } + + Partition.Builder partitionBuilder = Partition.builder() + .setDatabaseName(partition.getDbName()) + .setTableName(partition.getTableName()) + .setValues(partition.getValues()) + .setColumns(schema.stream() + .map(ThriftMetastoreUtil::fromMetastoreApiFieldSchema) + .collect(toList())) + .setParameters(partition.getParameters()); + + // TODO is bucketing_version set on partition level?? + fromMetastoreApiStorageDescriptor( + partition.getParameters(), + storageDescriptor, + partitionBuilder.getStorageBuilder(), + format("%s.%s", partition.getTableName(), partition.getValues())); + + return partitionBuilder.build(); + } + + public static HiveColumnStatistics fromMetastoreApiColumnStatistics(ColumnStatisticsObj columnStatistics, OptionalLong rowCount) + { + if (columnStatistics.getStatsData().isSetLongStats()) { + LongColumnStatsData longStatsData = columnStatistics.getStatsData().getLongStats(); + OptionalLong min = longStatsData.isSetLowValue() ? OptionalLong.of(longStatsData.getLowValue()) : OptionalLong.empty(); + OptionalLong max = longStatsData.isSetHighValue() ? 
OptionalLong.of(longStatsData.getHighValue()) : OptionalLong.empty(); + OptionalLong nullsCount = longStatsData.isSetNumNulls() ? fromMetastoreNullsCount(longStatsData.getNumNulls()) : OptionalLong.empty(); + OptionalLong distinctValuesCount = longStatsData.isSetNumDVs() ? OptionalLong.of(longStatsData.getNumDVs()) : OptionalLong.empty(); + return HiveColumnStatistics.createIntegerColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); + } + if (columnStatistics.getStatsData().isSetDoubleStats()) { + DoubleColumnStatsData doubleStatsData = columnStatistics.getStatsData().getDoubleStats(); + OptionalDouble min = doubleStatsData.isSetLowValue() ? OptionalDouble.of(doubleStatsData.getLowValue()) : OptionalDouble.empty(); + OptionalDouble max = doubleStatsData.isSetHighValue() ? OptionalDouble.of(doubleStatsData.getHighValue()) : OptionalDouble.empty(); + OptionalLong nullsCount = doubleStatsData.isSetNumNulls() ? fromMetastoreNullsCount(doubleStatsData.getNumNulls()) : OptionalLong.empty(); + OptionalLong distinctValuesCount = doubleStatsData.isSetNumDVs() ? OptionalLong.of(doubleStatsData.getNumDVs()) : OptionalLong.empty(); + return HiveColumnStatistics.createDoubleColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); + } + if (columnStatistics.getStatsData().isSetDecimalStats()) { + DecimalColumnStatsData decimalStatsData = columnStatistics.getStatsData().getDecimalStats(); + Optional min = decimalStatsData.isSetLowValue() ? fromMetastoreDecimal(decimalStatsData.getLowValue()) : Optional.empty(); + Optional max = decimalStatsData.isSetHighValue() ? fromMetastoreDecimal(decimalStatsData.getHighValue()) : Optional.empty(); + OptionalLong nullsCount = decimalStatsData.isSetNumNulls() ? fromMetastoreNullsCount(decimalStatsData.getNumNulls()) : OptionalLong.empty(); + OptionalLong distinctValuesCount = decimalStatsData.isSetNumDVs() ? OptionalLong.of(decimalStatsData.getNumDVs()) : OptionalLong.empty(); + return HiveColumnStatistics.createDecimalColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); + } + if (columnStatistics.getStatsData().isSetDateStats()) { + DateColumnStatsData dateStatsData = columnStatistics.getStatsData().getDateStats(); + Optional min = dateStatsData.isSetLowValue() ? fromMetastoreDate(dateStatsData.getLowValue()) : Optional.empty(); + Optional max = dateStatsData.isSetHighValue() ? fromMetastoreDate(dateStatsData.getHighValue()) : Optional.empty(); + OptionalLong nullsCount = dateStatsData.isSetNumNulls() ? fromMetastoreNullsCount(dateStatsData.getNumNulls()) : OptionalLong.empty(); + OptionalLong distinctValuesCount = dateStatsData.isSetNumDVs() ? 
OptionalLong.of(dateStatsData.getNumDVs()) : OptionalLong.empty(); + return HiveColumnStatistics.createDateColumnStatistics(min, max, nullsCount, fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); + } + if (columnStatistics.getStatsData().isSetBooleanStats()) { + BooleanColumnStatsData booleanStatsData = columnStatistics.getStatsData().getBooleanStats(); + OptionalLong trueCount = OptionalLong.empty(); + OptionalLong falseCount = OptionalLong.empty(); + // Impala 'COMPUTE STATS' writes 1 as the numTrue and -1 as the numFalse + if (booleanStatsData.isSetNumTrues() && booleanStatsData.isSetNumFalses() && (booleanStatsData.getNumFalses() != -1)) { + trueCount = OptionalLong.of(booleanStatsData.getNumTrues()); + falseCount = OptionalLong.of(booleanStatsData.getNumFalses()); + } + return HiveColumnStatistics.createBooleanColumnStatistics( + trueCount, + falseCount, + booleanStatsData.isSetNumNulls() ? fromMetastoreNullsCount(booleanStatsData.getNumNulls()) : OptionalLong.empty()); + } + if (columnStatistics.getStatsData().isSetStringStats()) { + StringColumnStatsData stringStatsData = columnStatistics.getStatsData().getStringStats(); + OptionalLong maxColumnLength = stringStatsData.isSetMaxColLen() ? OptionalLong.of(stringStatsData.getMaxColLen()) : OptionalLong.empty(); + OptionalDouble averageColumnLength = stringStatsData.isSetAvgColLen() ? OptionalDouble.of(stringStatsData.getAvgColLen()) : OptionalDouble.empty(); + OptionalLong nullsCount = stringStatsData.isSetNumNulls() ? fromMetastoreNullsCount(stringStatsData.getNumNulls()) : OptionalLong.empty(); + OptionalLong distinctValuesCount = stringStatsData.isSetNumDVs() ? OptionalLong.of(stringStatsData.getNumDVs()) : OptionalLong.empty(); + return HiveColumnStatistics.createStringColumnStatistics( + maxColumnLength, + getTotalSizeInBytes(averageColumnLength, rowCount, nullsCount), + nullsCount, + fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount)); + } + if (columnStatistics.getStatsData().isSetBinaryStats()) { + BinaryColumnStatsData binaryStatsData = columnStatistics.getStatsData().getBinaryStats(); + OptionalLong maxColumnLength = binaryStatsData.isSetMaxColLen() ? OptionalLong.of(binaryStatsData.getMaxColLen()) : OptionalLong.empty(); + OptionalDouble averageColumnLength = binaryStatsData.isSetAvgColLen() ? OptionalDouble.of(binaryStatsData.getAvgColLen()) : OptionalDouble.empty(); + OptionalLong nullsCount = binaryStatsData.isSetNumNulls() ? fromMetastoreNullsCount(binaryStatsData.getNumNulls()) : OptionalLong.empty(); + return HiveColumnStatistics.createBinaryColumnStatistics( + maxColumnLength, + getTotalSizeInBytes(averageColumnLength, rowCount, nullsCount), + nullsCount); + } + else { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Invalid column statistics data: " + columnStatistics); + } + } + + private static Optional fromMetastoreDate(Date date) + { + if (date == null) { + return Optional.empty(); + } + return Optional.of(LocalDate.ofEpochDay(date.getDaysSinceEpoch())); + } + + /** + * Impala `COMPUTE STATS` will write -1 as the null count. 
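+     * A null count of -1 is therefore treated as unknown; for example, {@code fromMetastoreNullsCount(-1)}
+     * returns {@code OptionalLong.empty()} rather than a zero count.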
+ * + * @see IMPALA-7497 + */ + private static OptionalLong fromMetastoreNullsCount(long nullsCount) + { + if (nullsCount == -1L) { + return OptionalLong.empty(); + } + return OptionalLong.of(nullsCount); + } + + private static Optional fromMetastoreDecimal(@Nullable Decimal decimal) + { + if (decimal == null) { + return Optional.empty(); + } + return Optional.of(new BigDecimal(new BigInteger(decimal.getUnscaled()), decimal.getScale())); + } + + private static OptionalLong getTotalSizeInBytes(OptionalDouble averageColumnLength, OptionalLong rowCount, OptionalLong nullsCount) + { + if (averageColumnLength.isPresent() && rowCount.isPresent() && nullsCount.isPresent()) { + long nonNullsCount = rowCount.getAsLong() - nullsCount.getAsLong(); + if (nonNullsCount < 0) { + return OptionalLong.empty(); + } + return OptionalLong.of(round(averageColumnLength.getAsDouble() * nonNullsCount)); + } + return OptionalLong.empty(); + } + + /** + * Hive calculates NDV considering null as a distinct value + */ + private static OptionalLong fromMetastoreDistinctValuesCount(OptionalLong distinctValuesCount, OptionalLong nullsCount, OptionalLong rowCount) + { + if (distinctValuesCount.isPresent() && nullsCount.isPresent() && rowCount.isPresent()) { + return OptionalLong.of(fromMetastoreDistinctValuesCount(distinctValuesCount.getAsLong(), nullsCount.getAsLong(), rowCount.getAsLong())); + } + return OptionalLong.empty(); + } + + private static long fromMetastoreDistinctValuesCount(long distinctValuesCount, long nullsCount, long rowCount) + { + long nonNullsCount = rowCount - nullsCount; + if (nullsCount > 0 && distinctValuesCount > 0) { + distinctValuesCount--; + } + + // normalize distinctValuesCount in case there is a non null element + if (nonNullsCount > 0 && distinctValuesCount == 0) { + distinctValuesCount = 1; + } + + // the metastore may store an estimate, so the value stored may be higher than the total number of rows + if (distinctValuesCount > nonNullsCount) { + return nonNullsCount; + } + return distinctValuesCount; + } + + public static Set fromRolePrincipalGrants(Collection grants, boolean isRoleNameCaseSensitive) + { + return ImmutableSet.copyOf(grants.stream().map(grant -> fromRolePrincipalGrant(grant, isRoleNameCaseSensitive)).collect(toList())); + } + + private static RoleGrant fromRolePrincipalGrant(RolePrincipalGrant grant, boolean isRoleNameCaseSensitive) + { + return new RoleGrant( + new PrestoPrincipal(fromMetastoreApiPrincipalType(grant.getPrincipalType()), grant.getPrincipalName()), + isRoleNameCaseSensitive ? 
grant.getRoleName() : grant.getRoleName().toLowerCase(ENGLISH), + grant.isGrantOption()); + } + + public static org.apache.hadoop.hive.metastore.api.PrincipalType fromPrestoPrincipalType(PrincipalType principalType) + { + switch (principalType) { + case USER: + return org.apache.hadoop.hive.metastore.api.PrincipalType.USER; + case ROLE: + return org.apache.hadoop.hive.metastore.api.PrincipalType.ROLE; + default: + throw new IllegalArgumentException("Unsupported principal type: " + principalType); + } + } + + public static PrincipalType fromMetastoreApiPrincipalType(org.apache.hadoop.hive.metastore.api.PrincipalType principalType) + { + requireNonNull(principalType, "principalType is null"); + switch (principalType) { + case USER: + return USER; + case ROLE: + return ROLE; + default: + throw new IllegalArgumentException("Unsupported principal type: " + principalType); + } + } + + private static FieldSchema toMetastoreApiFieldSchema(Column column) + { + return new FieldSchema(column.getName(), column.getType().getHiveTypeName().toString(), column.getComment().orElse(null)); + } + + private static Column fromMetastoreApiFieldSchema(FieldSchema fieldSchema) + { + return new Column(fieldSchema.getName(), HiveType.valueOf(fieldSchema.getType()), Optional.ofNullable(emptyToNull(fieldSchema.getComment()))); + } + + private static void fromMetastoreApiStorageDescriptor( + Map tableParameters, + StorageDescriptor storageDescriptor, + Storage.Builder builder, + String tablePartitionName) + { + SerDeInfo serdeInfo = storageDescriptor.getSerdeInfo(); + if (serdeInfo == null) { + throw new PrestoException(HiveErrorCode.HIVE_INVALID_METADATA, "Table storage descriptor is missing SerDe info"); + } + + builder.setStorageFormat(StorageFormat.createNullable(serdeInfo.getSerializationLib(), storageDescriptor.getInputFormat(), storageDescriptor.getOutputFormat())) + .setLocation(nullToEmpty(storageDescriptor.getLocation())) + .setBucketProperty(HiveBucketProperty.fromStorageDescriptor(tableParameters, storageDescriptor, tablePartitionName)) + .setSkewed(storageDescriptor.isSetSkewedInfo() && storageDescriptor.getSkewedInfo().isSetSkewedColNames() && !storageDescriptor.getSkewedInfo().getSkewedColNames().isEmpty()) + .setSerdeParameters(serdeInfo.getParameters() == null ? 
ImmutableMap.of() : serdeInfo.getParameters()); + } + + private static StorageDescriptor makeStorageDescriptor(String tableName, List columns, Storage storage) + { + if (storage.isSkewed()) { + throw new IllegalArgumentException("Writing to skewed table/partition is not supported"); + } + SerDeInfo serdeInfo = new SerDeInfo(); + serdeInfo.setName(tableName); + serdeInfo.setSerializationLib(storage.getStorageFormat().getSerDeNullable()); + serdeInfo.setParameters(storage.getSerdeParameters()); + + StorageDescriptor sd = new StorageDescriptor(); + sd.setLocation(emptyToNull(storage.getLocation())); + sd.setCols(columns.stream() + .map(ThriftMetastoreUtil::toMetastoreApiFieldSchema) + .collect(toList())); + sd.setSerdeInfo(serdeInfo); + sd.setInputFormat(storage.getStorageFormat().getInputFormatNullable()); + sd.setOutputFormat(storage.getStorageFormat().getOutputFormatNullable()); + sd.setParameters(ImmutableMap.of()); + + Optional bucketProperty = storage.getBucketProperty(); + if (bucketProperty.isPresent()) { + sd.setNumBuckets(bucketProperty.get().getBucketCount()); + sd.setBucketCols(bucketProperty.get().getBucketedBy()); + if (!bucketProperty.get().getSortedBy().isEmpty()) { + sd.setSortCols(bucketProperty.get().getSortedBy().stream() + .map(column -> new Order(column.getColumnName(), column.getOrder().getHiveOrder())) + .collect(toList())); + } + } + else { + //Hive update checks for -1 even though no bucketing property set and fails if its 0. + sd.setNumBuckets(-1); + } + + return sd; + } + + public static Set parsePrivilege(PrivilegeGrantInfo userGrant, Optional grantee) + { + boolean withGrantOption = userGrant.isGrantOption(); + String name = userGrant.getPrivilege().toUpperCase(ENGLISH); + HivePrincipal grantor = new HivePrincipal(fromMetastoreApiPrincipalType(userGrant.getGrantorType()), userGrant.getGrantor()); + switch (name) { + case "ALL": + return Arrays.stream(HivePrivilegeInfo.HivePrivilege.values()) + .map(hivePrivilege -> new HivePrivilegeInfo(hivePrivilege, withGrantOption, grantor, grantee.orElse(grantor))) + .collect(toImmutableSet()); + case "SELECT": + return ImmutableSet.of(new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.SELECT, withGrantOption, grantor, grantee.orElse(grantor))); + case "INSERT": + return ImmutableSet.of(new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.INSERT, withGrantOption, grantor, grantee.orElse(grantor))); + case "UPDATE": + return ImmutableSet.of(new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.UPDATE, withGrantOption, grantor, grantee.orElse(grantor))); + case "DELETE": + return ImmutableSet.of(new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.DELETE, withGrantOption, grantor, grantee.orElse(grantor))); + case "OWNERSHIP": + return ImmutableSet.of(new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.OWNERSHIP, withGrantOption, grantor, grantee.orElse(grantor))); + default: + throw new IllegalArgumentException("Unsupported privilege name: " + name); + } + } + + public static HiveBasicStatistics getHiveBasicStatistics(Map parameters) + { + OptionalLong numFiles = parse(parameters.get(NUM_FILES)); + OptionalLong numRows = parse(parameters.get(NUM_ROWS)); + OptionalLong inMemoryDataSizeInBytes = parse(parameters.get(RAW_DATA_SIZE)); + OptionalLong onDiskDataSizeInBytes = parse(parameters.get(TOTAL_SIZE)); + return new HiveBasicStatistics(numFiles, numRows, inMemoryDataSizeInBytes, onDiskDataSizeInBytes); + } + + private static OptionalLong parse(@Nullable String parameterValue) + { + if (parameterValue == null) { + return 
OptionalLong.empty(); + } + Long longValue = Longs.tryParse(parameterValue); + if (longValue == null || longValue < 0) { + return OptionalLong.empty(); + } + return OptionalLong.of(longValue); + } + + public static Map updateStatisticsParameters(Map parameters, HiveBasicStatistics statistics) + { + ImmutableMap.Builder result = ImmutableMap.builder(); + + parameters.forEach((key, value) -> { + if (!STATS_PROPERTIES.contains(key)) { + result.put(key, value); + } + }); + + statistics.getFileCount().ifPresent(count -> result.put(NUM_FILES, Long.toString(count))); + statistics.getRowCount().ifPresent(count -> result.put(NUM_ROWS, Long.toString(count))); + statistics.getInMemoryDataSizeInBytes().ifPresent(size -> result.put(RAW_DATA_SIZE, Long.toString(size))); + statistics.getOnDiskDataSizeInBytes().ifPresent(size -> result.put(TOTAL_SIZE, Long.toString(size))); + + return result.build(); + } + + public static ColumnStatisticsObj createMetastoreColumnStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics, OptionalLong rowCount) + { + TypeInfo typeInfo = columnType.getTypeInfo(); + checkArgument(typeInfo.getCategory() == PRIMITIVE, "unsupported type: %s", columnType); + switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) { + case BOOLEAN: + return createBooleanStatistics(columnName, columnType, statistics); + case BYTE: + case SHORT: + case INT: + case LONG: + return createLongStatistics(columnName, columnType, statistics); + case FLOAT: + case DOUBLE: + return createDoubleStatistics(columnName, columnType, statistics); + case STRING: + case VARCHAR: + case CHAR: + return createStringStatistics(columnName, columnType, statistics, rowCount); + case DATE: + return createDateStatistics(columnName, columnType, statistics); + case TIMESTAMP: + return createLongStatistics(columnName, columnType, statistics); + case BINARY: + return createBinaryStatistics(columnName, columnType, statistics, rowCount); + case DECIMAL: + return createDecimalStatistics(columnName, columnType, statistics); + default: + throw new IllegalArgumentException(format("unsupported type: %s", columnType)); + } + } + + private static ColumnStatisticsObj createBooleanStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) + { + BooleanColumnStatsData data = new BooleanColumnStatsData(); + statistics.getNullsCount().ifPresent(data::setNumNulls); + statistics.getBooleanStatistics().ifPresent(booleanStatistics -> { + booleanStatistics.getFalseCount().ifPresent(data::setNumFalses); + booleanStatistics.getTrueCount().ifPresent(data::setNumTrues); + }); + return new ColumnStatisticsObj(columnName, columnType.toString(), booleanStats(data)); + } + + private static ColumnStatisticsObj createLongStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) + { + LongColumnStatsData data = new LongColumnStatsData(); + statistics.getIntegerStatistics().ifPresent(integerStatistics -> { + integerStatistics.getMin().ifPresent(data::setLowValue); + integerStatistics.getMax().ifPresent(data::setHighValue); + }); + statistics.getNullsCount().ifPresent(data::setNumNulls); + toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); + return new ColumnStatisticsObj(columnName, columnType.toString(), longStats(data)); + } + + private static ColumnStatisticsObj createDoubleStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) + { + DoubleColumnStatsData data = new 
DoubleColumnStatsData(); + statistics.getDoubleStatistics().ifPresent(doubleStatistics -> { + doubleStatistics.getMin().ifPresent(data::setLowValue); + doubleStatistics.getMax().ifPresent(data::setHighValue); + }); + statistics.getNullsCount().ifPresent(data::setNumNulls); + toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); + return new ColumnStatisticsObj(columnName, columnType.toString(), doubleStats(data)); + } + + private static ColumnStatisticsObj createStringStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics, OptionalLong rowCount) + { + StringColumnStatsData data = new StringColumnStatsData(); + statistics.getNullsCount().ifPresent(data::setNumNulls); + toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); + data.setMaxColLen(statistics.getMaxValueSizeInBytes().orElse(0)); + data.setAvgColLen(getAverageColumnLength(statistics.getTotalSizeInBytes(), rowCount, statistics.getNullsCount()).orElse(0)); + return new ColumnStatisticsObj(columnName, columnType.toString(), stringStats(data)); + } + + private static ColumnStatisticsObj createDateStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) + { + DateColumnStatsData data = new DateColumnStatsData(); + statistics.getDateStatistics().ifPresent(dateStatistics -> { + dateStatistics.getMin().ifPresent(value -> data.setLowValue(toMetastoreDate(value))); + dateStatistics.getMax().ifPresent(value -> data.setHighValue(toMetastoreDate(value))); + }); + statistics.getNullsCount().ifPresent(data::setNumNulls); + toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); + return new ColumnStatisticsObj(columnName, columnType.toString(), dateStats(data)); + } + + private static ColumnStatisticsObj createBinaryStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics, OptionalLong rowCount) + { + BinaryColumnStatsData data = new BinaryColumnStatsData(); + statistics.getNullsCount().ifPresent(data::setNumNulls); + data.setMaxColLen(statistics.getMaxValueSizeInBytes().orElse(0)); + data.setAvgColLen(getAverageColumnLength(statistics.getTotalSizeInBytes(), rowCount, statistics.getNullsCount()).orElse(0)); + return new ColumnStatisticsObj(columnName, columnType.toString(), binaryStats(data)); + } + + private static ColumnStatisticsObj createDecimalStatistics(String columnName, HiveType columnType, HiveColumnStatistics statistics) + { + DecimalColumnStatsData data = new DecimalColumnStatsData(); + statistics.getDecimalStatistics().ifPresent(decimalStatistics -> { + decimalStatistics.getMin().ifPresent(value -> data.setLowValue(toMetastoreDecimal(value))); + decimalStatistics.getMax().ifPresent(value -> data.setHighValue(toMetastoreDecimal(value))); + }); + statistics.getNullsCount().ifPresent(data::setNumNulls); + toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumDVs); + return new ColumnStatisticsObj(columnName, columnType.toString(), decimalStats(data)); + } + + private static Date toMetastoreDate(LocalDate date) + { + return new Date(date.toEpochDay()); + } + + public static Decimal toMetastoreDecimal(BigDecimal decimal) + { + return new Decimal(Shorts.checkedCast(decimal.scale()), ByteBuffer.wrap(decimal.unscaledValue().toByteArray())); + } + + private static OptionalLong 
toMetastoreDistinctValuesCount(OptionalLong distinctValuesCount, OptionalLong nullsCount) + { + // metastore counts null as a distinct value + if (distinctValuesCount.isPresent() && nullsCount.isPresent()) { + return OptionalLong.of(distinctValuesCount.getAsLong() + (nullsCount.getAsLong() > 0 ? 1 : 0)); + } + return OptionalLong.empty(); + } + + private static OptionalDouble getAverageColumnLength(OptionalLong totalSizeInBytes, OptionalLong rowCount, OptionalLong nullsCount) + { + if (totalSizeInBytes.isPresent() && rowCount.isPresent() && nullsCount.isPresent()) { + long nonNullsCount = rowCount.getAsLong() - nullsCount.getAsLong(); + if (nonNullsCount <= 0) { + return OptionalDouble.empty(); + } + return OptionalDouble.of(((double) totalSizeInBytes.getAsLong()) / nonNullsCount); + } + return OptionalDouble.empty(); + } + + public static Set getSupportedColumnStatistics(Type type) + { + if (type.equals(BOOLEAN)) { + return ImmutableSet.of(NUMBER_OF_NON_NULL_VALUES, NUMBER_OF_TRUE_VALUES); + } + if (isNumericType(type) || type.equals(DATE) || type.equals(TIMESTAMP)) { + // TODO https://github.com/prestodb/presto/issues/7122 support non-legacy TIMESTAMP + return ImmutableSet.of(MIN_VALUE, MAX_VALUE, NUMBER_OF_DISTINCT_VALUES, NUMBER_OF_NON_NULL_VALUES); + } + if (isVarcharType(type) || isCharType(type)) { + // TODO Collect MIN,MAX once it is used by the optimizer + return ImmutableSet.of(NUMBER_OF_NON_NULL_VALUES, NUMBER_OF_DISTINCT_VALUES, TOTAL_SIZE_IN_BYTES, MAX_VALUE_SIZE_IN_BYTES); + } + if (type.equals(VARBINARY)) { + return ImmutableSet.of(NUMBER_OF_NON_NULL_VALUES, TOTAL_SIZE_IN_BYTES, MAX_VALUE_SIZE_IN_BYTES); + } + if (type instanceof ArrayType || type instanceof RowType || type instanceof MapType) { + return ImmutableSet.of(); + } + // Throwing here to make sure this method is updated when a new type is added in Hive connector + throw new IllegalArgumentException("Unsupported type: " + type); + } + + private static boolean isNumericType(Type type) + { + return type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT) || + type.equals(DOUBLE) || type.equals(REAL) || + type instanceof DecimalType; + } + + public static HivePrincipal applyRoleNameCaseSensitive(HivePrincipal hivePrincipal, boolean isRoleNameCaseSensitive) + { + if (hivePrincipal != null && !isRoleNameCaseSensitive && hivePrincipal.getType() == ROLE) { + return new HivePrincipal(hivePrincipal.getType(), hivePrincipal.getName().toLowerCase(ENGLISH)); + } + return hivePrincipal; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/Transport.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/Transport.java new file mode 100644 index 00000000..e988e354 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/metastore/thrift/Transport.java @@ -0,0 +1,221 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.net.HostAndPort; +import io.prestosql.plugin.hive.authentication.HiveMetastoreAuthentication; +import org.apache.thrift.transport.TSocket; +import org.apache.thrift.transport.TTransport; +import org.apache.thrift.transport.TTransportException; + +import javax.net.ssl.SSLContext; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.Proxy; +import java.net.Socket; +import java.util.Optional; + +import static java.lang.String.format; +import static java.net.Proxy.Type.SOCKS; + +public final class Transport +{ + public static TTransport create( + HostAndPort address, + Optional sslContext, + Optional socksProxy, + int timeoutMillis, + HiveMetastoreAuthentication authentication) + throws TTransportException + { + try { + TTransport rawTransport = createRaw(address, sslContext, socksProxy, timeoutMillis); + TTransport authenticatedTransport = authentication.authenticate(rawTransport, address.getHost()); + if (!authenticatedTransport.isOpen()) { + authenticatedTransport.open(); + } + return new TTransportWrapper(authenticatedTransport, address); + } + catch (TTransportException e) { + throw rewriteException(e, address); + } + } + + private Transport() {} + + private static TTransport createRaw(HostAndPort address, Optional sslContext, Optional socksProxy, int timeoutMillis) + throws TTransportException + { + Proxy proxy = socksProxy + .map(socksAddress -> new Proxy(SOCKS, InetSocketAddress.createUnresolved(socksAddress.getHost(), socksAddress.getPort()))) + .orElse(Proxy.NO_PROXY); + + Socket socket = new Socket(proxy); + try { + socket.connect(new InetSocketAddress(address.getHost(), address.getPort()), timeoutMillis); + socket.setSoTimeout(timeoutMillis); + + if (sslContext.isPresent()) { + // SSL will connect to the SOCKS address when present + HostAndPort sslConnectAddress = socksProxy.orElse(address); + + socket = sslContext.get().getSocketFactory().createSocket(socket, sslConnectAddress.getHost(), sslConnectAddress.getPort(), true); + } + return new TSocket(socket); + } + catch (Throwable t) { + // something went wrong, close the socket and rethrow + try { + socket.close(); + } + catch (IOException e) { + t.addSuppressed(e); + } + throw new TTransportException(t); + } + } + + private static TTransportException rewriteException(TTransportException e, HostAndPort address) + { + return new TTransportException(e.getType(), format("%s: %s", address, e.getMessage()), e); + } + + private static class TTransportWrapper + extends TTransport + { + private final TTransport transport; + private final HostAndPort address; + + TTransportWrapper(TTransport transport, HostAndPort address) + { + this.transport = transport; + this.address = address; + } + + @Override + public boolean isOpen() + { + return transport.isOpen(); + } + + @Override + public boolean peek() + { + return transport.peek(); + } + + @Override + public byte[] getBuffer() + { + return transport.getBuffer(); + } + + @Override + public int getBufferPosition() + { + return transport.getBufferPosition(); + } + + @Override + public int getBytesRemainingInBuffer() + { + return transport.getBytesRemainingInBuffer(); + } + + @Override + public void consumeBuffer(int len) + { + transport.consumeBuffer(len); + } + + @Override + public void close() + { + transport.close(); + } + + @Override + public void open() + throws 
TTransportException + { + try { + transport.open(); + } + catch (TTransportException e) { + throw rewriteException(e, address); + } + } + + @Override + public int readAll(byte[] bytes, int off, int len) + throws TTransportException + { + try { + return transport.readAll(bytes, off, len); + } + catch (TTransportException e) { + throw rewriteException(e, address); + } + } + + @Override + public int read(byte[] bytes, int off, int len) + throws TTransportException + { + try { + return transport.read(bytes, off, len); + } + catch (TTransportException e) { + throw rewriteException(e, address); + } + } + + @Override + public void write(byte[] bytes) + throws TTransportException + { + try { + transport.write(bytes); + } + catch (TTransportException e) { + throw rewriteException(e, address); + } + } + + @Override + public void write(byte[] bytes, int off, int len) + throws TTransportException + { + try { + transport.write(bytes, off, len); + } + catch (TTransportException e) { + throw rewriteException(e, address); + } + } + + @Override + public void flush() + throws TTransportException + { + try { + transport.flush(); + } + catch (TTransportException e) { + throw rewriteException(e, address); + } + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/CommunicationConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/CommunicationConfig.java new file mode 100644 index 00000000..a0d66a38 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/CommunicationConfig.java @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.omnidata; + +import io.airlift.configuration.Config; + +public class CommunicationConfig +{ + private boolean httpsRequired; + + public boolean isHttpsRequired() + { + return httpsRequired; + } + + @Config("internal-communication.https.required") + public CommunicationConfig setHttpsRequired(boolean httpsRequired) + { + this.httpsRequired = httpsRequired; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeManager.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeManager.java new file mode 100644 index 00000000..3b04ccfc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeManager.java @@ -0,0 +1,287 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.omnidata; + +import com.google.common.collect.ImmutableList; +import com.google.common.io.ByteStreams; +import com.google.common.net.HttpHeaders; +import io.airlift.configuration.ConfigurationFactory; +import io.airlift.discovery.client.DiscoveryAnnouncementClient; +import io.airlift.discovery.client.DiscoveryClientConfig; +import io.airlift.discovery.client.DiscoveryException; +import io.airlift.discovery.client.ServiceDescriptor; +import io.airlift.discovery.client.ServiceDescriptors; +import io.airlift.discovery.client.ServiceDescriptorsRepresentation; +import io.airlift.http.client.HttpClient; +import io.airlift.http.client.HttpClientConfig; +import io.airlift.http.client.HttpStatus; +import io.airlift.http.client.Request; +import io.airlift.http.client.Response; +import io.airlift.http.client.ResponseHandler; +import io.airlift.http.client.jetty.JettyHttpClient; +import io.airlift.http.client.spnego.KerberosConfig; +import io.airlift.json.JsonCodec; +import io.airlift.log.Logger; +import io.airlift.units.Duration; +import io.prestosql.spi.HostAddress; +import io.prestosql.spi.PrestoException; + +import javax.annotation.PreDestroy; +import javax.annotation.concurrent.GuardedBy; +import javax.inject.Inject; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.text.Normalizer; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CancellationException; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.airlift.concurrent.Threads.threadsNamed; +import static io.airlift.configuration.ConfigurationLoader.loadPropertiesFrom; +import static io.prestosql.spi.HostAddress.fromParts; +import static io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; +import static java.lang.String.format; +import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; + +public class OmniDataNodeManager +{ + private static final Logger log = Logger.get(OmniDataNodeManager.class); + public static final String CONFIG_PROPERTY = "config"; + + @GuardedBy("this") + private Map allNodes = new ConcurrentHashMap<>(); + + private final ScheduledExecutorService nodeStateUpdateExecutor; + + private KerberosConfig kerberosConfig; + private HttpClientConfig httpClientConfig; + private DiscoveryClientConfig discoveryClientConfig; + private boolean httpsRequired; + private AtomicBoolean started = new AtomicBoolean(false); + + @Inject + public OmniDataNodeManager() + { + this.nodeStateUpdateExecutor = newSingleThreadScheduledExecutor(threadsNamed("omnidata-node-state-poller-%s")); + } + + private boolean initializeConfiguration() + { + String configFilePath = System.getProperty(CONFIG_PROPERTY); + if (configFilePath == null || configFilePath.isEmpty()) { + log.error("System property %s does not exist.", CONFIG_PROPERTY); + return 
false; + } + + String filePath; + try { + String normalizePath = Normalizer.normalize(configFilePath, Normalizer.Form.NFKC); + filePath = new File(normalizePath).getCanonicalPath(); + } + catch (IOException | IllegalArgumentException exception) { + log.error("config file path [%s] is invalid, exception %s", configFilePath, exception.getMessage()); + return false; + } + File file = new File(filePath); + if (!file.exists()) { + log.error("config file [%s] does not exist.", filePath); + return false; + } + + Map properties; + try { + properties = loadPropertiesFrom(configFilePath); + } + catch (IOException e) { + log.error("Fail to load config file, Check your configuration."); + return false; + } + + ConfigurationFactory configurationFactory = new ConfigurationFactory(properties); + this.kerberosConfig = configurationFactory.build(KerberosConfig.class); + this.httpClientConfig = configurationFactory.build(HttpClientConfig.class); + this.discoveryClientConfig = configurationFactory.build(DiscoveryClientConfig.class); + CommunicationConfig communicationConfig = configurationFactory.build(CommunicationConfig.class); + this.httpsRequired = communicationConfig.isHttpsRequired(); + return true; + } + + public void startPollingNodeStates() + { + if (started.getAndSet(true)) { + return; + } + + if (!initializeConfiguration()) { + return; + } + nodeStateUpdateExecutor.scheduleWithFixedDelay(() -> { + try { + refreshNodes(); + } + catch (Exception e) { + log.error(e, "Error polling state of omnidata nodes"); + } + }, 10, 5, TimeUnit.SECONDS); + } + + @PreDestroy + public void stop() + { + if (!started.get()) { + return; + } + + nodeStateUpdateExecutor.shutdownNow(); + } + + private synchronized void refreshNodes() + { + allNodes.clear(); + + Set services = getServices().getServiceDescriptors().stream().collect(toImmutableSet()); + for (ServiceDescriptor service : services) { + URI uri = getHttpUri(service, httpsRequired); + String localHdfsIpAddress = service.getProperties().get("local.hdfs.server.address"); + String grpcPort = service.getProperties().get("grpc.server.port"); + String runningTaskNumber = service.getProperties().get("runningTaskNumber"); + String maxTaskNumber = service.getProperties().get("maxTaskNumber"); + if (uri.getHost() != null && localHdfsIpAddress != null) { + try { + OmniDataNodeStatus nodeStatus = new OmniDataNodeStatus(fromParts(uri.getHost(), Integer.parseInt(grpcPort)).toString(), + Integer.parseInt(runningTaskNumber), Integer.parseInt(maxTaskNumber)); + allNodes.put(localHdfsIpAddress, nodeStatus); + } + catch (RuntimeException ignored) { + throw new PrestoException(GENERIC_INTERNAL_ERROR, "omnidata node manger receive wrong arguments"); + } + } + } + } + + private ServiceDescriptors getServices() + { + try (HttpClient httpClient = new JettyHttpClient("omnidata-node-manager", httpClientConfig, kerberosConfig, ImmutableList.of())) { + URI uri = discoveryClientConfig.getDiscoveryServiceURI(); + if (uri == null) { + throw new DiscoveryException("No discovery servers are available"); + } + + String type = "omnidata"; + uri = URI.create(uri + "/v1/service/" + type + "/"); + + Request.Builder requestBuilder = Request.Builder.prepareGet() + .setUri(uri) + .setHeader("User-Agent", System.getProperty("node.id")); + + return httpClient.execute(requestBuilder.build(), new OmniDataNodeManagerResponseHandler(type, uri) { + @Override + public ServiceDescriptors handle(Request request, Response response) + { + if (response.getStatusCode() != HttpStatus.OK.code()) { + throw new 
DiscoveryException(String.format("Lookup of %s failed with status code %s", type, response.getStatusCode())); + } + + byte[] json; + try { + json = ByteStreams.toByteArray(response.getInputStream()); + } + catch (IOException e) { + throw new DiscoveryException(format("Lookup of %s failed", type), e); + } + + JsonCodec serviceDescriptorsCodec = JsonCodec.jsonCodec(ServiceDescriptorsRepresentation.class); + ServiceDescriptorsRepresentation serviceDescriptorsRepresentation = serviceDescriptorsCodec.fromJson(json); + + Duration maxAge = DiscoveryAnnouncementClient.DEFAULT_DELAY; + String eTag = response.getHeader(HttpHeaders.ETAG); + + return new ServiceDescriptors( + type, + null, + serviceDescriptorsRepresentation.getServiceDescriptors(), + maxAge, + eTag); + } + }); + } + } + + private static URI getHttpUri(ServiceDescriptor descriptor, boolean httpsRequired) + { + String url = descriptor.getProperties().get(httpsRequired ? "https" : "http"); + if (url != null) { + try { + return new URI(url); + } + catch (URISyntaxException ignored) { + } + } + return null; + } + + public synchronized Map getAllNodes() + { + return allNodes; + } + + public synchronized OmniDataNodeStatus getNode(HostAddress host) + { + return allNodes.get(host); + } + + private class OmniDataNodeManagerResponseHandler + implements ResponseHandler + { + private final String type; + private final URI uri; + + protected OmniDataNodeManagerResponseHandler(String name, URI uri) + { + this.type = name; + this.uri = uri; + } + + @Override + public T handle(Request request, Response response) + { + return null; + } + + @Override + public final T handleException(Request request, Exception exception) + { + if (exception instanceof InterruptedException) { + throw new DiscoveryException("Lookup" + type + " was interrupted for " + uri); + } + if (exception instanceof CancellationException) { + throw new DiscoveryException("Lookup" + type + " was canceled for " + uri); + } + if (exception instanceof DiscoveryException) { + throw (DiscoveryException) exception; + } + + throw new DiscoveryException("Lookup" + type + " failed for " + uri, exception); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeStatus.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeStatus.java new file mode 100644 index 00000000..f07f7289 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/omnidata/OmniDataNodeStatus.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.omnidata; + +public class OmniDataNodeStatus +{ + private String hostAddress; + private int runningTaskNumber; + private int maxTaskNumber; + + public OmniDataNodeStatus(String hostAddress, int runningTask, int maxTaskNumber) + { + this.hostAddress = hostAddress; + this.runningTaskNumber = runningTask; + this.maxTaskNumber = maxTaskNumber; + } + + public int getMaxTaskNumber() + { + return maxTaskNumber; + } + + public String getHostAddress() + { + return hostAddress; + } + + public int getRunningTaskNumber() + { + return runningTaskNumber; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/HdfsOrcDataSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/HdfsOrcDataSource.java new file mode 100644 index 00000000..8296cc8d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/HdfsOrcDataSource.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.orc; + +import io.airlift.units.DataSize; +import io.prestosql.orc.AbstractOrcDataSource; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.hdfs.BlockMissingException; + +import java.io.IOException; + +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HdfsOrcDataSource + extends AbstractOrcDataSource +{ + private final FSDataInputStream inputStream; + private final FileFormatDataSourceStats stats; + + public HdfsOrcDataSource( + OrcDataSourceId id, + long size, + DataSize maxMergeDistance, + DataSize maxReadSize, + DataSize streamBufferSize, + boolean lazyReadSmallRanges, + FSDataInputStream inputStream, + FileFormatDataSourceStats stats, + long lastModifiedTime) + { + super(id, size, maxMergeDistance, maxReadSize, streamBufferSize, lazyReadSmallRanges, lastModifiedTime); + this.inputStream = requireNonNull(inputStream, "inputStream is null"); + this.stats = requireNonNull(stats, "stats is null"); + } + + @Override + public void close() + throws IOException + { + inputStream.close(); + } + + @Override + protected void readInternal(long position, byte[] buffer, int bufferOffset, int bufferLength) + { + try { + long readStart = System.nanoTime(); + inputStream.readFully(position, buffer, bufferOffset, bufferLength); + stats.readDataBytesPerSecond(bufferLength, System.nanoTime() - readStart); + } + catch (PrestoException e) { + // just in case there is a Presto wrapper or hook + throw e; + } + catch (Exception e) { + String message = format("Error reading from %s at position %s", this, position); + if (e instanceof BlockMissingException) { + throw new PrestoException(HiveErrorCode.HIVE_MISSING_DATA, message, e); + } 
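+ // Descriptive note (added for clarity): BlockMissingException is itself an IOException, so it is
+ // matched first and mapped to HIVE_MISSING_DATA; the generic IOException branch below then maps the
+ // remaining filesystem failures to HIVE_FILESYSTEM_ERROR, and anything else falls through to HIVE_UNKNOWN_ERROR.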
+ if (e instanceof IOException) { + throw new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, message, e); + } + throw new PrestoException(HiveErrorCode.HIVE_UNKNOWN_ERROR, message, e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcAcidRowId.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcAcidRowId.java new file mode 100644 index 00000000..00d0197b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcAcidRowId.java @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.orc; + +import io.prestosql.spi.Page; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; + +public class OrcAcidRowId + implements Comparable +{ + private static final int ORIGINAL_TRANSACTION_INDEX = 0; + private static final int BUCKET_ID_INDEX = 1; + private static final int ROW_ID_INDEX = 2; + + private long originalTransaction; + private int bucket; + private long rowId; + + public OrcAcidRowId(long originalTransaction, int bucket, long rowId) + { + set(originalTransaction, bucket, rowId); + } + + void set(Page page, int position) + { + set(BIGINT.getLong(page.getBlock(ORIGINAL_TRANSACTION_INDEX), position), + (int) INTEGER.getLong(page.getBlock(BUCKET_ID_INDEX), position), + BIGINT.getLong(page.getBlock(ROW_ID_INDEX), position)); + } + + void set(long originalTransaction, int bucket, long rowId) + { + this.originalTransaction = originalTransaction; + this.bucket = bucket; + this.rowId = rowId; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + OrcAcidRowId other = (OrcAcidRowId) o; + return originalTransaction == other.originalTransaction && + bucket == other.bucket && + rowId == other.rowId; + } + + public long getOriginalTransaction() + { + return originalTransaction; + } + + public int getBucket() + { + return bucket; + } + + public long getRowId() + { + return rowId; + } + + @Override + public int hashCode() + { + return Objects.hash(originalTransaction, bucket, rowId); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("originalTransaction", originalTransaction) + .add("bucket", bucket) + .add("rowId", rowId) + .toString(); + } + + @Override + public int compareTo(Object o) + { + OrcAcidRowId other = (OrcAcidRowId) o; + if (equals(other)) { + return 0; + } + //For transactions deleted from original files, ignore bucket field during comparison + if (originalTransaction == other.originalTransaction && originalTransaction == 0) { + return rowId < other.rowId ? 
-1 : rowId > other.rowId ? 1 : 0; + } + if (originalTransaction != other.originalTransaction) { + return Long.compare(originalTransaction, other.originalTransaction); + } + else if (bucket != other.bucket) { + return Integer.compare(bucket, other.bucket); + } + else { + return Long.compare(rowId, other.rowId); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcConcatPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcConcatPageSource.java new file mode 100644 index 00000000..39b7d2b3 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcConcatPageSource.java @@ -0,0 +1,133 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.AbstractIterator; +import io.airlift.log.Logger; +import io.prestosql.spi.Page; +import io.prestosql.spi.connector.ConnectorPageSource; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.collect.Iterators.concat; +import static java.util.stream.Collectors.toList; + +public class OrcConcatPageSource + implements ConnectorPageSource +{ + private static final Logger log = Logger.get(OrcConcatPageSource.class); + + private final List pageSources; + private final Iterator concatPageIterator; + + private boolean closed; + + public OrcConcatPageSource(List pargeSources) + { + this.pageSources = pargeSources; + List> sourceIterators = pageSources.stream().map(source -> new AbstractIterator() + { + @Override + protected Page computeNext() + { + Page nextPage; + do { + nextPage = source.getNextPage(); + if (nextPage == null) { + if (source.isFinished()) { + return endOfData(); + } + return null; + } + } + while (nextPage.getPositionCount() == 0); + + /* Todo(Nitin) Check if loaded block needed here! 
*/ + return nextPage; + } + }).collect(toList()); + + concatPageIterator = concat(sourceIterators.iterator()); + closed = false; + } + + @Override + public long getCompletedBytes() + { + return pageSources.stream().mapToLong(ConnectorPageSource::getCompletedBytes).sum(); + } + + @Override + public long getReadTimeNanos() + { + return pageSources.get(0).getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + if (!concatPageIterator.hasNext()) { + close(); + return null; + } + + return concatPageIterator.next(); + } + + @Override + public long getSystemMemoryUsage() + { + return pageSources.get(0).getSystemMemoryUsage(); + } + + @Override + public void close() + { + if (closed) { + return; + } + + pageSources.stream().forEach(ps -> { + try { + ps.close(); + } + catch (IOException | RuntimeException e) { + log.warn("failed to close page source"); + } + }); + closed = true; + } + + @Override + public String toString() + { + return toStringHelper(this).toString(); + } + + @VisibleForTesting + public ConnectorPageSource getConnectorPageSource() + { + return pageSources.get(0); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSource.java new file mode 100644 index 00000000..8c92a874 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSource.java @@ -0,0 +1,228 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.orc.OrcColumn; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.orc.OrcPredicate; +import io.prestosql.orc.OrcReader; +import io.prestosql.orc.OrcRecordReader; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.BlockMissingException; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.Map; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Strings.nullToEmpty; +import static com.google.common.collect.Maps.uniqueIndex; +import static io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static org.joda.time.DateTimeZone.UTC; + +//TODO Raghu add the handling for BloomFilter, BufferSize + +public class OrcDeleteDeltaPageSource + implements ConnectorPageSource +{ + private final OrcRecordReader recordReader; + private final OrcDataSource orcDataSource; + private final FileFormatDataSourceStats stats; + private final AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext(); + + private boolean closed; + + public OrcDeleteDeltaPageSource( + Path path, + long fileSize, + String sessionUser, + Configuration configuration, + HdfsEnvironment hdfsEnvironment, + DataSize maxMergeDistance, + DataSize maxBufferSize, + DataSize streamBufferSize, + DataSize maxReadBlockSize, + DataSize tinyStripeThreshold, + boolean lazyReadSmallRanges, + boolean orcBloomFiltersEnabled, + FileFormatDataSourceStats stats, + long lastModifiedTime) + { + this.stats = requireNonNull(stats, "stats is null"); + + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration); + FSDataInputStream inputStream = hdfsEnvironment.doAs(sessionUser, () -> fileSystem.open(path)); + orcDataSource = new HdfsOrcDataSource( + new OrcDataSourceId(path.toString()), + fileSize, + maxMergeDistance, + maxReadBlockSize, + streamBufferSize, + lazyReadSmallRanges, + inputStream, + stats, + lastModifiedTime); + } + catch (Exception e) { + if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || + e instanceof FileNotFoundException) { + throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT, e); + } + throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT, openError(e, path), e); + } + + try { + OrcReader reader = new OrcReader(orcDataSource, maxMergeDistance, tinyStripeThreshold, maxReadBlockSize); + + OrcPageSourceFactory.verifyAcidSchema(reader, path); + Map acidColumns = uniqueIndex( + 
reader.getRootColumn().getNestedColumns(), + orcColumn -> orcColumn.getColumnName()); + List rowIdColumns = ImmutableList.of(acidColumns.get(OrcPageSourceFactory.ACID_COLUMN_ORIGINAL_TRANSACTION), acidColumns.get(OrcPageSourceFactory.ACID_COLUMN_BUCKET), acidColumns.get(OrcPageSourceFactory.ACID_COLUMN_ROW_ID)); + + recordReader = reader.createRecordReader( + rowIdColumns, + ImmutableList.of(BIGINT, INTEGER, BIGINT), + OrcPredicate.TRUE, + 0, + fileSize, + UTC, + systemMemoryContext, + INITIAL_BATCH_SIZE, + exception -> OrcPageSource.handleException(orcDataSource.getId(), exception)); + } + catch (Exception e) { + try { + orcDataSource.close(); + } + catch (IOException ex) { + e.addSuppressed(ex); + } + if (e instanceof PrestoException) { + throw (PrestoException) e; + } + String message = openError(e, path); + if (e instanceof BlockMissingException) { + throw new PrestoException(HiveErrorCode.HIVE_MISSING_DATA, message, e); + } + throw new PrestoException(HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT, message, e); + } + } + + @Override + public long getCompletedBytes() + { + return orcDataSource.getReadBytes(); + } + + @Override + public long getReadTimeNanos() + { + return orcDataSource.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + try { + Page page = recordReader.nextPage(); + if (page == null) { + close(); + } + return page; + } + catch (IOException | RuntimeException e) { + closeWithSuppression(e); + throw OrcPageSource.handleException(orcDataSource.getId(), e); + } + } + + @Override + public void close() + { + // some hive input formats are broken and bad things can happen if you close them multiple times + if (closed) { + return; + } + closed = true; + + try { + stats.addMaxCombinedBytesPerRow(recordReader.getMaxCombinedBytesPerRow()); + recordReader.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("orcDataSource", orcDataSource.getId()) + .toString(); + } + + @Override + public long getSystemMemoryUsage() + { + return systemMemoryContext.getBytes(); + } + + private void closeWithSuppression(Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + close(); + } + catch (RuntimeException e) { + // Self-suppression not permitted + if (throwable != e) { + throwable.addSuppressed(e); + } + } + } + + private static String openError(Throwable t, Path path) + { + return format("Error opening Hive delta delete file %s: %s", path, t.getMessage()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSourceFactory.java new file mode 100644 index 00000000..f03edc67 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeleteDeltaPageSourceFactory.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.orc; + +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HdfsEnvironment; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; + +import static java.util.Objects.requireNonNull; + +public class OrcDeleteDeltaPageSourceFactory +{ + private final String sessionUser; + private final Configuration configuration; + private final HdfsEnvironment hdfsEnvironment; + private final FileFormatDataSourceStats stats; + private final DataSize maxMergeDistance; + private final DataSize maxBufferSize; + private final DataSize streamBufferSize; + private final DataSize maxReadBlockSize; + private final DataSize tinyStripeThreshold; + private final boolean lazyReadSmallRanges; + private final boolean orcBloomFiltersEnabled; + + public OrcDeleteDeltaPageSourceFactory( + String sessionUser, + Configuration configuration, + HdfsEnvironment hdfsEnvironment, + DataSize maxMergeDistance, + DataSize maxBufferSize, + DataSize streamBufferSize, + DataSize maxReadBlockSize, + DataSize tinyStripeThreshold, + boolean lazyReadSmallRanges, + boolean orcBloomFiltersEnabled, + FileFormatDataSourceStats stats) + { + this.sessionUser = requireNonNull(sessionUser, "sessionUser is null"); + this.configuration = requireNonNull(configuration, "configuration is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.stats = requireNonNull(stats, "stats is null"); + this.maxMergeDistance = requireNonNull(maxMergeDistance, "maxMergeDistance is null"); + this.maxBufferSize = requireNonNull(maxBufferSize, "maxBufferSize is null"); + this.streamBufferSize = requireNonNull(streamBufferSize, "streamBufferSize is null"); + this.maxReadBlockSize = requireNonNull(maxReadBlockSize, "maxReadBlockSize is null"); + this.tinyStripeThreshold = requireNonNull(tinyStripeThreshold, "tinyStripeThreshold is null"); + this.lazyReadSmallRanges = requireNonNull(lazyReadSmallRanges, "lazyReadSmallRanges is null"); + this.orcBloomFiltersEnabled = requireNonNull(orcBloomFiltersEnabled, "orcBloomFiltersEnabled is null"); + } + + public OrcDeleteDeltaPageSource createPageSource(Path path, long fileSize, long lastModifiedTime) + { + return new OrcDeleteDeltaPageSource( + path, + fileSize, + sessionUser, + configuration, + hdfsEnvironment, + maxMergeDistance, + maxBufferSize, + streamBufferSize, + maxReadBlockSize, + tinyStripeThreshold, + lazyReadSmallRanges, + orcBloomFiltersEnabled, + stats, + lastModifiedTime); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeletedRows.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeletedRows.java new file mode 100644 index 00000000..094f7554 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcDeletedRows.java @@ -0,0 +1,285 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except 
in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableList; +import io.prestosql.orc.OrcCorruptionException; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveUtil; +import io.prestosql.plugin.hive.WriteIdInfo; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.DictionaryBlock; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.IntegerType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Optional; +import java.util.regex.Pattern; + +import static com.google.common.base.Verify.verify; +import static java.lang.String.format; +import static org.apache.hadoop.hive.ql.io.AcidUtils.deleteDeltaSubdir; + +@NotThreadSafe +public class OrcDeletedRows +{ + private final String sourceFileName; + private final DeleteDeltaLocations deleteDeltaLocations; + private final OrcDeleteDeltaPageSourceFactory pageSourceFactory; + private final String sessionUser; + private final Configuration configuration; + private final HdfsEnvironment hdfsEnvironment; + Optional startRowOffsetOfFile; + + private final Pattern originalFilePattern = Pattern.compile("[0-9]+_[0-9]+"); + private final Pattern originalCopyFilePattern = Pattern.compile("[0-9]+_[0-9]+" + "_copy_" + "[0-9]+"); + private final String bucketPrefix = "bucket_"; + + private final List pageSources = new ArrayList<>(); + private Iterator sortedRowsIterator; + private Page currentPage; + private int currentPageOffset; + private OrcAcidRowId deletedRowId = new OrcAcidRowId(0, 0, 0); + + public OrcDeletedRows( + String sourceFileName, + Optional deleteDeltaLocations, + OrcDeleteDeltaPageSourceFactory pageSourceFactory, + String sessionUser, + Configuration configuration, + HdfsEnvironment hdfsEnvironment, + Optional startRowOffsetOfFile) + { + this.sourceFileName = sourceFileName; + this.pageSourceFactory = pageSourceFactory; + this.sessionUser = sessionUser; + this.configuration = configuration; + this.hdfsEnvironment = hdfsEnvironment; + if (deleteDeltaLocations.isPresent()) { + this.deleteDeltaLocations = deleteDeltaLocations.get(); + } + else { + this.deleteDeltaLocations = null; + } + this.startRowOffsetOfFile = startRowOffsetOfFile; + } + + public MaskDeletedRowsFunction getMaskDeletedRowsFunction(Page sourcePage, Optional pageRowOffset) + { + return new MaskDeletedRowsFunction(sourcePage, 
pageRowOffset); + } + + @NotThreadSafe + public class MaskDeletedRowsFunction + { + @Nullable + private Page sourcePage; + private int positionCount; + @Nullable + private int[] validPositions; + private Optional pageRowOffset; + + public MaskDeletedRowsFunction(Page sourcePage, Optional pageRowOffset) + { + this.sourcePage = sourcePage; + this.pageRowOffset = pageRowOffset; + } + + public int getPositionCount() + { + if (sourcePage != null) { + loadValidPositions(); + verify(sourcePage == null); + } + + return positionCount; + } + + public Block apply(Block block) + { + if (sourcePage != null) { + loadValidPositions(); + verify(sourcePage == null); + } + + if (positionCount == block.getPositionCount() || block.getPositionCount() == 0 || validPositions == null) { + return block; + } + return new DictionaryBlock(positionCount, block, validPositions); + } + + private void loadValidPositions() + { + if (deleteDeltaLocations == null) { + this.positionCount = sourcePage.getPositionCount(); + this.sourcePage = null; + return; + } + + int[] validPositions = new int[sourcePage.getPositionCount()]; + OrcAcidRowId sourcePageRowId = new OrcAcidRowId(0, 0, 0); + int validPositionsIndex = 0; + for (int pagePosition = 0; pagePosition < sourcePage.getPositionCount(); pagePosition++) { + if (startRowOffsetOfFile.isPresent() && pageRowOffset.isPresent()) { + long currRowId = startRowOffsetOfFile.get() + pageRowOffset.get() + pagePosition; + sourcePageRowId.set(sourcePageRowId.getOriginalTransaction(), sourcePageRowId.getBucket(), currRowId); + } + else { + sourcePageRowId.set(sourcePage, pagePosition); + } + boolean deleted = isDeleted(sourcePageRowId); + if (!deleted) { + validPositions[validPositionsIndex] = pagePosition; + validPositionsIndex++; + } + } + this.positionCount = validPositionsIndex; + this.validPositions = validPositions; + this.sourcePage = null; + } + } + + private boolean isDeleted(OrcAcidRowId sourcePageRowId) + { + if (sortedRowsIterator == null) { + for (WriteIdInfo deleteDeltaInfo : deleteDeltaLocations.getDeleteDeltas()) { + Path path = createPath(deleteDeltaLocations.getPartitionLocation(), deleteDeltaInfo, sourceFileName); + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration); + FileStatus fileStatus = hdfsEnvironment.doAs(sessionUser, () -> fileSystem.getFileStatus(path)); + + pageSources.add(pageSourceFactory.createPageSource(fileStatus.getPath(), fileStatus.getLen(), fileStatus.getModificationTime())); + } + catch (FileNotFoundException ignored) { + // source file does not have a delta delete file in this location + continue; + } + catch (PrestoException e) { + throw e; + } + catch (OrcCorruptionException e) { + throw new PrestoException(HiveErrorCode.HIVE_BAD_DATA, format("Failed to read ORC file: %s", path), e); + } + catch (RuntimeException | IOException e) { + throw new PrestoException(HiveErrorCode.HIVE_CURSOR_ERROR, format("Failed to read ORC file: %s", path), e); + } + } + List columnTypes = ImmutableList.of(BigintType.BIGINT, IntegerType.INTEGER, BigintType.BIGINT); + //Last index for rowIdHandle + List sortFields = ImmutableList.of(0, 1, 2); + List sortOrders = ImmutableList.of(SortOrder.ASC_NULLS_FIRST, SortOrder.ASC_NULLS_FIRST, SortOrder.ASC_NULLS_FIRST); + sortedRowsIterator = HiveUtil.getMergeSortedPages(pageSources, columnTypes, sortFields, + sortOrders); + } + do { + if (currentPage == null || currentPageOffset >= currentPage.getPositionCount()) { + currentPage = null; + currentPageOffset = 0; + if 
(sortedRowsIterator.hasNext()) { + currentPage = sortedRowsIterator.next(); + } + else { + //No more entries in deleted_delta + return false; + } + } + do { + deletedRowId.set(currentPage, currentPageOffset); + if (deletedRowId.compareTo(sourcePageRowId) == 0) { + //source row is deleted. + return true; + } + else if (deletedRowId.compareTo(sourcePageRowId) > 0) { + //source row entry not found, but next deleted entry is greater than current source row. + //So current source row is not deleted. + return false; + } + currentPageOffset++; + } + while (currentPageOffset < currentPage.getPositionCount()); + } + while (sortedRowsIterator.hasNext()); + //No more entries; + return false; + } + + private int getBucketNumber(String fileName) + { + if (fileName.startsWith(bucketPrefix)) { + return Integer.parseInt(fileName.substring(fileName.indexOf('_') + 1)); + } + else if (originalFilePattern.matcher(fileName).matches() || originalCopyFilePattern.matcher(fileName).matches()) { + return Integer.parseInt(fileName.substring(0, fileName.indexOf('_'))); + } + return -1; + } + + private Path createPath(String partitionLocation, WriteIdInfo deleteDeltaInfo, String fileName) + { + //Honor statement=-1 as well. Because Minor compacted delta directories will not have statementId + Path directory; + if (deleteDeltaInfo.getStatementId() == -1) { + directory = new Path(partitionLocation, deleteDeltaSubdir( + deleteDeltaInfo.getMinWriteId(), + deleteDeltaInfo.getMaxWriteId())); + } + else { + directory = new Path(partitionLocation, deleteDeltaSubdir( + deleteDeltaInfo.getMinWriteId(), + deleteDeltaInfo.getMaxWriteId(), + deleteDeltaInfo.getStatementId())); + } + String bucketFileName = getBucketFileName(fileName); + return new Path(directory, bucketFileName); + } + + private String getBucketFileName(String fileName) + { + String bucketDigit = "%05d"; + int bucketNumber = getBucketNumber(fileName); + return bucketPrefix + String.format(bucketDigit, bucketNumber); + } + + void close() + { + pageSources.forEach(pageSource -> + { + try { + pageSource.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + }); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSource.java new file mode 100644 index 00000000..c6b882bb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSource.java @@ -0,0 +1,332 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableList; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.orc.OrcCorruptionException; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.orc.OrcRecordReader; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.LazyBlock; +import io.prestosql.spi.block.LazyBlockLoader; +import io.prestosql.spi.block.RowBlock; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class OrcPageSource + implements ConnectorPageSource +{ + private static final int NULL_ENTRY_SIZE = 0; + private final OrcRecordReader recordReader; + private final List columnAdaptations; + private final OrcDataSource orcDataSource; + private final OrcDeletedRows deletedRows; + private final boolean loadEagerly; + + private boolean closed; + + private final AggregatedMemoryContext systemMemoryContext; + + private final FileFormatDataSourceStats stats; + + public OrcPageSource( + OrcRecordReader recordReader, + List columnAdaptations, + OrcDataSource orcDataSource, + OrcDeletedRows deletedRows, + boolean loadEagerly, + AggregatedMemoryContext systemMemoryContext, + FileFormatDataSourceStats stats) + { + this.recordReader = requireNonNull(recordReader, "recordReader is null"); + this.columnAdaptations = ImmutableList.copyOf(requireNonNull(columnAdaptations, "columnAdaptations is null")); + this.orcDataSource = requireNonNull(orcDataSource, "orcDataSource is null"); + this.deletedRows = requireNonNull(deletedRows, "deletedRows is null"); + this.loadEagerly = loadEagerly; + this.stats = requireNonNull(stats, "stats is null"); + this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null"); + } + + @Override + public long getCompletedBytes() + { + return orcDataSource.getReadBytes(); + } + + @Override + public long getReadTimeNanos() + { + return orcDataSource.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + Page page; + try { + page = recordReader.nextPage(); + } + catch (IOException | RuntimeException e) { + closeWithSuppression(e); + throw handleException(orcDataSource.getId(), e); + } + + if (page == null) { + close(); + return null; + } + + //Eager load the page to filter out rows using HeuristicIndex when reading from ORC file + if (loadEagerly) { + page = page.getLoadedPage(); + } + + Optional pageRowOffset = Optional.of(recordReader.getFilePosition()); + OrcDeletedRows.MaskDeletedRowsFunction maskDeletedRowsFunction = 
deletedRows.getMaskDeletedRowsFunction(page, pageRowOffset); + Block[] blocks = new Block[columnAdaptations.size()]; + for (int i = 0; i < columnAdaptations.size(); i++) { + blocks[i] = columnAdaptations.get(i).block(page, maskDeletedRowsFunction); + } + return new Page(maskDeletedRowsFunction.getPositionCount(), page.getPageMetadata(), blocks); + } + + static PrestoException handleException(OrcDataSourceId dataSourceId, Exception exception) + { + if (exception instanceof PrestoException) { + return (PrestoException) exception; + } + if (exception instanceof OrcCorruptionException) { + return new PrestoException(HiveErrorCode.HIVE_BAD_DATA, exception); + } + return new PrestoException(HiveErrorCode.HIVE_CURSOR_ERROR, format("Failed to read ORC file: %s", dataSourceId), exception); + } + + @Override + public void close() + { + // some hive input formats are broken and bad things can happen if you close them multiple times + if (closed) { + return; + } + closed = true; + + try { + deletedRows.close(); + stats.addMaxCombinedBytesPerRow(recordReader.getMaxCombinedBytesPerRow()); + recordReader.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("orcDataSource", orcDataSource.getId()) + .add("columns", columnAdaptations) + .toString(); + } + + @Override + public long getSystemMemoryUsage() + { + return systemMemoryContext.getBytes(); + } + + private void closeWithSuppression(Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + close(); + } + catch (RuntimeException e) { + // Self-suppression not permitted + if (throwable != e) { + throwable.addSuppressed(e); + } + } + } + + public interface ColumnAdaptation + { + Block block(Page sourcePage, OrcDeletedRows.MaskDeletedRowsFunction maskDeletedRowsFunction); + + static ColumnAdaptation nullColumn(Type type) + { + return new NullColumn(type); + } + + static ColumnAdaptation sourceColumn(int index) + { + return new SourceColumn(index); + } + + static ColumnAdaptation sourceColumn(int index, boolean mask) + { + return new SourceColumn(index, mask); + } + + static ColumnAdaptation structColumn(StructTypeInfo structTypeInfo, List adaptations) + { + return new StructColumn(structTypeInfo, adaptations); + } + } + + private static class NullColumn + implements ColumnAdaptation + { + private final Type type; + private final Block nullBlock; + + public NullColumn(Type type) + { + this.type = requireNonNull(type, "type is null"); + this.nullBlock = type.createBlockBuilder(null, 1, 0) + .appendNull() + .build(); + } + + @Override + public Block block(Page sourcePage, OrcDeletedRows.MaskDeletedRowsFunction maskDeletedRowsFunction) + { + return new RunLengthEncodedBlock(nullBlock, maskDeletedRowsFunction.getPositionCount()); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", type) + .toString(); + } + } + + private static class SourceColumn + implements ColumnAdaptation + { + private final int index; + private final boolean mask; + + public SourceColumn(int index) + { + this(index, true); + } + + public SourceColumn(int index, boolean mask) + { + checkArgument(index >= 0, "index is negative"); + this.index = index; + this.mask = mask; + } + + @Override + public Block block(Page sourcePage, OrcDeletedRows.MaskDeletedRowsFunction maskDeletedRowsFunction) + { + Block block = sourcePage.getBlock(index); + if (mask) { + return new 
LazyBlock(maskDeletedRowsFunction.getPositionCount(), new MaskingBlockLoader(maskDeletedRowsFunction, block)); + } + return block; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("index", index) + .toString(); + } + + private static final class MaskingBlockLoader + implements LazyBlockLoader + { + private OrcDeletedRows.MaskDeletedRowsFunction maskDeletedRowsFunction; + private Block sourceBlock; + + public MaskingBlockLoader(OrcDeletedRows.MaskDeletedRowsFunction maskDeletedRowsFunction, Block sourceBlock) + { + this.maskDeletedRowsFunction = requireNonNull(maskDeletedRowsFunction, "maskDeletedRowsFunction is null"); + this.sourceBlock = requireNonNull(sourceBlock, "sourceBlock is null"); + } + + @Override + public void load(LazyBlock block) + { + checkState(maskDeletedRowsFunction != null, "Already loaded"); + + Block resultBlock = maskDeletedRowsFunction.apply(sourceBlock.getLoadedBlock()); + + maskDeletedRowsFunction = null; + sourceBlock = null; + + block.setBlock(resultBlock); + } + } + } + + private static class StructColumn + implements ColumnAdaptation + { + List columnAdaptations; + StructTypeInfo structTypeInfo; + + StructColumn(StructTypeInfo structTypeInfo, List adaptations) + { + this.structTypeInfo = structTypeInfo; + this.columnAdaptations = adaptations; + } + + @Override + public Block block(Page sourcePage, OrcDeletedRows.MaskDeletedRowsFunction maskDeletedRowsFunction) + { + List types = structTypeInfo.getAllStructFieldTypeInfos().stream() + .map(typeInfo -> HiveType + .getPrimitiveType((PrimitiveTypeInfo.class.cast(typeInfo)))) + .collect(Collectors.toList()); + List fieldBlocks = columnAdaptations.stream().map(adaptation -> adaptation.block(sourcePage, maskDeletedRowsFunction)).collect(Collectors.toList()); + Block block = RowBlock.fromFieldBlocks(sourcePage.getPositionCount(), Optional.empty(), fieldBlocks.toArray(new Block[fieldBlocks.size()])); + return new LazyBlock(maskDeletedRowsFunction.getPositionCount(), new SourceColumn.MaskingBlockLoader(maskDeletedRowsFunction, block)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSourceFactory.java new file mode 100644 index 00000000..c34ad692 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPageSourceFactory.java @@ -0,0 +1,714 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import com.google.common.util.concurrent.UncheckedExecutionException; +import com.huawei.boostkit.omnidata.block.BlockDeserializer; +import com.huawei.boostkit.omnidata.model.Predicate; +import com.huawei.boostkit.omnidata.model.TaskSource; +import com.huawei.boostkit.omnidata.model.datasource.DataSource; +import com.huawei.boostkit.omnidata.reader.DataReader; +import com.huawei.boostkit.omnidata.reader.DataReaderFactory; +import io.airlift.log.Logger; +import io.airlift.units.DataSize; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.orc.OrcCacheProperties; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.orc.OrcColumn; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.orc.OrcDataSourceIdWithTimeStamp; +import io.prestosql.orc.OrcFileTail; +import io.prestosql.orc.OrcFileTailCacheKey; +import io.prestosql.orc.OrcReader; +import io.prestosql.orc.OrcRecordReader; +import io.prestosql.orc.TupleDomainOrcPredicate; +import io.prestosql.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder; +import io.prestosql.orc.metadata.OrcType.OrcTypeKind; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HivePageSourceFactory; +import io.prestosql.plugin.hive.HivePartitionKey; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveUtil; +import io.prestosql.plugin.hive.orc.OrcPageSource.ColumnAdaptation; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.FixedPageSource; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.heuristicindex.SplitMetadata; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PositionedReadable; +import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.hdfs.BlockMissingException; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.concurrent.ExecutionException; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; 
+import static com.google.common.base.Strings.nullToEmpty; +import static com.google.common.collect.Maps.uniqueIndex; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_CLIENT_TARGET_LIST; +import static io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE; +import static io.prestosql.orc.OrcReader.handleCacheLoadException; +import static io.prestosql.orc.metadata.OrcType.OrcTypeKind.INT; +import static io.prestosql.orc.metadata.OrcType.OrcTypeKind.LONG; +import static io.prestosql.orc.metadata.OrcType.OrcTypeKind.STRUCT; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcLazyReadSmallRanges; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxBufferSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxMergeDistance; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxReadBlockSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcStreamBufferSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcTinyStripeThreshold; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcBloomFiltersCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcBloomFiltersEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcFileTailCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcRowDataCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcRowIndexCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcStripeFooterCacheEnabled; +import static io.prestosql.plugin.hive.orc.OrcPageSource.handleException; +import static io.prestosql.plugin.hive.util.PageSourceUtil.buildPushdownContext; +import static io.prestosql.plugin.hive.util.PageSourceUtil.getSslConfiguredProperties; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static java.lang.String.format; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toMap; +import static org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable; + +public class OrcPageSourceFactory + implements HivePageSourceFactory +{ + // ACID format column names + public static final String ACID_COLUMN_OPERATION = "operation"; + public static final String ACID_COLUMN_ORIGINAL_TRANSACTION = "originalTransaction"; + public static final String ACID_COLUMN_BUCKET = "bucket"; + public static final String ACID_COLUMN_ROW_ID = "rowId"; + public static final String ACID_COLUMN_CURRENT_TRANSACTION = "currentTransaction"; + public static final String ACID_COLUMN_ROW_STRUCT = "row"; + public static final List EAGER_LOAD_INDEX_ID = ImmutableList.of("BITMAP"); + + private static final Pattern DEFAULT_HIVE_COLUMN_NAME_PATTERN = Pattern.compile("_col\\d+"); + private static final Logger log = Logger.get(OrcPageSourceFactory.class); + + private final TypeManager typeManager; + private final boolean useOrcColumnNames; + private final HdfsEnvironment hdfsEnvironment; + private final 
FileFormatDataSourceStats stats; + private final OrcCacheStore orcCacheStore; + private final int domainCompactionThreshold; + private final DateTimeZone legacyTimeZone; + private final ImmutableMap sslPropertyMap; + private String omniDataServerTarget; + + @Inject + public OrcPageSourceFactory(TypeManager typeManager, HiveConfig config, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats, OrcCacheStore orcCacheStore) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + requireNonNull(config, "config is null"); + this.useOrcColumnNames = config.isUseOrcColumnNames(); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.stats = requireNonNull(stats, "stats is null"); + this.orcCacheStore = orcCacheStore; + this.domainCompactionThreshold = config.getDomainCompactionThreshold(); + this.legacyTimeZone = requireNonNull(config, "hiveConfig is null").getOrcLegacyDateTimeZone(); + this.omniDataServerTarget = null; + this.sslPropertyMap = getSslConfiguredProperties(config); + } + + @Override + public Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + Optional dynamicFilters, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime, + List partitionKeys, + OptionalInt bucketNumber, + Optional omniDataAddress, + HiveOffloadExpression expression) + { + if (!HiveUtil.isDeserializerClass(schema, OrcSerde.class)) { + return Optional.empty(); + } + + // per HIVE-13040 and ORC-162, empty files are allowed + if (fileSize == 0) { + return Optional.of(new FixedPageSource(ImmutableList.of())); + } + if (omniDataAddress.isPresent()) { + omniDataServerTarget = omniDataAddress.get(); + } + + if (expression.isPresent()) { + checkArgument(omniDataAddress.isPresent(), "omniDataAddress is empty"); + } + + // todo: add other condition for push down to sdi or not + if (HiveSessionProperties.isOmniDataEnabled(session) + && omniDataAddress.isPresent() + && expression.isPresent()) { + Predicate predicate = buildPushdownContext(columns, expression, typeManager, + effectivePredicate, partitionKeys, bucketNumber, path); + return Optional.of(createOrcPushDownPageSource(path, start, length, predicate, stats)); + } + + return createPageSource( + configuration, + session, + path, + start, + length, + fileSize, + schema, + columns, + effectivePredicate, + dynamicFilters, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + splitMetadata, + splitCacheable, + dataSourceLastModifiedTime); + } + + @Override + public Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + Optional dynamicFilters, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime) + { + if (!HiveUtil.isDeserializerClass(schema, OrcSerde.class)) { + return Optional.empty(); + } + + // per HIVE-13040 and ORC-162, empty files are allowed + if (fileSize == 0) { + return Optional.of(new FixedPageSource(ImmutableList.of())); + } + OrcCacheProperties orcCacheProperties = new OrcCacheProperties( + 
isOrcFileTailCacheEnabled(session), + isOrcStripeFooterCacheEnabled(session), + isOrcRowIndexCacheEnabled(session), + isOrcBloomFiltersCacheEnabled(session), + isOrcRowDataCacheEnabled(session) && splitCacheable); + return Optional.of(createOrcPageSource( + hdfsEnvironment, + session.getUser(), + configuration, + path, + start, + length, + fileSize, + columns, + useOrcColumnNames, + isFullAcidTable(Maps.fromProperties(schema)), + effectivePredicate, + legacyTimeZone, + typeManager, + getOrcMaxMergeDistance(session), + getOrcMaxBufferSize(session), + getOrcStreamBufferSize(session), + getOrcTinyStripeThreshold(session), + getOrcMaxReadBlockSize(session), + getOrcLazyReadSmallRanges(session), + isOrcBloomFiltersEnabled(session), + stats, + dynamicFilters, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + splitMetadata, + orcCacheStore, + orcCacheProperties, + domainCompactionThreshold, + session.isPageMetadataEnabled(), + dataSourceLastModifiedTime)); + } + + public static OrcPageSource createOrcPageSource( + HdfsEnvironment hdfsEnvironment, + String sessionUser, + Configuration configuration, + Path path, + long start, + long length, + long fileSize, + List columns, + boolean useOrcColumnNames, + boolean isFullAcid, + TupleDomain effectivePredicate, + DateTimeZone legacyFileTimeZone, + TypeManager typeManager, + DataSize maxMergeDistance, + DataSize maxBufferSize, + DataSize streamBufferSize, + DataSize tinyStripeThreshold, + DataSize maxReadBlockSize, + boolean lazyReadSmallRanges, + boolean orcBloomFiltersEnabled, + FileFormatDataSourceStats stats, + Optional dynamicFilters, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + OrcCacheStore orcCacheStore, + OrcCacheProperties orcCacheProperties, + int domainCompactionThreshold, + boolean pageMetadataEnabled, + long dataSourceLastModifiedTime) + { + for (HiveColumnHandle column : columns) { + checkArgument( + column.getColumnType() == HiveColumnHandle.ColumnType.REGULAR || column.getHiveColumnIndex() == HiveColumnHandle.ROW_ID__COLUMN_INDEX, + "column type must be regular: %s", column); + } + checkArgument(!effectivePredicate.isNone()); + + OrcDataSource orcDataSource; + try { + //Always create a lazy Stream. HDFS stream opened only when required. 
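+ // Descriptive note (added for clarity): the supplier handed to LazyFSInputStream below defers both the
+ // FileSystem lookup and the open() call until the first read through the wrapped FSDataInputStream.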
+ FSDataInputStream inputStream = new FSDataInputStream(new LazyFSInputStream(() -> { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration); + return hdfsEnvironment.doAs(sessionUser, () -> fileSystem.open(path)); + })); + orcDataSource = new HdfsOrcDataSource( + new OrcDataSourceId(path.toString()), + fileSize, + maxMergeDistance, + maxBufferSize, + streamBufferSize, + lazyReadSmallRanges, + inputStream, + stats, + dataSourceLastModifiedTime); + } + catch (Exception e) { + if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || + e instanceof FileNotFoundException) { + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e); + } + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e); + } + + AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext(); + try { + OrcDataSource readerLocalDataSource = OrcReader.wrapWithCacheIfTiny(orcDataSource, tinyStripeThreshold); + OrcFileTail fileTail; + if (orcCacheProperties.isFileTailCacheEnabled()) { + try { + OrcDataSourceIdWithTimeStamp orcDataSourceIdWithTimeStamp = new OrcDataSourceIdWithTimeStamp(readerLocalDataSource.getId(), readerLocalDataSource.getLastModifiedTime()); + fileTail = orcCacheStore.getFileTailCache().get(new OrcFileTailCacheKey(orcDataSourceIdWithTimeStamp), () -> OrcPageSourceFactory.createFileTail(orcDataSource)); + } + catch (UncheckedExecutionException | ExecutionException executionException) { + handleCacheLoadException(executionException); + log.debug(executionException.getCause(), "Error while caching the Orc file tail. Falling back to default flow"); + fileTail = OrcPageSourceFactory.createFileTail(orcDataSource); + } + } + else { + fileTail = OrcPageSourceFactory.createFileTail(orcDataSource); + } + OrcReader reader = new OrcReader(readerLocalDataSource, fileTail, maxMergeDistance, tinyStripeThreshold, maxReadBlockSize); + + List fileColumns = reader.getRootColumn().getNestedColumns(); + List fileReadColumns = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size()); + List fileReadTypes = isFullAcid ? new ArrayList<>(columns.size() + 5) : new ArrayList<>(columns.size()); + ImmutableList acidColumnNames = null; + List columnAdaptations = new ArrayList<>(columns.size()); + // Only Hive ACID files will begin with bucket_ + boolean fileNameContainsBucket = path.getName().contains("bucket"); + if (isFullAcid && fileNameContainsBucket) { // Skip the acid schema check in case of non-ACID files + acidColumnNames = ImmutableList.builder().add(ACID_COLUMN_ORIGINAL_TRANSACTION, + ACID_COLUMN_BUCKET, + ACID_COLUMN_ROW_ID, + ACID_COLUMN_CURRENT_TRANSACTION, + ACID_COLUMN_OPERATION).build(); + verifyAcidSchema(reader, path); + Map acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH)); + if (AcidUtils.isDeleteDelta(path.getParent())) { + //Avoid reading column data from delete_delta files. + // Call will come here in case of Minor VACUUM where all delete_delta files are merge together. 
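Editor's note: the ORC file-tail lookup earlier in this method follows a common Guava cache idiom: ask the cache to compute the value with a Callable on a miss, and fall back to a direct read if the cached load fails. A minimal, self-contained sketch of that idiom; the class and method names (CachedTailSketch, readTailFromStorage) are made up for illustration and are not part of the patch:

    import com.google.common.cache.Cache;
    import com.google.common.cache.CacheBuilder;
    import com.google.common.util.concurrent.UncheckedExecutionException;

    import java.nio.charset.StandardCharsets;
    import java.util.concurrent.ExecutionException;

    final class CachedTailSketch
    {
        // Small in-memory cache keyed by file path; the size is arbitrary for the sketch.
        private final Cache<String, byte[]> tailCache = CacheBuilder.newBuilder().maximumSize(1_000).build();

        byte[] fileTail(String path)
        {
            try {
                // The Callable runs only on a cache miss.
                return tailCache.get(path, () -> readTailFromStorage(path));
            }
            catch (ExecutionException | UncheckedExecutionException e) {
                // If the cached load fails, fall back to the direct read, as the patch does.
                return readTailFromStorage(path);
            }
        }

        private byte[] readTailFromStorage(String path)
        {
            // Stand-in for the real I/O (OrcFileTail.readFrom in the patch).
            return path.getBytes(StandardCharsets.UTF_8);
        }
    }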
+ fileColumns = ImmutableList.of(); + } + else { + fileColumns = acidColumnsByName.get(ACID_COLUMN_ROW_STRUCT).getNestedColumns(); + } + + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ORIGINAL_TRANSACTION.toLowerCase(ENGLISH))); + fileReadTypes.add(BIGINT); + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_BUCKET.toLowerCase(ENGLISH))); + fileReadTypes.add(INTEGER); + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ROW_ID.toLowerCase(ENGLISH))); + fileReadTypes.add(BIGINT); + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_CURRENT_TRANSACTION.toLowerCase(ENGLISH))); + fileReadTypes.add(BIGINT); + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_OPERATION.toLowerCase(ENGLISH))); + fileReadTypes.add(INTEGER); + } + + Map fileColumnsByName = ImmutableMap.of(); + if (useOrcColumnNames || isFullAcid) { + verifyFileHasColumnNames(fileColumns, path); + + // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore + fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName().toLowerCase(ENGLISH)); + } + + TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder() + .setBloomFiltersEnabled(orcBloomFiltersEnabled); + Map effectivePredicateDomains = effectivePredicate.getDomains() + .orElseThrow(() -> new IllegalArgumentException("Effective predicate is none")); + for (HiveColumnHandle column : columns) { + OrcColumn orcColumn = null; + if (useOrcColumnNames || isFullAcid) { + orcColumn = fileColumnsByName.get(column.getName()); + } + else if (column.getHiveColumnIndex() >= 0 && column.getHiveColumnIndex() < fileColumns.size()) { + orcColumn = fileColumns.get(column.getHiveColumnIndex()); + } + + Type readType = typeManager.getType(column.getTypeSignature()); + if (orcColumn != null) { + int sourceIndex = fileReadColumns.size(); + columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex)); + fileReadColumns.add(orcColumn); + fileReadTypes.add(readType); + + Domain domain = effectivePredicateDomains.get(column); + if (domain != null) { + predicateBuilder.addColumn(orcColumn.getColumnId(), domain); + } + } + else if (isFullAcid && readType instanceof RowType && column.getName().equalsIgnoreCase(HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME)) { + HiveType hiveType = column.getHiveType(); + StructTypeInfo structTypeInfo = (StructTypeInfo) hiveType.getTypeInfo(); + ImmutableList.Builder builder = new ImmutableList.Builder<>(); + ArrayList fieldNames = structTypeInfo.getAllStructFieldNames(); + List adaptations = fieldNames.stream() + .map(acidColumnNames::indexOf) + .map(c -> ColumnAdaptation.sourceColumn(c, false)) + .collect(Collectors.toList()); + columnAdaptations.add(ColumnAdaptation.structColumn(structTypeInfo, adaptations)); + } + else { + columnAdaptations.add(ColumnAdaptation.nullColumn(readType)); + } + } + + Map domains = effectivePredicate.getDomains().get().entrySet().stream().collect(toMap(e -> e.getKey().getName(), Map.Entry::getValue)); + OrcRecordReader recordReader = reader.createRecordReader( + fileReadColumns, + fileReadTypes, + predicateBuilder.build(), + start, + length, + legacyFileTimeZone, + systemMemoryUsage, + INITIAL_BATCH_SIZE, + exception -> handleException(orcDataSource.getId(), exception), + indexes, + splitMetadata, + domains, + orcCacheStore, + orcCacheProperties, + pageMetadataEnabled); + + OrcDeletedRows deletedRows = new OrcDeletedRows( + path.getName(), + deleteDeltaLocations, + new OrcDeleteDeltaPageSourceFactory(sessionUser, + 
configuration, hdfsEnvironment, maxMergeDistance, maxBufferSize, streamBufferSize, + maxReadBlockSize, tinyStripeThreshold, lazyReadSmallRanges, orcBloomFiltersEnabled, stats), + sessionUser, + configuration, + hdfsEnvironment, + startRowOffsetOfFile); + + boolean eagerload = false; + if (indexes.isPresent()) { + eagerload = indexes.get().stream().anyMatch(indexMetadata -> EAGER_LOAD_INDEX_ID.contains(indexMetadata.getIndex().getId())); + } + + return new OrcPageSource( + recordReader, + columnAdaptations, + orcDataSource, + deletedRows, + eagerload, + systemMemoryUsage, + stats); + } + catch (Exception e) { + try { + orcDataSource.close(); + } + catch (IOException ignored) { + } + if (e instanceof PrestoException) { + throw (PrestoException) e; + } + String message = splitError(e, path, start, length); + if (e instanceof BlockMissingException) { + throw new PrestoException(HIVE_MISSING_DATA, message, e); + } + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e); + } + } + + public OrcPushDownPageSource createOrcPushDownPageSource( + Path path, + long start, + long length, + Predicate predicate, + FileFormatDataSourceStats stats) + { + AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext(); + Properties transProperties = new Properties(); + transProperties.put(GRPC_CLIENT_TARGET_LIST, omniDataServerTarget); + transProperties.putAll(sslPropertyMap); + + DataSource orcPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource( + path.toString(), + start, + length, + false); + + TaskSource readTaskInfo = new TaskSource( + orcPushDownDataSource, + predicate, + TaskSource.ONE_MEGABYTES); + DataReader dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new BlockDeserializer()); + + return new OrcPushDownPageSource( + dataReader, + orcPushDownDataSource, + systemMemoryUsage, + stats); + } + + interface FSDataInputStreamProvider + { + FSDataInputStream provide() + throws IOException; + } + + static class LazyFSInputStream + extends InputStream + implements Seekable, PositionedReadable + { + private FSDataInputStreamProvider fsDataInputStreamProvider; + private FSDataInputStream fsDataInputStream; + private boolean isStreamAvailable; + + public LazyFSInputStream(FSDataInputStreamProvider fsDataInputStreamProvider) + { + this.fsDataInputStreamProvider = fsDataInputStreamProvider; + } + + @Override + public int read(long position, byte[] buffer, int offset, int length) + throws IOException + { + ensureActualStream(); + return fsDataInputStream.read(position, buffer, offset, length); + } + + @Override + public void readFully(long position, byte[] buffer, int offset, int length) + throws IOException + { + ensureActualStream(); + fsDataInputStream.readFully(position, buffer, offset, length); + } + + @Override + public void readFully(long position, byte[] buffer) + throws IOException + { + ensureActualStream(); + fsDataInputStream.readFully(position, buffer); + } + + @Override + public void seek(long pos) + throws IOException + { + ensureActualStream(); + fsDataInputStream.seek(pos); + } + + @Override + public long getPos() + throws IOException + { + ensureActualStream(); + return fsDataInputStream.getPos(); + } + + @Override + public boolean seekToNewSource(long targetPos) + throws IOException + { + ensureActualStream(); + return fsDataInputStream.seekToNewSource(targetPos); + } + + @Override + public int read() + throws IOException + { + ensureActualStream(); + return fsDataInputStream.read(); + } + + @Override + 
public void close() + throws IOException + { + if (isStreamAvailable) { + fsDataInputStream.close(); + isStreamAvailable = false; + } + } + + private void ensureActualStream() + throws IOException + { + if (isStreamAvailable) { + return; + } + synchronized (this) { + if (!isStreamAvailable) { + fsDataInputStream = fsDataInputStreamProvider.provide(); + } + } + isStreamAvailable = true; + } + } + + private static OrcFileTail createFileTail(OrcDataSource orcDataSource) + throws IOException + { + return OrcFileTail.readFrom(orcDataSource, Optional.empty()); + } + + private static String splitError(Throwable t, Path path, long start, long length) + { + return format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, t.getMessage()); + } + + private static void verifyFileHasColumnNames(List columns, Path path) + { + if (!columns.isEmpty() && columns.stream().map(OrcColumn::getColumnName).allMatch(physicalColumnName -> DEFAULT_HIVE_COLUMN_NAME_PATTERN.matcher(physicalColumnName).matches())) { + throw new PrestoException( + HIVE_FILE_MISSING_COLUMN_NAMES, + "ORC file does not contain column names in the footer: " + path); + } + } + + static void verifyAcidSchema(OrcReader orcReader, Path path) + { + OrcColumn rootColumn = orcReader.getRootColumn(); + if (rootColumn.getNestedColumns().size() != 6) { + throw new PrestoException(HIVE_BAD_DATA, format("ORC ACID file should have 6 columns: %s", path)); + } + verifyAcidColumn(orcReader, 0, ACID_COLUMN_OPERATION, INT, path); + verifyAcidColumn(orcReader, 1, ACID_COLUMN_ORIGINAL_TRANSACTION, LONG, path); + verifyAcidColumn(orcReader, 2, ACID_COLUMN_BUCKET, INT, path); + verifyAcidColumn(orcReader, 3, ACID_COLUMN_ROW_ID, LONG, path); + verifyAcidColumn(orcReader, 4, ACID_COLUMN_CURRENT_TRANSACTION, LONG, path); + verifyAcidColumn(orcReader, 5, ACID_COLUMN_ROW_STRUCT, STRUCT, path); + } + + private static void verifyAcidColumn(OrcReader orcReader, int columnIndex, String columnName, OrcTypeKind columnType, Path path) + { + OrcColumn column = orcReader.getRootColumn().getNestedColumns().get(columnIndex); + if (!column.getColumnName().toLowerCase(ENGLISH).equals(columnName.toLowerCase(ENGLISH))) { + throw new PrestoException( + HIVE_BAD_DATA, + format("ORC ACID file column %s should be named %s: %s", columnIndex, columnName, path)); + } + if (column.getColumnType() != columnType) { + throw new PrestoException( + HIVE_BAD_DATA, + format("ORC ACID file %s column should be type %s: %s", columnName, columnType, path)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPushDownPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPushDownPageSource.java new file mode 100644 index 00000000..924e9e3c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcPushDownPageSource.java @@ -0,0 +1,125 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.orc; + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; +import com.huawei.boostkit.omnidata.reader.DataReader; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.util.PageSourceUtil; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_OPERATOR_OFFLOAD_FAIL; +import static java.util.Objects.requireNonNull; + +public class OrcPushDownPageSource + implements ConnectorPageSource +{ + private final DataReader dataReader; + private final DataSource dataSource; + private boolean closed; + private final AggregatedMemoryContext systemMemoryContext; + private final FileFormatDataSourceStats stats; + private long readTimeNanos; + private long readBytes; + + public OrcPushDownPageSource( + DataReader dataReader, + DataSource dataSource, + AggregatedMemoryContext systemMemoryContext, + FileFormatDataSourceStats stats) + { + this.dataReader = requireNonNull(dataReader, "dataReader is null"); + this.dataSource = requireNonNull(dataSource, "orcDataSource is null"); + this.stats = requireNonNull(stats, "stats is null"); + this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null"); + } + + @Override + public long getCompletedBytes() + { + return readBytes; + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + long start = System.nanoTime(); + + if (dataReader.isFinished()) { + close(); + return null; + } + + Page page = null; + try { + page = (Page) dataReader.getNextPageBlocking(); + } + catch (Exception exception) { + PageSourceUtil.closeWithSuppression(this, exception); + throw new PrestoException(HIVE_OPERATOR_OFFLOAD_FAIL, exception.getMessage()); + } + + readTimeNanos += System.nanoTime() - start; + if (page != null) { + readBytes += page.getSizeInBytes(); + } + + return page; + } + + @Override + public void close() + { + if (closed) { + return; + } + closed = true; + try { + dataReader.close(); + } + catch (Exception e) { + e.printStackTrace(); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("dataSource", dataSource.toString()) + .toString(); + } + + @Override + public long getSystemMemoryUsage() + { + return systemMemoryContext.getBytes(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSource.java new file mode 100644 index 00000000..4c131951 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSource.java @@ -0,0 +1,162 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.orc; + +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.orc.OrcCorruptionException; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcSelectiveRecordReader; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HivePageSourceProvider.ColumnMapping; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class OrcSelectivePageSource + implements ConnectorPageSource +{ + private final OrcSelectiveRecordReader recordReader; + private final OrcDataSource orcDataSource; + private final AggregatedMemoryContext systemMemoryContext; + private final FileFormatDataSourceStats stats; + + private boolean closed; + private final Type[] types; + + public OrcSelectivePageSource( + OrcSelectiveRecordReader recordReader, + OrcDataSource orcDataSource, + AggregatedMemoryContext systemMemoryContext, + FileFormatDataSourceStats stats, + List columnMappings, + TypeManager typeManager) + { + this.recordReader = requireNonNull(recordReader, "recordReader is null"); + this.orcDataSource = requireNonNull(orcDataSource, "orcDataSource is null"); + this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null"); + this.stats = requireNonNull(stats, "stats is null"); + + types = new Type[columnMappings.size()]; + for (int columnIndex = 0; columnIndex < columnMappings.size(); columnIndex++) { + ColumnMapping columnMapping = columnMappings.get(columnIndex); + HiveColumnHandle column = columnMapping.getHiveColumnHandle(); + + String name = column.getName(); + Type type = typeManager.getType(column.getTypeSignature()); + types[columnIndex] = type; + } + } + + @Override + public long getCompletedBytes() + { + return orcDataSource.getReadBytes(); + } + + @Override + public long getReadTimeNanos() + { + return orcDataSource.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + try { + Page page = recordReader.getNextPage(); + if (page == null) { + close(); + return null; + } + + return page; + } + catch (PrestoException e) { + closeWithSuppression(e); + throw e; + } + catch (OrcCorruptionException e) { + closeWithSuppression(e); + throw new PrestoException(HIVE_BAD_DATA, e); + } + catch (IOException | RuntimeException e) { + closeWithSuppression(e); + throw new PrestoException(HIVE_CURSOR_ERROR, format("Failed to read ORC file: %s", orcDataSource.getId()), e); + } + } + + 
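Editor's note: OrcSelectivePageSource above, like OrcPushDownPageSource, implements the ConnectorPageSource contract: getNextPage() may return null even before the source is finished, and close() must always be called. A hypothetical caller, not part of the patch, showing the typical drain loop:

    import io.prestosql.spi.Page;
    import io.prestosql.spi.connector.ConnectorPageSource;

    // Hypothetical caller sketch: drains a page source and counts the rows it produced.
    final class PageSourceDrainSketch
    {
        static long countRows(ConnectorPageSource pageSource)
                throws Exception
        {
            long rows = 0;
            try {
                while (!pageSource.isFinished()) {
                    Page page = pageSource.getNextPage();
                    if (page == null) {
                        // A null page is legal while the source is still running; poll again.
                        continue;
                    }
                    rows += page.getPositionCount();
                }
            }
            finally {
                // Always release the underlying reader and data source.
                pageSource.close();
            }
            return rows;
        }
    }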
@Override + public void close() + { + // some hive input formats are broken and bad things can happen if you close them multiple times + if (closed) { + return; + } + + closed = true; + + try { + stats.addMaxCombinedBytesPerRow(recordReader.getMaxCombinedBytesPerRow()); + recordReader.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public String toString() + { + return toStringHelper(this).toString(); + } + + @Override + public long getSystemMemoryUsage() + { + return systemMemoryContext.getBytes(); + } + + protected void closeWithSuppression(Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + close(); + } + catch (RuntimeException e) { + // Self-suppression not permitted + if (throwable != e) { + throwable.addSuppressed(e); + } + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSourceFactory.java new file mode 100644 index 00000000..37a3ddf0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/orc/OrcSelectivePageSourceFactory.java @@ -0,0 +1,717 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableBiMap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import io.airlift.units.DataSize; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.orc.OrcCacheProperties; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.orc.OrcColumn; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcDataSourceId; +import io.prestosql.orc.OrcDataSourceIdWithTimeStamp; +import io.prestosql.orc.OrcFileTail; +import io.prestosql.orc.OrcFileTailCacheKey; +import io.prestosql.orc.OrcReader; +import io.prestosql.orc.OrcSelectiveRecordReader; +import io.prestosql.orc.TupleDomainFilter; +import io.prestosql.orc.TupleDomainFilterUtils; +import io.prestosql.orc.TupleDomainOrcPredicate; +import io.prestosql.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder; +import io.prestosql.orc.metadata.OrcType.OrcTypeKind; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HivePageSourceProvider; +import io.prestosql.plugin.hive.HiveSelectivePageSourceFactory; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveUtil; +import io.prestosql.plugin.hive.coercions.HiveCoercer; +import io.prestosql.plugin.hive.orc.OrcPageSource.ColumnAdaptation; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.FixedPageSource; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PositionedReadable; +import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.hdfs.BlockMissingException; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.nullToEmpty; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Maps.uniqueIndex; +import static io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE; +import static 
io.prestosql.orc.metadata.OrcType.OrcTypeKind.INT; +import static io.prestosql.orc.metadata.OrcType.OrcTypeKind.LONG; +import static io.prestosql.orc.metadata.OrcType.OrcTypeKind.STRUCT; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcLazyReadSmallRanges; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxBufferSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxMergeDistance; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcMaxReadBlockSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcStreamBufferSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getOrcTinyStripeThreshold; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcBloomFiltersCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcBloomFiltersEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcFileTailCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcRowDataCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcRowIndexCacheEnabled; +import static io.prestosql.plugin.hive.HiveSessionProperties.isOrcStripeFooterCacheEnabled; +import static io.prestosql.plugin.hive.HiveUtil.typedPartitionKey; +import static io.prestosql.plugin.hive.orc.OrcPageSource.handleException; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static java.lang.String.format; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toMap; +import static org.apache.hadoop.hive.ql.io.AcidUtils.isFullAcidTable; + +public class OrcSelectivePageSourceFactory + implements HiveSelectivePageSourceFactory +{ + // ACID format column names + public static final String ACID_COLUMN_OPERATION = "operation"; + public static final String ACID_COLUMN_ORIGINAL_TRANSACTION = "originalTransaction"; + public static final String ACID_COLUMN_BUCKET = "bucket"; + public static final String ACID_COLUMN_ROW_ID = "rowId"; + public static final String ACID_COLUMN_CURRENT_TRANSACTION = "currentTransaction"; + public static final String ACID_COLUMN_ROW_STRUCT = "row"; + + private static final Pattern DEFAULT_HIVE_COLUMN_NAME_PATTERN = Pattern.compile("_col\\d+"); + private final TypeManager typeManager; + private final boolean useOrcColumnNames; + private final HdfsEnvironment hdfsEnvironment; + private final FileFormatDataSourceStats stats; + private final OrcCacheStore orcCacheStore; + private final DateTimeZone legacyTimeZone; + + @Inject + public OrcSelectivePageSourceFactory(TypeManager typeManager, HiveConfig config, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats, OrcCacheStore orcCacheStore) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + requireNonNull(config, "config is null"); + this.useOrcColumnNames = config.isUseOrcColumnNames(); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.stats = requireNonNull(stats, "stats is null"); + this.orcCacheStore = orcCacheStore; + this.legacyTimeZone = 
requireNonNull(config, "hiveConfig is null").getOrcLegacyDateTimeZone(); + } + + @Override + public Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + Map prefilledValues, + List outputColumns, + TupleDomain domainPredicate, + Optional>> additionPredicates, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + boolean splitCacheable, + List columnMappings, + Map coercers, + long dataSourceLastModifiedTime) + { + if (!HiveUtil.isDeserializerClass(schema, OrcSerde.class)) { + return Optional.empty(); + } + + // per HIVE-13040 and ORC-162, empty files are allowed + if (fileSize == 0) { + return Optional.of(new FixedPageSource(ImmutableList.of())); + } + OrcCacheProperties orcCacheProperties = new OrcCacheProperties( + isOrcFileTailCacheEnabled(session), + isOrcStripeFooterCacheEnabled(session), + isOrcRowIndexCacheEnabled(session), + isOrcBloomFiltersCacheEnabled(session), + isOrcRowDataCacheEnabled(session) && splitCacheable); + if (additionPredicates.isPresent() + && additionPredicates.get().size() > 0 + && !additionPredicates.get().get(0).isAll() + && !additionPredicates.get().get(0).isNone()) { + List pageSources = new ArrayList<>(); + List positions = new ArrayList<>(10); + + return Optional.of(createOrcPageSource( + hdfsEnvironment, + session, + configuration, + path, + start, + length, + fileSize, + columns, + useOrcColumnNames, + isFullAcidTable(Maps.fromProperties(schema)), + prefilledValues, + outputColumns, + domainPredicate, + legacyTimeZone, + typeManager, + getOrcMaxMergeDistance(session), + getOrcMaxBufferSize(session), + getOrcStreamBufferSize(session), + getOrcTinyStripeThreshold(session), + getOrcMaxReadBlockSize(session), + getOrcLazyReadSmallRanges(session), + isOrcBloomFiltersEnabled(session), + stats, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + orcCacheStore, + orcCacheProperties, + additionPredicates.orElseGet(() -> ImmutableList.of()), + positions, + columnMappings, + coercers, + dataSourceLastModifiedTime)); + + /* Todo(Nitin): For Append Pattern + appendPredicates.get().stream().forEach(newDomainPredicate -> + pageSources.add(createOrcPageSource( + hdfsEnvironment, + session.getUser(), + configuration, + path, + start, + length, + fileSize, + columns, + useOrcColumnNames, + isFullAcidTable(Maps.fromProperties(schema)), + prefilledValues, + outputColumns, + newDomainPredicate, + hiveStorageTimeZone, + typeManager, + getOrcMaxMergeDistance(session), + getOrcMaxBufferSize(session), + getOrcStreamBufferSize(session), + getOrcTinyStripeThreshold(session), + getOrcMaxReadBlockSize(session), + getOrcLazyReadSmallRanges(session), + isOrcBloomFiltersEnabled(session), + stats, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + orcCacheStore, + orcCacheProperties, + additionPredicates.orElseGet(() -> ImmutableList.of()), + positions))); + + // Create a Concatenating Page Source + return Optional.of(new OrcConcatPageSource(pageSources)); + */ + } + + return Optional.of(createOrcPageSource( + hdfsEnvironment, + session, + configuration, + path, + start, + length, + fileSize, + columns, + useOrcColumnNames, + isFullAcidTable(Maps.fromProperties(schema)), + prefilledValues, + outputColumns, + domainPredicate, + legacyTimeZone, + typeManager, + getOrcMaxMergeDistance(session), + getOrcMaxBufferSize(session), + getOrcStreamBufferSize(session), + getOrcTinyStripeThreshold(session), + 
getOrcMaxReadBlockSize(session), + getOrcLazyReadSmallRanges(session), + isOrcBloomFiltersEnabled(session), + stats, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + orcCacheStore, + orcCacheProperties, + ImmutableList.of(), + null, + columnMappings, + coercers, + dataSourceLastModifiedTime)); + } + + public static OrcSelectivePageSource createOrcPageSource( + HdfsEnvironment hdfsEnvironment, + ConnectorSession session, + Configuration configuration, + Path path, + long start, + long length, + long fileSize, + List columns, + boolean useOrcColumnNames, + boolean isFullAcid, + Map prefilledValues, + List outputColumns, + TupleDomain domainPredicate, + DateTimeZone hiveStorageTimeZone, + TypeManager typeManager, + DataSize maxMergeDistance, + DataSize maxBufferSize, + DataSize streamBufferSize, + DataSize tinyStripeThreshold, + DataSize maxReadBlockSize, + boolean lazyReadSmallRanges, + boolean orcBloomFiltersEnabled, + FileFormatDataSourceStats stats, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + OrcCacheStore orcCacheStore, + OrcCacheProperties orcCacheProperties, + List> disjunctDomains, + List positions, + List columnMappings, + Map coercers, + long dataSourceLastModifiedTime) + { + checkArgument(!domainPredicate.isNone(), "Unexpected NONE domain"); + String sessionUser = session.getUser(); + OrcDataSource orcDataSource; + try { + //Always create a lazy Stream. HDFS stream opened only when required. + FSDataInputStream inputStream = new FSDataInputStream(new LazyFSInputStream(() -> { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(sessionUser, path, configuration); + return hdfsEnvironment.doAs(sessionUser, () -> fileSystem.open(path)); + })); + orcDataSource = new HdfsOrcDataSource( + new OrcDataSourceId(path.toString()), + fileSize, + maxMergeDistance, + maxBufferSize, + streamBufferSize, + lazyReadSmallRanges, + inputStream, + stats, + dataSourceLastModifiedTime); + } + catch (Exception e) { + if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || + e instanceof FileNotFoundException) { + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e); + } + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e); + } + + AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext(); + try { + OrcDataSource readerLocalDataSource = OrcReader.wrapWithCacheIfTiny(orcDataSource, tinyStripeThreshold); + OrcFileTail fileTail; + if (orcCacheProperties.isFileTailCacheEnabled()) { + OrcDataSourceIdWithTimeStamp orcDataSourceIdWithTimeStamp = new OrcDataSourceIdWithTimeStamp(readerLocalDataSource.getId(), readerLocalDataSource.getLastModifiedTime()); + fileTail = orcCacheStore.getFileTailCache().get(new OrcFileTailCacheKey(orcDataSourceIdWithTimeStamp), () -> OrcSelectivePageSourceFactory.createFileTail(orcDataSource)); + } + else { + fileTail = OrcSelectivePageSourceFactory.createFileTail(orcDataSource); + } + OrcReader reader = new OrcReader(readerLocalDataSource, fileTail, maxMergeDistance, tinyStripeThreshold, maxReadBlockSize); + + List fileColumns = reader.getRootColumn().getNestedColumns(); + List fileReadColumns = isFullAcid ? new ArrayList<>(columns.size() + 3) : new ArrayList<>(columns.size()); + List fileReadTypes = isFullAcid ? 
new ArrayList<>(columns.size() + 3) : new ArrayList<>(columns.size()); + ImmutableList acidColumnNames = null; + List columnAdaptations = new ArrayList<>(columns.size()); + if (isFullAcid && fileColumns.size() != columns.size()) { // Skip the acid schema check in case of non-ACID files + acidColumnNames = ImmutableList.builder().add(ACID_COLUMN_ORIGINAL_TRANSACTION, + ACID_COLUMN_BUCKET, + ACID_COLUMN_ROW_ID).build(); + verifyAcidSchema(reader, path); + Map acidColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName()); + fileColumns = acidColumnsByName.get(ACID_COLUMN_ROW_STRUCT).getNestedColumns(); + + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ORIGINAL_TRANSACTION)); + fileReadTypes.add(BIGINT); + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_BUCKET)); + fileReadTypes.add(INTEGER); + fileReadColumns.add(acidColumnsByName.get(ACID_COLUMN_ROW_ID)); + fileReadTypes.add(BIGINT); + } + + Map fileColumnsByName = ImmutableMap.of(); + if (useOrcColumnNames || isFullAcid) { + verifyFileHasColumnNames(fileColumns, path); + + // Convert column names read from ORC files to lower case to be consistent with those stored in Hive Metastore + fileColumnsByName = uniqueIndex(fileColumns, orcColumn -> orcColumn.getColumnName()); + } + + TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder() + .setBloomFiltersEnabled(orcBloomFiltersEnabled); + Map effectivePredicateDomains = domainPredicate.getDomains() + .orElseThrow(() -> new IllegalArgumentException("Effective predicate is none")); + + /* Fixme(Nitin): If same-columns or conditions can be merged as TreeMap in optimization step; below code can be spared */ + Map disjunctPredicateDomains = new HashMap<>(); + disjunctDomains.stream() + .forEach(ap -> ap.getDomains().get().forEach((k, v) -> disjunctPredicateDomains.merge(k, v, (v1, v2) -> v1.union(v2)))); + + boolean hasParitionKeyORPredicate = disjunctPredicateDomains.keySet().stream().anyMatch(c -> c.isPartitionKey()); + Map> orDomains = new ConcurrentHashMap<>(); + Set missingColumns = new HashSet<>(); + for (HiveColumnHandle column : columns) { + OrcColumn orcColumn = null; + int missingColumn = -1; + if (useOrcColumnNames || isFullAcid) { + orcColumn = fileColumnsByName.get(column.getName()); + } + else if (column.getHiveColumnIndex() >= 0) { + if (column.getHiveColumnIndex() < fileColumns.size()) { + orcColumn = fileColumns.get(column.getHiveColumnIndex()); + } + else { + missingColumn = column.getHiveColumnIndex(); + } + } + + Type readType = typeManager.getType(column.getTypeSignature()); + if (orcColumn != null) { + int sourceIndex = fileReadColumns.size(); + columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex)); + fileReadColumns.add(orcColumn); + fileReadTypes.add(readType); + + Domain domain = effectivePredicateDomains.get(column); + if (domain != null) { + predicateBuilder.addColumn(orcColumn.getColumnId(), domain); + } + + domain = disjunctPredicateDomains.get(column); + if (!hasParitionKeyORPredicate && domain != null) { + predicateBuilder.addOrColumn(orcColumn.getColumnId(), domain); + orDomains.computeIfAbsent(column.getName(), l -> new ArrayList<>()).add(domain); + } + } + else if (isFullAcid && readType instanceof RowType && column.getName().equalsIgnoreCase("row__id")) { + HiveType hiveType = column.getHiveType(); + StructTypeInfo structTypeInfo = (StructTypeInfo) hiveType.getTypeInfo(); + ImmutableList.Builder builder = new ImmutableList.Builder<>(); + ArrayList fieldNames = 
structTypeInfo.getAllStructFieldNames(); + List adaptations = fieldNames.stream() + .map(acidColumnNames::indexOf) + .map(ColumnAdaptation::sourceColumn) + .collect(Collectors.toList()); + columnAdaptations.add(ColumnAdaptation.structColumn(structTypeInfo, adaptations)); + } + else if (missingColumn >= 0) { + missingColumns.add(missingColumn); + } + else { + columnAdaptations.add(ColumnAdaptation.nullColumn(readType)); + } + } + + // missingColumns are list of columns which are not part of file but being part of projection. + // This happens if a table was altered to add more columns. + predicateBuilder.setMissingColumns(missingColumns); + Map columnTypes = columns.stream() + .collect(toImmutableMap(HiveColumnHandle::getHiveColumnIndex, column -> typeManager.getType(column.getTypeSignature()))); + + Map columnNames = columns.stream() + .collect(toImmutableMap(HiveColumnHandle::getHiveColumnIndex, HiveColumnHandle::getName)); + + Map typedPrefilledValues = new HashMap<>(); + for (Map.Entry prefilledValue : prefilledValues.entrySet()) { + typedPrefilledValues.put(Integer.valueOf(prefilledValue.getKey().toString()), + typedPartitionKey(prefilledValue.getValue().toString(), columnTypes.get(prefilledValue.getKey()), columnNames.get(prefilledValue.getKey()))); + } + + // Convert the predicate to each column id wise. Will be used to associate as filter with each column reader + Map tupleDomainFilters = toTupleDomainFilters(domainPredicate, ImmutableBiMap.copyOf(columnNames).inverse()); + Map> orFilters = new HashMap<>(); + + disjunctDomains.stream() + .forEach(ap -> toTupleDomainFilters(ap, ImmutableBiMap.copyOf(columnNames).inverse()).entrySet().stream() + .forEach(td -> orFilters.computeIfAbsent(td.getKey(), list -> new ArrayList<>()).add(td.getValue()))); + + // domains still required by index (refer AbstractOrcRecordReader). + Map domainMap = effectivePredicateDomains.entrySet().stream().collect(toMap(e -> e.getKey().getName(), Map.Entry::getValue)); + OrcSelectiveRecordReader recordReader = reader.createSelectiveRecordReader( + fileColumns, + fileReadColumns, + fileReadTypes, + outputColumns, + columnTypes, + tupleDomainFilters, + typedPrefilledValues, + predicateBuilder.build(), + start, + length, + hiveStorageTimeZone, + systemMemoryUsage, + INITIAL_BATCH_SIZE, + exception -> handleException(orcDataSource.getId(), exception), + indexes, + domainMap, + orcCacheStore, + orcCacheProperties, + Optional.empty(), + orFilters, + positions, + HiveSessionProperties.isOrcPushdownDataCacheEnabled(session), + Maps.transformValues(coercers, Function.class::cast), + orDomains, + missingColumns); + + OrcDeletedRows deletedRows = new OrcDeletedRows( + path.getName(), + deleteDeltaLocations, + new OrcDeleteDeltaPageSourceFactory(sessionUser, + configuration, hdfsEnvironment, maxMergeDistance, maxBufferSize, streamBufferSize, + maxReadBlockSize, tinyStripeThreshold, lazyReadSmallRanges, orcBloomFiltersEnabled, stats), + sessionUser, + configuration, + hdfsEnvironment, + startRowOffsetOfFile); + + /* Todo(Nitin): Create a Separate OrcSelectivePageSource and Use MergingPageIterator + * to progressively scan and yeild pages. */ + + return new OrcSelectivePageSource( + recordReader, + orcDataSource, //TODO- Rajeev: Do we need to pass deleted rows here? 
+// deletedRows, +// isFullAcid && indexes.isPresent(), + systemMemoryUsage, + stats, + columnMappings, + typeManager); + } + catch (Exception e) { + try { + orcDataSource.close(); + } + catch (IOException ignored) { + } + if (e instanceof PrestoException) { + throw (PrestoException) e; + } + String message = splitError(e, path, start, length); + if (e instanceof BlockMissingException) { + throw new PrestoException(HIVE_MISSING_DATA, message, e); + } + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e); + } + } + + private static Map toTupleDomainFilters(TupleDomain domainPredicate, Map columnIndices) + { + // convert the predicate from column name based map to column id based map. + // toFilter is the function which actually will convert o corresponding Comparator. + // toFilter will be called lazily during initialization of corresponding column reader (createColumnReaders). + Map tupleDomainFilterMap = new HashMap<>(); + domainPredicate.transform(columnHandle -> columnIndices.get(columnHandle.getColumnName())).getDomains().get().forEach((k, v) -> tupleDomainFilterMap.put(k, TupleDomainFilterUtils.toFilter(v))); + return tupleDomainFilterMap; + } + + interface FSDataInputStreamProvider + { + FSDataInputStream provide() throws IOException; + } + + static class LazyFSInputStream + extends InputStream + implements Seekable, PositionedReadable + { + private FSDataInputStreamProvider fsDataInputStreamProvider; + private FSDataInputStream fsDataInputStream; + private boolean isStreamAvailable; + + public LazyFSInputStream(FSDataInputStreamProvider fsDataInputStreamProvider) + { + this.fsDataInputStreamProvider = fsDataInputStreamProvider; + } + + @Override + public int read(long position, byte[] buffer, int offset, int length) throws IOException + { + ensureActualStream(); + return fsDataInputStream.read(position, buffer, offset, length); + } + + @Override + public void readFully(long position, byte[] buffer, int offset, int length) throws IOException + { + ensureActualStream(); + fsDataInputStream.readFully(position, buffer, offset, length); + } + + @Override + public void readFully(long position, byte[] buffer) throws IOException + { + ensureActualStream(); + fsDataInputStream.readFully(position, buffer); + } + + @Override + public void seek(long pos) throws IOException + { + ensureActualStream(); + fsDataInputStream.seek(pos); + } + + @Override + public long getPos() throws IOException + { + ensureActualStream(); + return fsDataInputStream.getPos(); + } + + @Override + public boolean seekToNewSource(long targetPos) throws IOException + { + ensureActualStream(); + return fsDataInputStream.seekToNewSource(targetPos); + } + + @Override + public int read() throws IOException + { + ensureActualStream(); + return fsDataInputStream.read(); + } + + @Override + public void close() throws IOException + { + if (isStreamAvailable) { + fsDataInputStream.close(); + isStreamAvailable = false; + } + } + + private void ensureActualStream() throws IOException + { + if (isStreamAvailable) { + return; + } + synchronized (this) { + if (!isStreamAvailable) { + fsDataInputStream = fsDataInputStreamProvider.provide(); + } + } + isStreamAvailable = true; + } + } + + private static OrcFileTail createFileTail(OrcDataSource orcDataSource) throws IOException + { + return OrcFileTail.readFrom(orcDataSource, Optional.empty()); + } + + private static String splitError(Throwable t, Path path, long start, long length) + { + return format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, 
length, t.getMessage()); + } + + private static void verifyFileHasColumnNames(List columns, Path path) + { + if (!columns.isEmpty() && columns.stream().map(OrcColumn::getColumnName).allMatch(physicalColumnName -> DEFAULT_HIVE_COLUMN_NAME_PATTERN.matcher(physicalColumnName).matches())) { + throw new PrestoException( + HIVE_FILE_MISSING_COLUMN_NAMES, + "ORC file does not contain column names in the footer: " + path); + } + } + + static void verifyAcidSchema(OrcReader orcReader, Path path) + { + OrcColumn rootColumn = orcReader.getRootColumn(); + if (rootColumn.getNestedColumns().size() != 6) { + throw new PrestoException(HIVE_BAD_DATA, format("ORC ACID file should have 6 columns: %s", path)); + } + verifyAcidColumn(orcReader, 0, ACID_COLUMN_OPERATION, INT, path); + verifyAcidColumn(orcReader, 1, ACID_COLUMN_ORIGINAL_TRANSACTION, LONG, path); + verifyAcidColumn(orcReader, 2, ACID_COLUMN_BUCKET, INT, path); + verifyAcidColumn(orcReader, 3, ACID_COLUMN_ROW_ID, LONG, path); + verifyAcidColumn(orcReader, 4, ACID_COLUMN_CURRENT_TRANSACTION, LONG, path); + verifyAcidColumn(orcReader, 5, ACID_COLUMN_ROW_STRUCT, STRUCT, path); + } + + private static void verifyAcidColumn(OrcReader orcReader, int columnIndex, String columnName, OrcTypeKind columnType, Path path) + { + OrcColumn column = orcReader.getRootColumn().getNestedColumns().get(columnIndex); + if (!column.getColumnName().toLowerCase(ENGLISH).equals(columnName.toLowerCase(ENGLISH))) { + throw new PrestoException( + HIVE_BAD_DATA, + format("ORC ACID file column %s should be named %s: %s", columnIndex, columnName, path)); + } + if (column.getColumnType() != columnType) { + throw new PrestoException( + HIVE_BAD_DATA, + format("ORC ACID file %s column should be type %s: %s", columnName, columnType, path)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/HdfsParquetDataSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/HdfsParquetDataSource.java new file mode 100644 index 00000000..7711e714 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/HdfsParquetDataSource.java @@ -0,0 +1,115 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet; + +import io.prestosql.parquet.ParquetDataSource; +import io.prestosql.parquet.ParquetDataSourceId; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class HdfsParquetDataSource + implements ParquetDataSource +{ + private final ParquetDataSourceId id; + private final long size; + private final FSDataInputStream inputStream; + private long readTimeNanos; + private long readBytes; + private final FileFormatDataSourceStats stats; + + public HdfsParquetDataSource(ParquetDataSourceId id, long size, FSDataInputStream inputStream, FileFormatDataSourceStats stats) + { + this.id = requireNonNull(id, "id is null"); + this.size = size; + this.inputStream = inputStream; + this.stats = stats; + } + + @Override + public ParquetDataSourceId getId() + { + return id; + } + + @Override + public final long getReadBytes() + { + return readBytes; + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public final long getSize() + { + return size; + } + + @Override + public void close() + throws IOException + { + inputStream.close(); + } + + @Override + public final void readFully(long position, byte[] buffer) + { + readFully(position, buffer, 0, buffer.length); + } + + @Override + public final void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + { + readBytes += bufferLength; + + long start = System.nanoTime(); + readInternal(position, buffer, bufferOffset, bufferLength); + long currentReadTimeNanos = System.nanoTime() - start; + + readTimeNanos += currentReadTimeNanos; + stats.readDataBytesPerSecond(bufferLength, currentReadTimeNanos); + } + + private void readInternal(long position, byte[] buffer, int bufferOffset, int bufferLength) + { + try { + inputStream.readFully(position, buffer, bufferOffset, bufferLength); + } + catch (PrestoException e) { + // just in case there is a Presto wrapper or hook + throw e; + } + catch (Exception e) { + throw new PrestoException(HIVE_FILESYSTEM_ERROR, format("Error reading from %s at position %s", id, position), e); + } + } + + public static HdfsParquetDataSource buildHdfsParquetDataSource(FSDataInputStream inputStream, Path path, long fileSize, FileFormatDataSourceStats stats) + { + return new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), fileSize, inputStream, stats); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetColumnIOConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetColumnIOConverter.java new file mode 100644 index 00000000..4dc26ebd --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetColumnIOConverter.java @@ -0,0 +1,97 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.parquet; + +import com.google.common.collect.ImmutableList; +import io.prestosql.parquet.Field; +import io.prestosql.parquet.GroupField; +import io.prestosql.parquet.PrimitiveField; +import io.prestosql.parquet.RichColumnDescriptor; +import io.prestosql.spi.type.MapType; +import io.prestosql.spi.type.NamedTypeSignature; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignatureParameter; +import org.apache.parquet.io.ColumnIO; +import org.apache.parquet.io.GroupColumnIO; +import org.apache.parquet.io.PrimitiveColumnIO; + +import java.util.List; +import java.util.Locale; +import java.util.Optional; + +import static io.prestosql.parquet.ParquetTypeUtils.getArrayElementColumn; +import static io.prestosql.parquet.ParquetTypeUtils.getMapKeyValueColumn; +import static io.prestosql.parquet.ParquetTypeUtils.lookupColumnByName; +import static io.prestosql.spi.type.StandardTypes.ARRAY; +import static io.prestosql.spi.type.StandardTypes.MAP; +import static io.prestosql.spi.type.StandardTypes.ROW; +import static org.apache.parquet.io.ColumnIOUtil.columnDefinitionLevel; +import static org.apache.parquet.io.ColumnIOUtil.columnRepetitionLevel; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; + +final class ParquetColumnIOConverter +{ + private ParquetColumnIOConverter() {} + + public static Optional constructField(Type type, ColumnIO columnIO) + { + if (columnIO == null) { + return Optional.empty(); + } + boolean required = columnIO.getType().getRepetition() != OPTIONAL; + int repetitionLevel = columnRepetitionLevel(columnIO); + int definitionLevel = columnDefinitionLevel(columnIO); + if (ROW.equals(type.getTypeSignature().getBase())) { + GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; + List parameters = type.getTypeParameters(); + ImmutableList.Builder> fieldsBuilder = ImmutableList.builder(); + List fields = type.getTypeSignature().getParameters(); + boolean structHasParameters = false; + for (int i = 0; i < fields.size(); i++) { + NamedTypeSignature namedTypeSignature = fields.get(i).getNamedTypeSignature(); + String name = namedTypeSignature.getName().get().toLowerCase(Locale.ENGLISH); + Optional field = constructField(parameters.get(i), lookupColumnByName(groupColumnIO, name)); + structHasParameters |= field.isPresent(); + fieldsBuilder.add(field); + } + if (structHasParameters) { + return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, fieldsBuilder.build())); + } + return Optional.empty(); + } + if (MAP.equals(type.getTypeSignature().getBase())) { + GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; + MapType mapType = (MapType) type; + GroupColumnIO keyValueColumnIO = getMapKeyValueColumn(groupColumnIO); + if (keyValueColumnIO.getChildrenCount() != 2) { + return Optional.empty(); + } + Optional keyField = constructField(mapType.getKeyType(), keyValueColumnIO.getChild(0)); + Optional valueField = constructField(mapType.getValueType(), keyValueColumnIO.getChild(1)); + return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, 
ImmutableList.of(keyField, valueField))); + } + if (ARRAY.equals(type.getTypeSignature().getBase())) { + GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; + List types = type.getTypeParameters(); + if (groupColumnIO.getChildrenCount() != 1) { + return Optional.empty(); + } + Optional field = constructField(types.get(0), getArrayElementColumn(groupColumnIO.getChild(0))); + return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(field))); + } + PrimitiveColumnIO primitiveColumnIO = (PrimitiveColumnIO) columnIO; + RichColumnDescriptor column = new RichColumnDescriptor(primitiveColumnIO.getColumnDescriptor(), columnIO.getType().asPrimitiveType()); + return Optional.of(new PrimitiveField(type, repetitionLevel, definitionLevel, required, column, primitiveColumnIO.getId())); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSource.java new file mode 100644 index 00000000..c8336840 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSource.java @@ -0,0 +1,251 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet; + +import com.google.common.collect.ImmutableList; +import io.prestosql.parquet.Field; +import io.prestosql.parquet.ParquetCorruptionException; +import io.prestosql.parquet.reader.ParquetReader; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.LazyBlock; +import io.prestosql.spi.block.LazyBlockLoader; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.schema.MessageType; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.Optional; +import java.util.Properties; + +import static com.google.common.base.Preconditions.checkState; +import static io.prestosql.parquet.ParquetTypeUtils.getFieldIndex; +import static io.prestosql.parquet.ParquetTypeUtils.lookupColumnByName; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR; +import static java.util.Objects.requireNonNull; + +public class ParquetPageSource + implements ConnectorPageSource +{ + private static final int MAX_VECTOR_LENGTH = 1024; + + private final ParquetReader parquetReader; + private final MessageType fileSchema; + // for debugging heap dump + private final List columnNames; + private final List types; + private final List> fields; + + private final Block[] constantBlocks; + private final int[] hiveColumnIndexes; + + private int batchId; + private boolean closed; + private final boolean useParquetColumnNames; + + public ParquetPageSource( + ParquetReader parquetReader, + MessageType fileSchema, + MessageColumnIO messageColumnIO, + TypeManager typeManager, + Properties splitSchema, + List columns, + TupleDomain effectivePredicate, + boolean useParquetColumnNames) + { + requireNonNull(splitSchema, "splitSchema is null"); + requireNonNull(columns, "columns is null"); + requireNonNull(effectivePredicate, "effectivePredicate is null"); + this.parquetReader = requireNonNull(parquetReader, "parquetReader is null"); + this.fileSchema = requireNonNull(fileSchema, "fileSchema is null"); + this.useParquetColumnNames = useParquetColumnNames; + + int size = columns.size(); + this.constantBlocks = new Block[size]; + this.hiveColumnIndexes = new int[size]; + + ImmutableList.Builder namesBuilder = ImmutableList.builder(); + ImmutableList.Builder typesBuilder = ImmutableList.builder(); + ImmutableList.Builder> fieldsBuilder = ImmutableList.builder(); + for (int columnIndex = 0; columnIndex < size; columnIndex++) { + HiveColumnHandle column = columns.get(columnIndex); + checkState(column.getColumnType() == REGULAR, "column type must be regular"); + + String name = column.getName(); + Type type = typeManager.getType(column.getTypeSignature()); + + namesBuilder.add(name); + typesBuilder.add(type); + hiveColumnIndexes[columnIndex] = column.getHiveColumnIndex(); + + if (ParquetPageSourceFactory.getParquetType(column, fileSchema, useParquetColumnNames) == null) { + constantBlocks[columnIndex] = RunLengthEncodedBlock.create(type, null, MAX_VECTOR_LENGTH); + 
fieldsBuilder.add(Optional.empty()); + } + else { + String columnName = useParquetColumnNames ? name : fileSchema.getFields().get(column.getHiveColumnIndex()).getName(); + fieldsBuilder.add(ParquetColumnIOConverter.constructField(type, lookupColumnByName(messageColumnIO, columnName))); + } + } + types = typesBuilder.build(); + fields = fieldsBuilder.build(); + columnNames = namesBuilder.build(); + } + + @Override + public long getCompletedBytes() + { + return parquetReader.getDataSource().getReadBytes(); + } + + @Override + public long getReadTimeNanos() + { + return parquetReader.getDataSource().getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public long getSystemMemoryUsage() + { + return parquetReader.getSystemMemoryContext().getBytes(); + } + + @Override + public Page getNextPage() + { + try { + batchId++; + int batchSize = parquetReader.nextBatch(); + + if (closed || batchSize <= 0) { + close(); + return null; + } + + Block[] blocks = new Block[hiveColumnIndexes.length]; + for (int fieldId = 0; fieldId < blocks.length; fieldId++) { + if (constantBlocks[fieldId] != null) { + blocks[fieldId] = constantBlocks[fieldId].getRegion(0, batchSize); + } + else { + Type type = types.get(fieldId); + Optional field = fields.get(fieldId); + int fieldIndex; + if (useParquetColumnNames) { + fieldIndex = getFieldIndex(fileSchema, columnNames.get(fieldId)); + } + else { + fieldIndex = hiveColumnIndexes[fieldId]; + } + if (fieldIndex != -1 && field.isPresent()) { + blocks[fieldId] = new LazyBlock(batchSize, new ParquetBlockLoader(field.get())); + } + else { + blocks[fieldId] = RunLengthEncodedBlock.create(type, null, batchSize); + } + } + } + return new Page(batchSize, blocks); + } + catch (PrestoException e) { + closeWithSuppression(e); + throw e; + } + catch (RuntimeException e) { + closeWithSuppression(e); + throw new PrestoException(HIVE_CURSOR_ERROR, e); + } + } + + private void closeWithSuppression(Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + close(); + } + catch (RuntimeException e) { + // Self-suppression not permitted + if (e != throwable) { + throwable.addSuppressed(e); + } + } + } + + @Override + public void close() + { + if (closed) { + return; + } + closed = true; + + try { + parquetReader.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private final class ParquetBlockLoader + implements LazyBlockLoader + { + private final int expectedBatchId = batchId; + private final Field field; + private boolean loaded; + + public ParquetBlockLoader(Field field) + { + this.field = requireNonNull(field, "field is null"); + } + + @Override + public final void load(LazyBlock lazyBlock) + { + if (loaded) { + return; + } + + checkState(batchId == expectedBatchId); + + try { + Block block = parquetReader.readBlock(field); + lazyBlock.setBlock(block); + } + catch (ParquetCorruptionException e) { + throw new PrestoException(HIVE_BAD_DATA, e); + } + catch (IOException e) { + throw new PrestoException(HIVE_CURSOR_ERROR, e); + } + loaded = true; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSourceFactory.java new file mode 100644 index 00000000..3a818249 --- /dev/null +++ 
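Note: ParquetPageSource above returns pages of LazyBlock wrappers; each ParquetBlockLoader defers the actual Parquet column read until the block is first touched, so columns a downstream operator never accesses are never decoded. A rough consumer loop illustrating that behaviour follows; the sketch class name and the channel parameter are illustrative, and the page source is assumed to come from the factory below.

    import io.prestosql.spi.Page;
    import io.prestosql.spi.block.Block;
    import io.prestosql.spi.connector.ConnectorPageSource;

    import java.io.IOException;

    final class LazyReadSketch
    {
        private LazyReadSketch() {}

        // Count nulls in one column; only that column's ParquetBlockLoader runs,
        // the other lazy blocks in each page are never materialized.
        static long countNulls(ConnectorPageSource pageSource, int channel)
                throws IOException
        {
            long nulls = 0;
            while (!pageSource.isFinished()) {
                Page page = pageSource.getNextPage();
                if (page == null) {
                    continue;
                }
                Block block = page.getBlock(channel);
                for (int position = 0; position < block.getPositionCount(); position++) {
                    if (block.isNull(position)) {   // first access triggers the deferred read
                        nulls++;
                    }
                }
            }
            pageSource.close();
            return nulls;
        }
    }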
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPageSourceFactory.java @@ -0,0 +1,407 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.parquet; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.huawei.boostkit.omnidata.block.BlockDeserializer; +import com.huawei.boostkit.omnidata.model.TaskSource; +import com.huawei.boostkit.omnidata.model.datasource.DataSource; +import com.huawei.boostkit.omnidata.reader.DataReader; +import com.huawei.boostkit.omnidata.reader.DataReaderFactory; +import io.airlift.units.DataSize; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.parquet.ParquetCorruptionException; +import io.prestosql.parquet.ParquetDataSource; +import io.prestosql.parquet.RichColumnDescriptor; +import io.prestosql.parquet.predicate.Predicate; +import io.prestosql.parquet.reader.MetadataReader; +import io.prestosql.parquet.reader.ParquetReader; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HivePageSourceFactory; +import io.prestosql.plugin.hive.HivePartitionKey; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.heuristicindex.SplitMetadata; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.BlockMissingException; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.hadoop.metadata.FileMetaData; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.schema.MessageType; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static 
com.google.common.base.Strings.nullToEmpty; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_CLIENT_TARGET_LIST; +import static io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.prestosql.parquet.ParquetTypeUtils.getColumnIO; +import static io.prestosql.parquet.ParquetTypeUtils.getDescriptors; +import static io.prestosql.parquet.ParquetTypeUtils.getParquetTypeByName; +import static io.prestosql.parquet.predicate.PredicateUtils.buildPredicate; +import static io.prestosql.parquet.predicate.PredicateUtils.predicateMatches; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA; +import static io.prestosql.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.isFailOnCorruptedParquetStatistics; +import static io.prestosql.plugin.hive.HiveSessionProperties.isUseParquetColumnNames; +import static io.prestosql.plugin.hive.HiveUtil.getDeserializerClassName; +import static io.prestosql.plugin.hive.HiveUtil.shouldUseRecordReaderFromInputFormat; +import static io.prestosql.plugin.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource; +import static io.prestosql.plugin.hive.util.PageSourceUtil.buildPushdownContext; +import static io.prestosql.plugin.hive.util.PageSourceUtil.getSslConfiguredProperties; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.PRIMITIVE; + +public class ParquetPageSourceFactory + implements HivePageSourceFactory +{ + private static final Set PARQUET_SERDE_CLASS_NAMES = ImmutableSet.builder() + .add("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe") + .add("parquet.hive.serde.ParquetHiveSerDe") + .build(); + public static final String WRITER_TIME_ZONE_KEY = "writer.time.zone"; + + private final TypeManager typeManager; + private final HdfsEnvironment hdfsEnvironment; + private final FileFormatDataSourceStats stats; + + private final DateTimeZone timeZone; + private final ImmutableMap sslPropertyMap; + private String omniDataServerTarget; + + @Inject + public ParquetPageSourceFactory(TypeManager typeManager, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats, HiveConfig hiveConfig) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.stats = requireNonNull(stats, "stats is null"); + this.timeZone = requireNonNull(hiveConfig, "hiveConfig is null").getParquetDateTimeZone(); + this.sslPropertyMap = getSslConfiguredProperties(hiveConfig); + } + + @Override + public Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + Optional dynamicFilter, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime, + List partitionKeys, + OptionalInt bucketNumber, + Optional omniDataAddress, + HiveOffloadExpression 
offloadExpression) + { + if (!PARQUET_SERDE_CLASS_NAMES.contains(getDeserializerClassName( + schema))) { + return Optional.empty(); + } + + omniDataAddress.ifPresent(s -> omniDataServerTarget = s); + + if (offloadExpression.isPresent()) { + checkArgument(omniDataAddress.isPresent(), "omniDataAddress is empty"); + } + + if (HiveSessionProperties.isOmniDataEnabled(session) + && omniDataAddress.isPresent() + && offloadExpression.isPresent()) { + com.huawei.boostkit.omnidata.model.Predicate predicate = + buildPushdownContext(columns, offloadExpression, typeManager, + effectivePredicate, partitionKeys, bucketNumber, path); + return Optional.of(createParquetPushDownPageSource( + path, + start, + length, + fileSize, + predicate, + stats)); + } + + return createPageSource( + configuration, + session, + path, + start, + length, + fileSize, + schema, + columns, + effectivePredicate, + dynamicFilter, + deleteDeltaLocations, + startRowOffsetOfFile, + indexes, + splitMetadata, + splitCacheable, + dataSourceLastModifiedTime); + } + + @Override + public Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + Optional dynamicFilter, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime) + { + if (!PARQUET_SERDE_CLASS_NAMES.contains(getDeserializerClassName(schema)) || shouldUseRecordReaderFromInputFormat(configuration, schema)) { + return Optional.empty(); + } + + checkArgument(!deleteDeltaLocations.isPresent(), "Delete delta is not supported"); + + return Optional.of(createParquetPageSource( + hdfsEnvironment, + session.getUser(), + configuration, + path, + start, + length, + fileSize, + schema, + columns, + isUseParquetColumnNames(session), + isFailOnCorruptedParquetStatistics(session), + getParquetMaxReadBlockSize(session), + typeManager, + effectivePredicate, + stats, + timeZone)); + } + + public static ParquetPageSource createParquetPageSource( + HdfsEnvironment hdfsEnvironment, + String user, + Configuration configuration, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + boolean useParquetColumnNames, + boolean failOnCorruptedParquetStatistics, + DataSize maxReadBlockSize, + TypeManager typeManager, + TupleDomain effectivePredicate, + FileFormatDataSourceStats stats, + DateTimeZone timeZone) + { + AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext(); + + ParquetDataSource dataSource = null; + DateTimeZone readerTimeZone = timeZone; + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration); + FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.open(path)); + ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize); + FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); + MessageType fileSchema = fileMetaData.getSchema(); + dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats); + String writerTimeZoneId = fileMetaData.getKeyValueMetaData().get(WRITER_TIME_ZONE_KEY); + if (writerTimeZoneId != null && !writerTimeZoneId.equalsIgnoreCase(readerTimeZone.getID())) { + readerTimeZone = DateTimeZone.forID(writerTimeZoneId); + } + + List fields = columns.stream() + .filter(column -> column.getColumnType() == REGULAR) + .map(column -> 
getParquetType(column, fileSchema, useParquetColumnNames)) + .filter(Objects::nonNull) + .collect(toList()); + + MessageType requestedSchema = new MessageType(fileSchema.getName(), fields); + + ImmutableList.Builder footerBlocks = ImmutableList.builder(); + for (BlockMetaData block : parquetMetadata.getBlocks()) { + long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); + if (firstDataPage >= start && firstDataPage < start + length) { + footerBlocks.add(block); + } + } + + Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema); + TupleDomain parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate); + Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath); + final ParquetDataSource finalDataSource = dataSource; + ImmutableList.Builder blocks = ImmutableList.builder(); + for (BlockMetaData block : footerBlocks.build()) { + if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, failOnCorruptedParquetStatistics)) { + blocks.add(block); + } + } + MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema); + ParquetReader parquetReader = new ParquetReader( + messageColumnIO, + blocks.build(), + dataSource, + readerTimeZone, + systemMemoryContext, + maxReadBlockSize); + + return new ParquetPageSource( + parquetReader, + fileSchema, + messageColumnIO, + typeManager, + schema, + columns, + effectivePredicate, + useParquetColumnNames); + } + catch (Exception e) { + try { + if (dataSource != null) { + dataSource.close(); + } + } + catch (IOException ignored) { + } + if (e instanceof PrestoException) { + throw (PrestoException) e; + } + if (e instanceof ParquetCorruptionException) { + throw new PrestoException(HIVE_BAD_DATA, e); + } + if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || + e instanceof FileNotFoundException) { + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e); + } + String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage()); + if (e instanceof BlockMissingException) { + throw new PrestoException(HIVE_MISSING_DATA, message, e); + } + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e); + } + } + + public ParquetPushDownPageSource createParquetPushDownPageSource( + Path path, + long start, + long length, + long fileSize, + com.huawei.boostkit.omnidata.model.Predicate predicate, + FileFormatDataSourceStats stats) + { + AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext(); + Properties transProperties = new Properties(); + transProperties.put(GRPC_CLIENT_TARGET_LIST, omniDataServerTarget); + transProperties.putAll(sslPropertyMap); + + DataSource parquetPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource(path.toString(), start, length, false); + + TaskSource readTaskInfo = new TaskSource( + parquetPushDownDataSource, + predicate, + TaskSource.ONE_MEGABYTES); + DataReader dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new BlockDeserializer()); + + return new ParquetPushDownPageSource( + dataReader, + parquetPushDownDataSource, + systemMemoryUsage, + stats); + } + + public static TupleDomain getParquetTupleDomain(Map, RichColumnDescriptor> descriptorsByPath, TupleDomain effectivePredicate) + { + if (effectivePredicate.isNone()) { + return TupleDomain.none(); + } + + ImmutableMap.Builder predicate = ImmutableMap.builder(); + 
for (Entry entry : effectivePredicate.getDomains().get().entrySet()) { + HiveColumnHandle columnHandle = entry.getKey(); + // skip looking up predicates for complex types as Parquet only stores stats for primitives + if (!columnHandle.getHiveType().getCategory().equals(PRIMITIVE)) { + continue; + } + + RichColumnDescriptor descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName())); + if (descriptor != null) { + predicate.put(descriptor, entry.getValue()); + } + } + return TupleDomain.withColumnDomains(predicate.build()); + } + + public static org.apache.parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) + { + if (useParquetColumnNames) { + return getParquetTypeByName(column.getName(), messageType); + } + + if (column.getHiveColumnIndex() < messageType.getFieldCount()) { + return messageType.getType(column.getHiveColumnIndex()); + } + return null; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPushDownPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPushDownPageSource.java new file mode 100644 index 00000000..befa8c17 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetPushDownPageSource.java @@ -0,0 +1,120 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
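Note: createParquetPageSource above assigns row groups to a split by the offset of the first data page of the first column, and only afterwards prunes the survivors with the tuple-domain predicate. The split-assignment rule in isolation, as a sketch (the helper name is illustrative):

    import org.apache.parquet.hadoop.metadata.BlockMetaData;
    import org.apache.parquet.hadoop.metadata.ParquetMetadata;

    import java.util.ArrayList;
    import java.util.List;

    final class RowGroupSelectionSketch
    {
        private RowGroupSelectionSketch() {}

        // A row group belongs to this split iff its first column's first data page
        // starts inside [start, start + length); this keeps each row group owned by
        // exactly one split even when split boundaries cut through a row group.
        static List<BlockMetaData> rowGroupsForSplit(ParquetMetadata parquetMetadata, long start, long length)
        {
            List<BlockMetaData> selected = new ArrayList<>();
            for (BlockMetaData rowGroup : parquetMetadata.getBlocks()) {
                long firstDataPage = rowGroup.getColumns().get(0).getFirstDataPageOffset();
                if (firstDataPage >= start && firstDataPage < start + length) {
                    selected.add(rowGroup);
                }
            }
            return selected;
        }
    }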
+ */ +package io.prestosql.plugin.hive.parquet; + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; +import com.huawei.boostkit.omnidata.reader.DataReader; +import io.prestosql.memory.context.AggregatedMemoryContext; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.util.PageSourceUtil; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_OPERATOR_OFFLOAD_FAIL; + +public class ParquetPushDownPageSource + implements ConnectorPageSource +{ + private final DataReader dataReader; + private final DataSource dataSource; + private boolean closed; + private final AggregatedMemoryContext systemMemoryContext; + private final FileFormatDataSourceStats stats; + private long readTimeNanos; + private long readBytes; + + public ParquetPushDownPageSource(DataReader reader, DataSource dataSource, AggregatedMemoryContext systemMemoryContext, FileFormatDataSourceStats stats) + { + this.dataReader = reader; + this.dataSource = dataSource; + this.stats = stats; + this.systemMemoryContext = systemMemoryContext; + } + + @Override + public long getCompletedBytes() + { + return readBytes; + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + long start = System.nanoTime(); + + if (dataReader.isFinished()) { + close(); + return null; + } + + Page page = null; + try { + page = (Page) dataReader.getNextPageBlocking(); + } + catch (Exception e) { + PageSourceUtil.closeWithSuppression(this, e); + throw new PrestoException(HIVE_OPERATOR_OFFLOAD_FAIL, e.getMessage()); + } + + readTimeNanos += System.nanoTime() - start; + if (page != null) { + readBytes += page.getSizeInBytes(); + } + + return page; + } + + @Override + public long getSystemMemoryUsage() + { + return systemMemoryContext.getBytes(); + } + + @Override + public void close() + { + if (closed) { + return; + } + closed = true; + try { + dataReader.close(); + } + catch (Exception e) { + e.printStackTrace(); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("dataSource", dataSource.toString()) + .toString(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetRecordWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetRecordWriter.java new file mode 100644 index 00000000..2ce996dc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/parquet/ParquetRecordWriter.java @@ -0,0 +1,108 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
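Note: ParquetPushDownPageSource above does not touch HDFS itself; it drains pages produced by an OmniData server that applies the offloaded predicate remotely. A sketch of the wiring, limited to the calls already visible in createParquetPushDownPageSource above; generic parameters of DataReader are omitted, the sketch class name is illustrative, and omniDataTarget is assumed to be the host:port of an OmniData server.

    import com.huawei.boostkit.omnidata.block.BlockDeserializer;
    import com.huawei.boostkit.omnidata.model.Predicate;
    import com.huawei.boostkit.omnidata.model.TaskSource;
    import com.huawei.boostkit.omnidata.model.datasource.DataSource;
    import com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource;
    import com.huawei.boostkit.omnidata.reader.DataReader;
    import com.huawei.boostkit.omnidata.reader.DataReaderFactory;

    import java.util.Properties;

    import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_CLIENT_TARGET_LIST;

    final class OffloadReaderSketch
    {
        private OffloadReaderSketch() {}

        // Ship the split plus the offloaded predicate to OmniData and obtain the
        // DataReader that ParquetPushDownPageSource later drains page by page.
        static DataReader openReader(String omniDataTarget, String filePath, long start, long length, Predicate predicate)
        {
            Properties transProperties = new Properties();
            transProperties.put(GRPC_CLIENT_TARGET_LIST, omniDataTarget);

            DataSource split = new HdfsParquetDataSource(filePath, start, length, false);
            TaskSource taskSource = new TaskSource(split, predicate, TaskSource.ONE_MEGABYTES);
            return DataReaderFactory.create(transProperties, taskSource, new BlockDeserializer());
        }
    }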
+ */ +package io.prestosql.plugin.hive.parquet; + +import io.prestosql.plugin.hive.RecordFileWriter.ExtendedRecordWriter; +import io.prestosql.spi.connector.ConnectorSession; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; +import org.apache.hadoop.hive.ql.io.parquet.write.ParquetRecordWriterWrapper; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Reporter; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetOutputFormat; + +import java.io.IOException; +import java.lang.reflect.Field; +import java.util.Properties; + +import static io.prestosql.plugin.hive.HiveSessionProperties.getParquetWriterBlockSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.getParquetWriterPageSize; +import static java.util.Objects.requireNonNull; + +public final class ParquetRecordWriter + implements ExtendedRecordWriter +{ + private static final Field REAL_WRITER_FIELD; + private static final Field INTERNAL_WRITER_FIELD; + private static final Field FILE_WRITER_FIELD; + + static { + try { + REAL_WRITER_FIELD = ParquetRecordWriterWrapper.class.getDeclaredField("realWriter"); + INTERNAL_WRITER_FIELD = org.apache.parquet.hadoop.ParquetRecordWriter.class.getDeclaredField("internalWriter"); + FILE_WRITER_FIELD = INTERNAL_WRITER_FIELD.getType().getDeclaredField("parquetFileWriter"); + + REAL_WRITER_FIELD.setAccessible(true); + INTERNAL_WRITER_FIELD.setAccessible(true); + FILE_WRITER_FIELD.setAccessible(true); + } + catch (ReflectiveOperationException e) { + throw new AssertionError(e); + } + } + + public static RecordWriter create(Path target, JobConf conf, Properties properties, ConnectorSession session) + throws IOException, ReflectiveOperationException + { + conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes()); + conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes()); + + RecordWriter recordWriter = new MapredParquetOutputFormat() + .getHiveRecordWriter(conf, target, Text.class, false, properties, Reporter.NULL); + + Object realWriter = REAL_WRITER_FIELD.get(recordWriter); + Object internalWriter = INTERNAL_WRITER_FIELD.get(realWriter); + ParquetFileWriter fileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internalWriter); + + return new ParquetRecordWriter(recordWriter, fileWriter); + } + + private final RecordWriter recordWriter; + private final ParquetFileWriter fileWriter; + private long length; + + private ParquetRecordWriter(RecordWriter recordWriter, ParquetFileWriter fileWriter) + { + this.recordWriter = requireNonNull(recordWriter, "recordWriter is null"); + this.fileWriter = requireNonNull(fileWriter, "fileWriter is null"); + } + + @Override + public long getWrittenBytes() + { + return length; + } + + @Override + public void write(Writable value) + throws IOException + { + recordWriter.write(value); + length = fileWriter.getPos(); + } + + @Override + public void close(boolean abort) + throws IOException + { + recordWriter.close(abort); + if (!abort) { + length = fileWriter.getPos(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/HdfsRcFileDataSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/HdfsRcFileDataSource.java 
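Note: ParquetRecordWriter above exists mainly to expose accurate written-byte counts; it reflects into Hive's ParquetRecordWriterWrapper to reach the underlying ParquetFileWriter and reads its position after every write. The expected call pattern, roughly; the sketch names are illustrative, the values are assumed to be rows already serialized by the table's Parquet SerDe, and session, conf and table properties are assumed to be supplied by the engine.

    import io.prestosql.plugin.hive.RecordFileWriter.ExtendedRecordWriter;
    import io.prestosql.plugin.hive.parquet.ParquetRecordWriter;
    import io.prestosql.spi.connector.ConnectorSession;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.mapred.JobConf;

    import java.io.IOException;
    import java.util.Properties;

    final class ParquetWriteSketch
    {
        private ParquetWriteSketch() {}

        static long writeAll(ConnectorSession session, JobConf conf, Properties tableProperties,
                Path target, Iterable<? extends Writable> serializedRows)
                throws IOException, ReflectiveOperationException
        {
            RecordWriter writer = ParquetRecordWriter.create(target, conf, tableProperties, session);
            for (Writable row : serializedRows) {
                writer.write(row);       // length is refreshed from ParquetFileWriter.getPos() after each write
            }
            writer.close(false);         // abort = false: flush and capture the final file position
            return ((ExtendedRecordWriter) writer).getWrittenBytes();
        }
    }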
new file mode 100644 index 00000000..a62df3f6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/HdfsRcFileDataSource.java @@ -0,0 +1,96 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.rcfile; + +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.rcfile.RcFileDataSource; +import io.prestosql.rcfile.RcFileDataSourceId; +import org.apache.hadoop.fs.FSDataInputStream; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class HdfsRcFileDataSource + implements RcFileDataSource +{ + private final FSDataInputStream inputStream; + private final String path; + private final long size; + private final FileFormatDataSourceStats stats; + private long readTimeNanos; + private long readBytes; + + public HdfsRcFileDataSource(String path, FSDataInputStream inputStream, long size, FileFormatDataSourceStats stats) + { + this.path = requireNonNull(path, "path is null"); + this.inputStream = requireNonNull(inputStream, "inputStream is null"); + this.size = size; + checkArgument(size >= 0, "size is negative"); + this.stats = requireNonNull(stats, "stats is null"); + } + + @Override + public RcFileDataSourceId getId() + { + return new RcFileDataSourceId(path); + } + + @Override + public void close() + throws IOException + { + inputStream.close(); + } + + @Override + public long getReadBytes() + { + return readBytes; + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public long getSize() + { + return size; + } + + @Override + public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + long start = System.nanoTime(); + + inputStream.readFully(position, buffer, bufferOffset, bufferLength); + + long readDuration = System.nanoTime() - start; + stats.readDataBytesPerSecond(bufferLength, readDuration); + + readTimeNanos += readDuration; + readBytes += bufferLength; + } + + @Override + public String toString() + { + return path; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSource.java new file mode 100644 index 00000000..6e7479ac --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSource.java @@ -0,0 +1,246 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
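Note: HdfsRcFileDataSource below is a thin adapter from an HDFS FSDataInputStream to the RCFile reader's RcFileDataSource abstraction; its only extra duty is timing each positional read and reporting it to FileFormatDataSourceStats. Opening one is straightforward, as in this sketch (names are illustrative; the file system and stats objects are assumed to be provided by the connector):

    import io.prestosql.plugin.hive.FileFormatDataSourceStats;
    import io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    import java.io.IOException;

    final class RcFileDataSourceSketch
    {
        private RcFileDataSourceSketch() {}

        static HdfsRcFileDataSource open(FileSystem fileSystem, Path path, FileFormatDataSourceStats stats)
                throws IOException
        {
            long fileSize = fileSystem.getFileStatus(path).getLen();   // the constructor rejects negative sizes
            FSDataInputStream inputStream = fileSystem.open(path);
            return new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
        }
    }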
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.rcfile; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.rcfile.RcFileCorruptionException; +import io.prestosql.rcfile.RcFileReader; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.LazyBlock; +import io.prestosql.spi.block.LazyBlockLoader; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; + +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkState; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class RcFilePageSource + implements ConnectorPageSource +{ + private static final long GUESSED_MEMORY_USAGE = new DataSize(16, DataSize.Unit.MEGABYTE).toBytes(); + + private static final int NULL_ENTRY_SIZE = 0; + private final RcFileReader rcFileReader; + + private final List columnNames; + private final List types; + + private final Block[] constantBlocks; + private final int[] hiveColumnIndexes; + + private int pageId; + + private boolean closed; + + public RcFilePageSource(RcFileReader rcFileReader, List columns, TypeManager typeManager) + { + requireNonNull(rcFileReader, "rcReader is null"); + requireNonNull(columns, "columns is null"); + requireNonNull(typeManager, "typeManager is null"); + + this.rcFileReader = rcFileReader; + + int size = columns.size(); + + this.constantBlocks = new Block[size]; + this.hiveColumnIndexes = new int[size]; + + ImmutableList.Builder namesBuilder = ImmutableList.builder(); + ImmutableList.Builder typesBuilder = ImmutableList.builder(); + ImmutableList.Builder hiveTypesBuilder = ImmutableList.builder(); + for (int columnIndex = 0; columnIndex < columns.size(); columnIndex++) { + HiveColumnHandle column = columns.get(columnIndex); + + String name = column.getName(); + Type type = typeManager.getType(column.getTypeSignature()); + + namesBuilder.add(name); + typesBuilder.add(type); + hiveTypesBuilder.add(column.getHiveType()); + + hiveColumnIndexes[columnIndex] = column.getHiveColumnIndex(); + + if (hiveColumnIndexes[columnIndex] >= rcFileReader.getColumnCount()) { + // this file may contain fewer fields than what's declared in the schema + // this happens when additional columns are added to the hive table after files have been created + BlockBuilder blockBuilder = type.createBlockBuilder(null, 1, NULL_ENTRY_SIZE); + blockBuilder.appendNull(); + constantBlocks[columnIndex] = blockBuilder.build(); + } + } + types = typesBuilder.build(); + columnNames = namesBuilder.build(); + 
} + + @Override + public long getCompletedBytes() + { + return rcFileReader.getBytesRead(); + } + + @Override + public long getReadTimeNanos() + { + return rcFileReader.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public Page getNextPage() + { + try { + // advance in the current batch + pageId++; + + // if the batch has been consumed, read the next batch + int currentPageSize = rcFileReader.advance(); + if (currentPageSize < 0) { + close(); + return null; + } + + Block[] blocks = new Block[hiveColumnIndexes.length]; + for (int fieldId = 0; fieldId < blocks.length; fieldId++) { + if (constantBlocks[fieldId] != null) { + blocks[fieldId] = new RunLengthEncodedBlock(constantBlocks[fieldId], currentPageSize); + } + else { + blocks[fieldId] = createBlock(currentPageSize, fieldId); + } + } + + Page page = new Page(currentPageSize, blocks); + + return page; + } + catch (PrestoException e) { + closeWithSuppression(e); + throw e; + } + catch (RcFileCorruptionException e) { + closeWithSuppression(e); + throw new PrestoException(HIVE_BAD_DATA, format("Corrupted RC file: %s", rcFileReader.getId()), e); + } + catch (IOException | RuntimeException e) { + closeWithSuppression(e); + throw new PrestoException(HIVE_CURSOR_ERROR, format("Failed to read RC file: %s", rcFileReader.getId()), e); + } + } + + @Override + public void close() + throws IOException + { + // some hive input formats are broken and bad things can happen if you close them multiple times + if (closed) { + return; + } + closed = true; + + rcFileReader.close(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("columnNames", columnNames) + .add("types", types) + .toString(); + } + + @Override + public long getSystemMemoryUsage() + { + return GUESSED_MEMORY_USAGE; + } + + private void closeWithSuppression(Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + close(); + } + catch (Exception e) { + if (e != throwable) { + throwable.addSuppressed(e); + } + } + } + + private Block createBlock(int currentPageSize, int fieldId) + { + int hiveColumnIndex = hiveColumnIndexes[fieldId]; + + return new LazyBlock( + currentPageSize, + new RcFileBlockLoader(hiveColumnIndex)); + } + + private final class RcFileBlockLoader + implements LazyBlockLoader + { + private final int expectedBatchId = pageId; + private final int columnIndex; + private boolean loaded; + + public RcFileBlockLoader(int columnIndex) + { + this.columnIndex = columnIndex; + } + + @Override + public final void load(LazyBlock lazyBlock) + { + if (loaded) { + return; + } + + checkState(pageId == expectedBatchId); + + try { + Block block = rcFileReader.readBlock(columnIndex); + lazyBlock.setBlock(block); + } + catch (RcFileCorruptionException e) { + throw new PrestoException(HIVE_BAD_DATA, format("Corrupted RC file: %s", rcFileReader.getId()), e); + } + catch (IOException | RuntimeException e) { + throw new PrestoException(HIVE_CURSOR_ERROR, format("Failed to read RC file: %s", rcFileReader.getId()), e); + } + + loaded = true; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSourceFactory.java new file mode 100644 index 00000000..a55aa8ec --- /dev/null +++ 
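Note: RcFilePageSource above tolerates RCFiles written before columns were added to the table: any projected column whose index is beyond rcFileReader.getColumnCount() is served as a constant null block, expanded by run-length encoding to the size of each page. The substitution in isolation, as a sketch (the helper name is illustrative):

    import io.prestosql.spi.block.Block;
    import io.prestosql.spi.block.BlockBuilder;
    import io.prestosql.spi.block.RunLengthEncodedBlock;
    import io.prestosql.spi.type.Type;

    final class MissingColumnSketch
    {
        private MissingColumnSketch() {}

        // One null value, repeated positionCount times without copying.
        static Block allNulls(Type type, int positionCount)
        {
            BlockBuilder builder = type.createBlockBuilder(null, 1, 0);
            builder.appendNull();
            return new RunLengthEncodedBlock(builder.build(), positionCount);
        }
    }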
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rcfile/RcFilePageSourceFactory.java @@ -0,0 +1,237 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.rcfile; + +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.airlift.units.DataSize; +import io.airlift.units.DataSize.Unit; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HivePageSourceFactory; +import io.prestosql.rcfile.AircompressorCodecFactory; +import io.prestosql.rcfile.HadoopCodecFactory; +import io.prestosql.rcfile.RcFileCorruptionException; +import io.prestosql.rcfile.RcFileEncoding; +import io.prestosql.rcfile.RcFileReader; +import io.prestosql.rcfile.binary.BinaryRcFileEncoding; +import io.prestosql.rcfile.text.TextRcFileEncoding; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.heuristicindex.SplitMetadata; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.BlockMissingException; +import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; +import org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe; +import org.joda.time.DateTimeZone; + +import javax.inject.Inject; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.Properties; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.nullToEmpty; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT; +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_MISSING_DATA; +import static io.prestosql.plugin.hive.HiveUtil.getDeserializerClassName; +import static io.prestosql.rcfile.text.TextRcFileEncoding.DEFAULT_NULL_SEQUENCE; +import static io.prestosql.rcfile.text.TextRcFileEncoding.DEFAULT_SEPARATORS; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.hive.serde.serdeConstants.COLLECTION_DELIM; +import static org.apache.hadoop.hive.serde.serdeConstants.ESCAPE_CHAR; +import static 
org.apache.hadoop.hive.serde.serdeConstants.FIELD_DELIM; +import static org.apache.hadoop.hive.serde.serdeConstants.MAPKEY_DELIM; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_FORMAT; +import static org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters.SERIALIZATION_EXTEND_NESTING_LEVELS; +import static org.apache.hadoop.hive.serde2.lazy.LazyUtils.getByte; + +public class RcFilePageSourceFactory + implements HivePageSourceFactory +{ + private static final int TEXT_LEGACY_NESTING_LEVELS = 8; + private static final int TEXT_EXTENDED_NESTING_LEVELS = 29; + + private final TypeManager typeManager; + private final HdfsEnvironment hdfsEnvironment; + private final FileFormatDataSourceStats stats; + private final DateTimeZone timeZone; + + @Inject + public RcFilePageSourceFactory(TypeManager typeManager, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats, HiveConfig hiveConfig) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null"); + this.stats = requireNonNull(stats, "stats is null"); + this.timeZone = requireNonNull(hiveConfig, "hiveConfig is null").getRcfileDateTimeZone(); + } + + @Override + public Optional createPageSource( + Configuration configuration, + ConnectorSession session, + Path path, + long start, + long length, + long fileSize, + Properties schema, + List columns, + TupleDomain effectivePredicate, + Optional dynamicFilters, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Optional> indexes, + SplitMetadata splitMetadata, + boolean splitCacheable, + long dataSourceLastModifiedTime) + { + RcFileEncoding rcFileEncoding; + String deserializerClassName = getDeserializerClassName(schema); + if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) { + rcFileEncoding = new BinaryRcFileEncoding(timeZone); + } + else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) { + rcFileEncoding = createTextVectorEncoding(schema); + } + else { + return Optional.empty(); + } + + checkArgument(!deleteDeltaLocations.isPresent(), "Delete delta is not supported"); + + if (fileSize == 0) { + throw new PrestoException(HIVE_BAD_DATA, "RCFile is empty: " + path); + } + + FSDataInputStream inputStream; + try { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration); + inputStream = hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.open(path)); + } + catch (Exception e) { + if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || + e instanceof FileNotFoundException) { + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e); + } + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e); + } + + try { + ImmutableMap.Builder readColumns = ImmutableMap.builder(); + for (HiveColumnHandle column : columns) { + readColumns.put(column.getHiveColumnIndex(), column.getHiveType().getType(typeManager)); + } + + RcFileReader rcFileReader = new RcFileReader( + new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats), + rcFileEncoding, + readColumns.build(), + new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())), + start, + length, + new DataSize(8, Unit.MEGABYTE)); + + return Optional.of(new 
RcFilePageSource(rcFileReader, columns, typeManager)); + } + catch (Throwable e) { + try { + inputStream.close(); + } + catch (IOException ignored) { + } + if (e instanceof PrestoException) { + throw (PrestoException) e; + } + String message = splitError(e, path, start, length); + if (e instanceof RcFileCorruptionException) { + throw new PrestoException(HIVE_BAD_DATA, message, e); + } + if (e instanceof BlockMissingException) { + throw new PrestoException(HIVE_MISSING_DATA, message, e); + } + throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e); + } + } + + private static String splitError(Throwable t, Path path, long start, long length) + { + return format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, t.getMessage()); + } + + public static TextRcFileEncoding createTextVectorEncoding(Properties schema) + { + // separators + int nestingLevels; + if (!"true".equalsIgnoreCase(schema.getProperty(SERIALIZATION_EXTEND_NESTING_LEVELS))) { + nestingLevels = TEXT_LEGACY_NESTING_LEVELS; + } + else { + nestingLevels = TEXT_EXTENDED_NESTING_LEVELS; + } + byte[] separators = Arrays.copyOf(DEFAULT_SEPARATORS, nestingLevels); + + // the first three separators are set by old-old properties + separators[0] = getByte(schema.getProperty(FIELD_DELIM, schema.getProperty(SERIALIZATION_FORMAT)), DEFAULT_SEPARATORS[0]); + separators[1] = getByte(schema.getProperty(COLLECTION_DELIM), DEFAULT_SEPARATORS[1]); + separators[2] = getByte(schema.getProperty(MAPKEY_DELIM), DEFAULT_SEPARATORS[2]); + + // null sequence + Slice nullSequence; + String nullSequenceString = schema.getProperty(SERIALIZATION_NULL_FORMAT); + if (nullSequenceString == null) { + nullSequence = DEFAULT_NULL_SEQUENCE; + } + else { + nullSequence = Slices.utf8Slice(nullSequenceString); + } + + // last column takes rest + String lastColumnTakesRestString = schema.getProperty(SERIALIZATION_LAST_COLUMN_TAKES_REST); + boolean lastColumnTakesRest = "true".equalsIgnoreCase(lastColumnTakesRestString); + + // escaped + String escapeProperty = schema.getProperty(ESCAPE_CHAR); + Byte escapeByte = null; + if (escapeProperty != null) { + escapeByte = getByte(escapeProperty, (byte) '\\'); + } + + return new TextRcFileEncoding( + nullSequence, + separators, + escapeByte, + lastColumnTakesRest); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveFilterPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveFilterPushdown.java new file mode 100644 index 00000000..5e46f0dd --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveFilterPushdown.java @@ -0,0 +1,500 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
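Note: createTextVectorEncoding above rebuilds the text RCFile encoding from the table's serde properties: the field, collection and map-key delimiters, the null sequence, the escape character, and the "last column takes rest" flag. For a ColumnarSerDe table declared with non-default delimiters the relevant properties would look roughly like this sketch; the property values shown are examples, not defaults, and the sketch names are illustrative.

    import io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory;
    import io.prestosql.rcfile.text.TextRcFileEncoding;

    import java.util.Properties;

    import static org.apache.hadoop.hive.serde.serdeConstants.ESCAPE_CHAR;
    import static org.apache.hadoop.hive.serde.serdeConstants.FIELD_DELIM;
    import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_FORMAT;

    final class TextEncodingSketch
    {
        private TextEncodingSketch() {}

        static TextRcFileEncoding pipeDelimitedEncoding()
        {
            Properties schema = new Properties();
            schema.setProperty(FIELD_DELIM, "|");                  // becomes separators[0]
            schema.setProperty(SERIALIZATION_NULL_FORMAT, "\\N");  // null sequence "\N"
            schema.setProperty(ESCAPE_CHAR, "\\");                 // enables backslash escaping
            return RcFilePageSourceFactory.createTextVectorEncoding(schema);
        }
    }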
+ */ +package io.prestosql.plugin.hive.rule; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.BiMap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.huawei.boostkit.omnidata.expression.OmniExpressionChecker; +import io.airlift.log.Logger; +import io.prestosql.expressions.LogicalRowExpressions; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveMetadata; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HivePartitionManager; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.HiveTransactionManager; +import io.prestosql.spi.ConnectorPlanOptimizer; +import io.prestosql.spi.SymbolAllocator; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.connector.Constraint; +import io.prestosql.spi.function.FunctionMetadataManager; +import io.prestosql.spi.function.StandardFunctionResolution; +import io.prestosql.spi.metadata.TableHandle; +import io.prestosql.spi.plan.FilterNode; +import io.prestosql.spi.plan.FilterStatsCalculatorService; +import io.prestosql.spi.plan.PlanNode; +import io.prestosql.spi.plan.PlanNodeIdAllocator; +import io.prestosql.spi.plan.PlanVisitor; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.plan.ValuesNode; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.ConstantExpression; +import io.prestosql.spi.relation.DomainTranslator; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.RowExpressionService; +import io.prestosql.spi.relation.VariableReferenceExpression; +import io.prestosql.spi.statistics.Estimate; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.type.Type; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableBiMap.toImmutableBiMap; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.Sets.intersection; +import static io.prestosql.expressions.LogicalRowExpressions.FALSE_CONSTANT; +import static io.prestosql.expressions.LogicalRowExpressions.TRUE_CONSTANT; +import static io.prestosql.expressions.LogicalRowExpressions.extractConjuncts; +import static io.prestosql.expressions.RowExpressionNodeInliner.replaceExpression; +import static io.prestosql.plugin.hive.rule.HivePushdownUtil.checkStorageFormat; +import static io.prestosql.plugin.hive.rule.HivePushdownUtil.isColumnsCanOffload; +import static io.prestosql.spi.relation.DomainTranslator.BASIC_COLUMN_EXTRACTOR; +import static java.util.Objects.requireNonNull; + +public class 
HiveFilterPushdown + implements ConnectorPlanOptimizer +{ + private static final String DYNAMIC_FILTER_FUNCTION_NAME = "$internal$dynamic_filter_function"; + private static final Logger log = Logger.get(HiveFilterPushdown.class); + + private final HiveTransactionManager transactionManager; + private final RowExpressionService rowExpressionService; + private final StandardFunctionResolution functionResolution; + private final HivePartitionManager partitionManager; + private final FunctionMetadataManager functionMetadataManager; + private final FilterStatsCalculatorService filterCalculatorService; + + public HiveFilterPushdown( + HiveTransactionManager transactionManager, + RowExpressionService rowExpressionService, + StandardFunctionResolution functionResolution, + HivePartitionManager partitionManager, + FilterStatsCalculatorService filterCalculatorService, + FunctionMetadataManager functionMetadataManager) + { + this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + this.rowExpressionService = requireNonNull(rowExpressionService, "rowExpressionService is null"); + this.functionResolution = requireNonNull(functionResolution, "functionResolution is null"); + this.partitionManager = requireNonNull(partitionManager, "partitionManager is null"); + this.functionMetadataManager = requireNonNull(functionMetadataManager, "functionMetadataManager is null"); + this.filterCalculatorService = filterCalculatorService; + } + + @Override + public PlanNode optimize( + PlanNode maxSubPlan, + ConnectorSession session, + Map types, + SymbolAllocator symbolAllocator, + PlanNodeIdAllocator idAllocator) + { + if (!HiveSessionProperties.isOmniDataEnabled(session) || !HiveSessionProperties.isFilterOffloadEnabled(session)) { + return maxSubPlan; + } + return maxSubPlan.accept(new Visitor(session, idAllocator, types, symbolAllocator), null); + } + + private static ExpressionExtractResult extractOffloadExpression( + RowExpression predicate, + LogicalRowExpressions logicalRowExpressions, + RowExpressionService rowExpressionService) + { + List offloadList = new ArrayList<>(); + List remainingList = new ArrayList<>(); + List conjuncts = extractConjuncts(predicate); + for (RowExpression expression : conjuncts) { + // filter nondeterministic and dynamic filter expression + if (!rowExpressionService.getDeterminismEvaluator().isDeterministic(expression)) { + remainingList.add(expression); + continue; + } + if (expression instanceof CallExpression) { + CallExpression call = (CallExpression) expression; + if (call.getDisplayName().equals(DYNAMIC_FILTER_FUNCTION_NAME)) { + remainingList.add(expression); + continue; + } + } + + if (!OmniExpressionChecker.checkExpression(expression)) { + remainingList.add(expression); + continue; + } + + offloadList.add(expression); + } + + RowExpression offloadExpression = offloadList.isEmpty() ? TRUE_CONSTANT : logicalRowExpressions.combineConjuncts(offloadList); + RowExpression remainingExpression = remainingList.isEmpty() ? 
TRUE_CONSTANT : logicalRowExpressions.combineConjuncts(remainingList); + return new ExpressionExtractResult(offloadExpression, remainingExpression); + } + + private static boolean determineOffloadExpression( + RowExpression offloadExpression, + ConnectorTableHandle tableHandle, + HiveMetadata metadata, + ConnectorSession session, + RowExpressionService rowExpressionService, + Map columnHandlesMap, + FilterStatsCalculatorService filterCalculatorService, + Map typesMap) + { + // decompose expression + DomainTranslator translator = rowExpressionService.getDomainTranslator(); + DomainTranslator.ExtractionResult decomposedFilter = + translator.fromPredicate(session, offloadExpression, BASIC_COLUMN_EXTRACTOR); + TupleDomain entireColumnDomain = decomposedFilter.getTupleDomain() + .transform(variableName -> columnHandlesMap.get(variableName.getName())); + Constraint constraint; + if (TRUE_CONSTANT.equals(decomposedFilter.getRemainingExpression())) { + constraint = new Constraint(entireColumnDomain); + } + else { + // TODO: evaluator is needed? or getTupleDomain().isAll()? + ConstraintEvaluator evaluator = new ConstraintEvaluator(rowExpressionService, session, columnHandlesMap, decomposedFilter.getRemainingExpression()); + constraint = new Constraint(entireColumnDomain, evaluator::isCandidate); + } + return evaluateFilterBenefit(tableHandle, columnHandlesMap, metadata, filterCalculatorService, offloadExpression, constraint, session, typesMap); + } + + @VisibleForTesting + public static ConnectorPushdownFilterResult pushdownFilter( + HiveMetadata metadata, + ConnectorSession session, + ConnectorTableHandle tableHandle, + RowExpression predicate, + Map typesMap, + RowExpressionService rowExpressionService, + StandardFunctionResolution functionResolution, + FunctionMetadataManager functionMetadataManager, + FilterStatsCalculatorService filterCalculatorService) + { + checkArgument(!FALSE_CONSTANT.equals(predicate), "Cannot pushdown filter that is always false"); + checkArgument(tableHandle instanceof HiveTableHandle, "Only supports hive TableHandle"); + + LogicalRowExpressions logicalRowExpressions = + new LogicalRowExpressions(rowExpressionService.getDeterminismEvaluator(), functionResolution, functionMetadataManager); + ExpressionExtractResult expressionExtractResult = extractOffloadExpression(predicate, logicalRowExpressions, rowExpressionService); + if (TRUE_CONSTANT.equals(expressionExtractResult.getOffloadExpression())) { + return new ConnectorPushdownFilterResult(Optional.empty(), TRUE_CONSTANT); + } + + /// TODO: handle partition column? handle predicate in tableScan node? + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + Map columnHandlesMap = metadata.getColumnHandles(session, tableHandle); + HiveOffloadExpression oldOffloadExpression = hiveTableHandle.getOffloadExpression(); + RowExpression filterExpression = TRUE_CONSTANT.equals(oldOffloadExpression.getFilterExpression()) ? 
+ expressionExtractResult.getOffloadExpression() : logicalRowExpressions.combineConjuncts(oldOffloadExpression.getFilterExpression(), expressionExtractResult.getOffloadExpression()); + RowExpression optimizedExpression = filterExpression; + if (true != determineOffloadExpression(optimizedExpression, tableHandle, metadata, session, rowExpressionService, columnHandlesMap, filterCalculatorService, typesMap)) { + return new ConnectorPushdownFilterResult(Optional.empty(), TRUE_CONSTANT); + } + + Set offloadColumns = HivePushdownUtil.extractAll(optimizedExpression).stream() + .map(entry -> (HiveColumnHandle) columnHandlesMap.get(entry.getName())).collect(Collectors.toSet()); + Optional newTableHandle = + Optional.of(hiveTableHandle.withOffloadExpression(oldOffloadExpression.updateFilter(optimizedExpression, offloadColumns))); + return new ConnectorPushdownFilterResult(newTableHandle, expressionExtractResult.getRemainingExpression()); + } + + private static Map formSymbolsLayout(Map columnHandlesMap) + { + Map layout = new LinkedHashMap<>(); + int channel = 0; + for (Map.Entry entry : columnHandlesMap.entrySet()) { + layout.put(channel++, new Symbol(entry.getValue())); + } + return layout; + } + + private static boolean evaluateFilterBenefit( + ConnectorTableHandle tableHandle, + Map columnHandlesMap, + HiveMetadata metadata, + FilterStatsCalculatorService filterCalculatorService, + RowExpression predicate, + Constraint constraint, + ConnectorSession session, + Map typesMap) + { + // TODO: total data size + TableStatistics statistics = metadata.getTableStatistics(session, tableHandle, constraint, true); + if (statistics.getRowCount().isUnknown() || statistics.getRowCount().getValue() < HiveSessionProperties.getMinOffloadRowNumber(session)) { + log.info("Filter:Table %s row number[%d], expect min row number[%d], predicate[%s].", + tableHandle.getTableName(), + (long) statistics.getRowCount().getValue(), + HiveSessionProperties.getMinOffloadRowNumber(session), + predicate.toString()); + return false; + } + + Map allColumns = columnHandlesMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); + Map allColumnTypes = allColumns.entrySet().stream().collect(toImmutableMap( + entry -> entry.getValue(), entry -> metadata.getColumnMetadata(session, tableHandle, entry.getKey()).getType())); + Map symbolsMap = typesMap.entrySet().stream() + .collect(toImmutableMap(entry -> new Symbol(entry.getKey()), entry -> entry.getValue())); + TableStatistics filterStatistics = filterCalculatorService.filterStats(statistics, predicate, session, + allColumns, allColumnTypes, symbolsMap, formSymbolsLayout(allColumns)); + Estimate filteredRowCount = filterStatistics.getRowCount().isUnknown() ? 
statistics.getRowCount() : filterStatistics.getRowCount(); + double filterFactor = filteredRowCount.getValue() / statistics.getRowCount().getValue(); + if (filterFactor <= HiveSessionProperties.getMinFilterOffloadFactor(session)) { + log.info("Offloading: table %s, size[%d], predicate[%s], filter factor[%.2f%%].", + tableHandle.getTableName(), (long) statistics.getRowCount().getValue(), + predicate.toString(), filterFactor * 100); + return true; + } + else { + log.info("No need to offload: table %s, size[%d], predicate[%s], filter factor[%.2f%%].", + tableHandle.getTableName(), (long) statistics.getRowCount().getValue(), + predicate.toString(), filterFactor * 100); + } + return false; + } + + private static class ConnectorPushdownFilterResult + { + private final Optional tableHandle; + private final RowExpression remainingExpression; + + public ConnectorPushdownFilterResult( + Optional tableHandle, RowExpression remainingExpression) + { + this.tableHandle = requireNonNull(tableHandle, "handle is null"); + this.remainingExpression = requireNonNull(remainingExpression, "remainingExpression is null"); + } + + public Optional getTableHandle() + { + return tableHandle; + } + + public RowExpression getRemainingExpression() + { + return remainingExpression; + } + } + + private class Visitor + extends PlanVisitor + { + private final ConnectorSession session; + private final PlanNodeIdAllocator idAllocator; + private final Map types; + private final SymbolAllocator symbolAllocator; + + Visitor( + ConnectorSession session, + PlanNodeIdAllocator idAllocator, + Map types, + SymbolAllocator symbolAllocator) + { + this.session = requireNonNull(session, "session is null"); + this.idAllocator = requireNonNull(idAllocator, "idAllocator is null"); + this.types = requireNonNull(types, "types is null"); + this.symbolAllocator = requireNonNull(symbolAllocator, "symbolAllocator is null"); + } + + @Override + public PlanNode visitPlan(PlanNode node, Void context) + { + ImmutableList.Builder children = ImmutableList.builder(); + boolean changed = false; + for (PlanNode child : node.getSources()) { + PlanNode newChild = child.accept(this, null); + if (newChild != child) { + changed = true; + } + children.add(newChild); + } + + if (!changed) { + return node; + } + return node.replaceChildren(children.build()); + } + + private String getColumnName( + ConnectorSession session, + HiveMetadata metadata, + ConnectorTableHandle tableHandle, + ColumnHandle columnHandle) + { + return metadata.getColumnMetadata(session, tableHandle, columnHandle).getName(); + } + + @Override + public PlanNode visitFilter(FilterNode filterNode, Void context) + { + if (!(filterNode.getSource() instanceof TableScanNode)) { + return visitPlan(filterNode, context); + } + + TableScanNode tableScan = (TableScanNode) filterNode.getSource(); + if (!isOperatorOffloadSupported(session, tableScan.getTable())) { + return filterNode; + } + + if (!HivePushdownUtil.isOmniDataNodesNormal() || !isColumnsCanOffload(tableScan.getTable().getConnectorHandle(), tableScan.getOutputSymbols(), types)) { + return filterNode; + } + + RowExpression expression = filterNode.getPredicate(); + TableHandle tableHandle = tableScan.getTable(); + HiveMetadata hiveMetadata = getMetadata(tableHandle); + + BiMap symbolToColumnMapping = + tableScan.getAssignments().entrySet().stream().collect(toImmutableBiMap( + entry -> new VariableReferenceExpression(entry.getKey().getName(), types.get(entry.getKey().getName())), + entry -> new VariableReferenceExpression(getColumnName(session, 
hiveMetadata, + tableHandle.getConnectorHandle(), entry.getValue()), types.get(entry.getKey().getName())))); + RowExpression replacedExpression = replaceExpression(expression, symbolToColumnMapping); + // replaceExpression() may further optimize the expression; if the resulting expression is always false, + // then return empty Values node + if (FALSE_CONSTANT.equals(replacedExpression)) { + return new ValuesNode(idAllocator.getNextId(), tableScan.getOutputSymbols(), ImmutableList.of()); + } + + ConnectorPushdownFilterResult pushdownFilterResult = pushdownFilter(hiveMetadata, session, tableHandle.getConnectorHandle(), + replacedExpression, types, rowExpressionService, functionResolution, functionMetadataManager, filterCalculatorService); + if (!pushdownFilterResult.getTableHandle().isPresent()) { + return filterNode; + } + + TableHandle newTableHandle = + new TableHandle( + tableHandle.getCatalogName(), + pushdownFilterResult.getTableHandle().get(), + tableHandle.getTransaction(), + tableHandle.getLayout()); + TableScanNode newTableScan = + new TableScanNode( + tableScan.getId(), + newTableHandle, + tableScan.getOutputSymbols(), + tableScan.getAssignments(), + tableScan.getEnforcedConstraint(), + tableScan.getPredicate(), + tableScan.getStrategy(), + tableScan.getReuseTableScanMappingId(), + tableScan.getConsumerTableScanNodeCount(), + tableScan.isForDelete()); + + if (!TRUE_CONSTANT.equals(pushdownFilterResult.getRemainingExpression())) { + return new FilterNode( + idAllocator.getNextId(), newTableScan, + replaceExpression(pushdownFilterResult.getRemainingExpression(), symbolToColumnMapping.inverse())); + } + return newTableScan; + } + } + + private static class ConstraintEvaluator + { + private final Map assignments; + private final RowExpressionService evaluator; + private final ConnectorSession session; + private final RowExpression expression; + private final Set arguments; + + public ConstraintEvaluator(RowExpressionService evaluator, ConnectorSession session, Map assignments, RowExpression expression) + { + this.assignments = assignments; + this.evaluator = evaluator; + this.session = session; + this.expression = expression; + + arguments = ImmutableSet.copyOf(HivePushdownUtil.extractAll(expression)).stream() + .map(VariableReferenceExpression::getName) + .map(assignments::get) + .collect(toImmutableSet()); + } + + private boolean isCandidate(Map bindings) + { + if (intersection(bindings.keySet(), arguments).isEmpty()) { + return true; + } + + Function variableResolver = variable -> { + ColumnHandle column = assignments.get(variable.getName()); + checkArgument(column != null, "Missing column assignment for %s", variable); + + if (!bindings.containsKey(column)) { + return variable; + } + + return bindings.get(column).getValue(); + }; + + // If any conjuncts evaluate to FALSE or null, then the whole predicate will never be true and so the partition should be pruned + return !Boolean.FALSE.equals(expression) && expression != null && (!(expression instanceof ConstantExpression) || !((ConstantExpression) expression).isNull()); + } + } + + private HiveMetadata getMetadata(TableHandle tableHandle) + { + ConnectorMetadata metadata = transactionManager.get(tableHandle.getTransaction()); + checkState(metadata instanceof HiveMetadata, "metadata must be HiveMetadata"); + return (HiveMetadata) metadata; + } + + protected boolean isOperatorOffloadSupported(ConnectorSession session, TableHandle tableHandle) + { + ConnectorTableMetadata metadata = getMetadata(tableHandle).getTableMetadata(session, 
tableHandle.getConnectorHandle()); + return checkStorageFormat(metadata); + } + + public static class ExpressionExtractResult + { + private final RowExpression offloadExpression; + private final RowExpression remainingExpression; + + public ExpressionExtractResult(RowExpression offloadExpression, RowExpression remainingExpression) + { + this.offloadExpression = requireNonNull(offloadExpression, "offloadExpression is null"); + this.remainingExpression = requireNonNull(remainingExpression, "remainingExpression is null"); + } + + public RowExpression getOffloadExpression() + { + return offloadExpression; + } + + public RowExpression getRemainingExpression() + { + return remainingExpression; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveLimitPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveLimitPushdown.java new file mode 100644 index 00000000..d08b1c6d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveLimitPushdown.java @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.rule; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveMetadata; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.HiveTransactionManager; +import io.prestosql.spi.ConnectorPlanOptimizer; +import io.prestosql.spi.SymbolAllocator; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.metadata.TableHandle; +import io.prestosql.spi.plan.LimitNode; +import io.prestosql.spi.plan.PlanNode; +import io.prestosql.spi.plan.PlanNodeIdAllocator; +import io.prestosql.spi.plan.PlanVisitor; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.type.Type; + +import java.util.Map; +import java.util.OptionalLong; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.plugin.hive.rule.HivePushdownUtil.isColumnsCanOffload; +import static java.util.Objects.requireNonNull; + +public class HiveLimitPushdown + implements ConnectorPlanOptimizer +{ + private final HiveTransactionManager transactionManager; + + public HiveLimitPushdown(HiveTransactionManager transactionManager) + { + this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + } + + @Override + public PlanNode optimize( + PlanNode maxSubPlan, + ConnectorSession session, + Map types, + SymbolAllocator symbolAllocator, + PlanNodeIdAllocator idAllocator) + { + if (!HiveSessionProperties.isOmniDataEnabled(session)) { + return maxSubPlan; + } + return maxSubPlan.accept(new Visitor(types, session), null); + } + + private class Visitor + extends PlanVisitor + { + private final Map types; + private final ConnectorSession session; + + public Visitor(Map types, ConnectorSession session) + { + this.session = session; + this.types = types; + } + + @Override + public PlanNode visitPlan(PlanNode node, Void context) + { + ImmutableList.Builder children = ImmutableList.builder(); + boolean changed = false; + for (PlanNode child : node.getSources()) { + PlanNode newChild = child.accept(this, null); + if (newChild != child) { + changed = true; + } + children.add(newChild); + } + + if (!changed) { + return node; + } + return node.replaceChildren(children.build()); + } + + @Override + public PlanNode visitLimit(LimitNode limitNode, Void context) + { + if (!(limitNode.getSource() instanceof TableScanNode && limitNode.isPartial())) { + return visitPlan(limitNode, context); + } + checkArgument(limitNode.getCount() >= 0, "limit must be at least zero"); + TableScanNode tableScan = (TableScanNode) limitNode.getSource(); + TableHandle tableHandle = tableScan.getTable(); + + ConnectorMetadata connectorMetadata = transactionManager.get(tableHandle.getTransaction()); + if (!(connectorMetadata instanceof HiveMetadata)) { + return visitPlan(limitNode, context); + } + ConnectorTableMetadata metadata = connectorMetadata.getTableMetadata(session, tableHandle.getConnectorHandle()); + if (true != HivePushdownUtil.checkTableCanOffload(tableScan, metadata)) { + return visitPlan(limitNode, context); + } + + if (!HivePushdownUtil.isOmniDataNodesNormal() || !isColumnsCanOffload(tableHandle.getConnectorHandle(), tableScan.getOutputSymbols(), types)) { + return visitPlan(limitNode, context); + } + + HiveOffloadExpression offloadExpression = 
((HiveTableHandle) tableHandle.getConnectorHandle()).getOffloadExpression(); + TableHandle newTableHandle = + new TableHandle( + tableHandle.getCatalogName(), + ((HiveTableHandle) tableHandle.getConnectorHandle()).withOffloadExpression(offloadExpression.updateLimit(OptionalLong.of(limitNode.getCount()))), + tableHandle.getTransaction(), + tableHandle.getLayout()); + TableScanNode newTableScan = + new TableScanNode( + tableScan.getId(), + newTableHandle, + tableScan.getOutputSymbols(), + tableScan.getAssignments(), + tableScan.getEnforcedConstraint(), + tableScan.getPredicate(), + tableScan.getStrategy(), + tableScan.getReuseTableScanMappingId(), + tableScan.getConsumerTableScanNodeCount(), + tableScan.isForDelete()); + return newTableScan; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePartialAggregationPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePartialAggregationPushdown.java new file mode 100644 index 00000000..0621f138 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePartialAggregationPushdown.java @@ -0,0 +1,397 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.rule; + +import com.google.common.collect.BiMap; +import com.google.common.collect.ImmutableMap; +import com.huawei.boostkit.omnidata.expression.OmniExpressionChecker; +import com.huawei.boostkit.omnidata.model.AggregationInfo; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.HiveTableProperties; +import io.prestosql.plugin.hive.HiveTransactionManager; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveTypeTranslator; +import io.prestosql.plugin.hive.TransactionalMetadata; +import io.prestosql.spi.ConnectorPlanOptimizer; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.SymbolAllocator; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.function.FunctionMetadataManager; +import io.prestosql.spi.function.StandardFunctionResolution; +import io.prestosql.spi.metadata.TableHandle; +import io.prestosql.spi.plan.AggregationNode; +import io.prestosql.spi.plan.PlanNode; +import io.prestosql.spi.plan.PlanNodeIdAllocator; +import io.prestosql.spi.plan.PlanVisitor; +import io.prestosql.spi.plan.ProjectNode; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.VariableReferenceExpression; +import io.prestosql.spi.statistics.ColumnStatistics; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignature; + +import javax.inject.Inject; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableBiMap.toImmutableBiMap; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.expressions.RowExpressionNodeInliner.replaceExpression; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.DUMMY_OFFLOADED; +import static io.prestosql.plugin.hive.HiveColumnHandle.DUMMY_OFFLOADED_COLUMN_INDEX; +import static io.prestosql.plugin.hive.HiveColumnHandle.DUMMY_OFFLOADED_COLUMN_NAME; +import static io.prestosql.plugin.hive.rule.HivePushdownUtil.checkTableCanOffload; +import static io.prestosql.plugin.hive.rule.HivePushdownUtil.getDataSourceColumns; +import static io.prestosql.plugin.hive.rule.HivePushdownUtil.isColumnsCanOffload; +import static io.prestosql.spi.StandardErrorCode.NOT_FOUND; +import static io.prestosql.spi.connector.Constraint.alwaysTrue; +import static io.prestosql.spi.plan.AggregationNode.Step.PARTIAL; +import static java.util.Objects.requireNonNull; + +public class HivePartialAggregationPushdown + implements ConnectorPlanOptimizer +{ + private static final Logger log = Logger.get(HivePartialAggregationPushdown.class); + private static final double AGGREGATION_FACTOR_MAX = 1.0; + private static 
final double AGGREGATION_FACTOR_MIN = 0.0; + private final FunctionMetadataManager functionMetadataManager; + private final StandardFunctionResolution standardFunctionResolution; + private final Supplier metadataFactory; + private final HiveTransactionManager transactionManager; + + @Inject + public HivePartialAggregationPushdown( + HiveTransactionManager transactionManager, + FunctionMetadataManager functionMetadataManager, + StandardFunctionResolution standardFunctionResolution, + Supplier metadataFactory) + { + this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + this.functionMetadataManager = requireNonNull(functionMetadataManager, "function manager is null"); + this.standardFunctionResolution = requireNonNull(standardFunctionResolution, "standard function resolution is null"); + this.metadataFactory = requireNonNull(metadataFactory, "metadata factory is null"); + } + + private static Optional getHiveTableHandle(TableScanNode tableScanNode) + { + TableHandle table = tableScanNode.getTable(); + if (table != null) { + ConnectorTableHandle connectorHandle = table.getConnectorHandle(); + if (connectorHandle instanceof HiveTableHandle) { + return Optional.of((HiveTableHandle) connectorHandle); + } + } + return Optional.empty(); + } + + private static PlanNode replaceChildren(PlanNode node, List children) + { + return children.containsAll(node.getSources()) ? node : node.replaceChildren(children); + } + + @Override + public PlanNode optimize( + PlanNode maxSubplan, + ConnectorSession session, + Map types, + SymbolAllocator symbolAllocator, + PlanNodeIdAllocator idAllocator) + { + if (!HiveSessionProperties.isOmniDataEnabled(session) || !HiveSessionProperties.isAggregatorOffloadEnabled(session)) { + return maxSubplan; + } + return maxSubplan.accept(new Visitor(session, idAllocator, types, symbolAllocator), null); + } + + private class Visitor + extends PlanVisitor + { + private final PlanNodeIdAllocator idAllocator; + private final ConnectorSession session; + private final Map types; + private final SymbolAllocator symbolAllocator; + + public Visitor( + ConnectorSession session, + PlanNodeIdAllocator idAllocator, + Map types, + SymbolAllocator symbolAllocator) + { + this.session = session; + this.idAllocator = idAllocator; + this.symbolAllocator = symbolAllocator; + this.types = types; + } + + private boolean isAggregationPushdownSupported(AggregationNode partialAggregationNode) + { + if (!partialAggregationNode.getPreGroupedSymbols().isEmpty() + || partialAggregationNode.getHashSymbol().isPresent() + || partialAggregationNode.getGroupIdSymbol().isPresent()) { + return false; + } + + for (Map.Entry entry : partialAggregationNode.getAggregations().entrySet()) { + AggregationNode.Aggregation aggregation = entry.getValue(); + if (aggregation.isDistinct() || aggregation.getFilter().isPresent() + || aggregation.getMask().isPresent() || aggregation.getOrderingScheme().isPresent()) { + return false; + } + if (!OmniExpressionChecker.checkAggregateFunction(aggregation.getFunctionCall())) { + return false; + } + } + + TableScanNode tableScanNode; + if (partialAggregationNode.getSource() instanceof TableScanNode) { + tableScanNode = (TableScanNode) partialAggregationNode.getSource(); + } + else { + if (!(partialAggregationNode.getSource() instanceof ProjectNode) + || !(((ProjectNode) partialAggregationNode.getSource()).getSource() instanceof TableScanNode)) { + return false; + } + tableScanNode = (TableScanNode) (((ProjectNode) 
partialAggregationNode.getSource()).getSource()); + } + ConnectorTableMetadata connectorTableMetadata = metadataFactory.get().getTableMetadata(session, tableScanNode.getTable().getConnectorHandle()); + Optional rawFormat = Optional.ofNullable(connectorTableMetadata.getProperties().get(HiveTableProperties.STORAGE_FORMAT_PROPERTY)); + if (!rawFormat.isPresent()) { + return false; + } + + if (true != checkTableCanOffload(tableScanNode, connectorTableMetadata)) { + return false; + } + + TableHandle tableHandle = tableScanNode.getTable(); + checkArgument(tableHandle.getConnectorHandle() instanceof HiveTableHandle, "Only supports hive TableHandle"); + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle.getConnectorHandle(); + if (!isColumnsCanOffload(hiveTableHandle, tableScanNode.getOutputSymbols(), types)) { + return false; + } + + return true; + } + + private Optional buildAggregationInfo(AggregationNode partialAggregationNode, TableScanNode tableScanNode) + { + BiMap symbolToColumnMapping = + tableScanNode.getAssignments().entrySet().stream().collect(toImmutableBiMap( + entry -> new VariableReferenceExpression(entry.getKey().getName(), types.get(entry.getKey().getName())), + entry -> new VariableReferenceExpression(entry.getValue().getColumnName(), types.get(entry.getKey().getName())))); + + ImmutableMap.Builder aggregationsBuilder = ImmutableMap.builder(); + for (Map.Entry entry : partialAggregationNode.getAggregations().entrySet()) { + RowExpression expression = replaceExpression(entry.getValue().getFunctionCall(), symbolToColumnMapping); + checkArgument(expression instanceof CallExpression, "Replace result is not callExpression"); + CallExpression callExpression = (CallExpression) expression; + AggregationInfo.AggregateFunction function = new AggregationInfo.AggregateFunction(callExpression, entry.getValue().isDistinct()); + aggregationsBuilder.put(entry.getKey().getName(), function); + } + List groupingKeys = partialAggregationNode.getGroupingKeys().stream() + .map(entry -> replaceExpression(new VariableReferenceExpression(entry.getName(), + types.get(entry.getName())), symbolToColumnMapping)).collect(toImmutableList()); + AggregationInfo aggregationInfo = new AggregationInfo(aggregationsBuilder.build(), groupingKeys); + return Optional.of(aggregationInfo); + } + + private Optional tryProjectPushdown(AggregationNode aggregationNode) + { + if (aggregationNode.getSource() instanceof TableScanNode) { + return Optional.of((TableScanNode) aggregationNode.getSource()); + } + + if (!(aggregationNode.getSource() instanceof ProjectNode)) { + return Optional.empty(); + } + + ProjectNode projectNode = (ProjectNode) aggregationNode.getSource(); + return HiveProjectPushdown.tryProjectPushdown(projectNode, types); + } + + private Optional tryPartialAggregationPushdown(PlanNode plan) + { + if (!(plan instanceof AggregationNode && ((AggregationNode) plan).getStep().equals(PARTIAL))) { + return Optional.empty(); + } + AggregationNode partialAggregationNode = (AggregationNode) plan; + if (!((partialAggregationNode.getSource() instanceof TableScanNode) || + (partialAggregationNode.getSource() instanceof ProjectNode && ((ProjectNode) partialAggregationNode.getSource()).getSource() instanceof TableScanNode))) { + return Optional.empty(); + } + + if (!isAggregationPushdownSupported(partialAggregationNode) || !HivePushdownUtil.isOmniDataNodesNormal()) { + return Optional.empty(); + } + + double aggregationFactor = getAggregationFactor(partialAggregationNode, session, transactionManager); + if 
(aggregationFactor > HiveSessionProperties.getMinAggregatorOffloadFactor(session)) { + return Optional.empty(); + } + + Optional oldTableScanNode = tryProjectPushdown(partialAggregationNode); + if (!oldTableScanNode.isPresent()) { + return Optional.empty(); + } + TableHandle oldTableHandle = oldTableScanNode.get().getTable(); + HiveTableHandle hiveTableHandle = getHiveTableHandle(oldTableScanNode.get()).orElseThrow(() -> new PrestoException(NOT_FOUND, "Hive table handle not found")); + + HiveTypeTranslator hiveTypeTranslator = new HiveTypeTranslator(); + Map assignments = new HashMap<>(); + for (Map.Entry aggregationEntry : partialAggregationNode.getAggregations().entrySet()) { + CallExpression callExpression = aggregationEntry.getValue().getFunctionCall(); + ColumnHandle newColumnHandle; + TypeSignature typeSignature = callExpression.getType().getTypeSignature(); + if (callExpression.getArguments().isEmpty()) { + HiveType hiveType = HiveType.toHiveType(hiveTypeTranslator, callExpression.getType()); + newColumnHandle = new HiveColumnHandle(DUMMY_OFFLOADED_COLUMN_NAME, hiveType, typeSignature, DUMMY_OFFLOADED_COLUMN_INDEX, + DUMMY_OFFLOADED, Optional.of("partial aggregation pushed down " + aggregationEntry.getKey().getName()), false); + } + else { + RowExpression column = callExpression.getArguments().get(0); + if (!(column instanceof VariableReferenceExpression)) { + return Optional.empty(); + } + String columnName = column.toString(); + HiveColumnHandle columnHandle = (HiveColumnHandle) oldTableScanNode.get().getAssignments().get(new Symbol(columnName)); + HiveType hiveType; + if (callExpression.getType() instanceof RowType) { + hiveType = columnHandle.getHiveType(); + } + else { + hiveType = HiveType.toHiveType(hiveTypeTranslator, callExpression.getType()); + } + newColumnHandle = new HiveColumnHandle(columnHandle.getName(), hiveType, typeSignature, columnHandle.getHiveColumnIndex(), + DUMMY_OFFLOADED, Optional.of("partial aggregation pushed down " + aggregationEntry.getKey().getName()), false); + } + assignments.put(aggregationEntry.getKey(), newColumnHandle); + } + + for (Symbol symbol : partialAggregationNode.getGroupingKeys()) { + HiveColumnHandle groupingColumn = (HiveColumnHandle) oldTableScanNode.get().getAssignments().get(symbol); + groupingColumn = new HiveColumnHandle(groupingColumn.getName(), groupingColumn.getHiveType(), groupingColumn.getTypeSignature(), groupingColumn.getHiveColumnIndex(), + DUMMY_OFFLOADED, Optional.of("partial aggregation pushed down " + symbol.getName()), false); + assignments.put(symbol, groupingColumn); + } + + HiveOffloadExpression offloadExpression = hiveTableHandle.getOffloadExpression(); + HiveTableHandle newHiveTableHandle = hiveTableHandle.withOffloadExpression( + offloadExpression.updateAggregation(buildAggregationInfo(partialAggregationNode, oldTableScanNode.get()), getDataSourceColumns(oldTableScanNode.get()))); + TableHandle newTableHandle = new TableHandle( + oldTableHandle.getCatalogName(), + newHiveTableHandle, + oldTableHandle.getTransaction(), + oldTableHandle.getLayout()); + + TableScanNode newTableScan = + new TableScanNode( + oldTableScanNode.get().getId(), + newTableHandle, + partialAggregationNode.getOutputSymbols(), + assignments, + oldTableScanNode.get().getEnforcedConstraint(), + oldTableScanNode.get().getPredicate(), + oldTableScanNode.get().getStrategy(), + oldTableScanNode.get().getReuseTableScanMappingId(), + oldTableScanNode.get().getConsumerTableScanNodeCount(), + oldTableScanNode.get().isForDelete()); + 
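// Record the estimated aggregation selectivity for the scan that now carries the offloaded aggregation. +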
log.info("Offloading: table %s, aggregation factor[%.2f%%], aggregation[%s] .", + newTableHandle.getConnectorHandle().getTableName(), aggregationFactor * 100, + HiveOffloadExpression.aggregationInfoToString(newHiveTableHandle.getOffloadExpression().getAggregations().get())); + return Optional.of(newTableScan); + } + + @Override + public PlanNode visitPlan(PlanNode node, Void context) + { + Optional pushedDownPlan = tryPartialAggregationPushdown(node); + return pushedDownPlan.orElseGet(() -> replaceChildren( + node, + node.getSources().stream().map(source -> source.accept(this, null)).collect(toImmutableList()))); + } + + private double getAggregationFactor( + AggregationNode aggregationNode, + ConnectorSession connectorSession, + HiveTransactionManager hiveTransactionManager) + { + TableScanNode tableScanNode; + if (aggregationNode.getSource() instanceof TableScanNode) { + tableScanNode = (TableScanNode) aggregationNode.getSource(); + } + else if (aggregationNode.getSource() instanceof ProjectNode && ((ProjectNode) aggregationNode.getSource()).getSource() instanceof TableScanNode) { + tableScanNode = (TableScanNode) ((ProjectNode) aggregationNode.getSource()).getSource(); + } + else { + return AGGREGATION_FACTOR_MAX; + } + + ConnectorTableHandle connectorHandle = tableScanNode.getTable().getConnectorHandle(); + if (!(connectorHandle instanceof HiveTableHandle)) { + return AGGREGATION_FACTOR_MAX; + } + HiveTableHandle tableHandle = (HiveTableHandle) connectorHandle; + ConnectorMetadata metadata = hiveTransactionManager.get(tableScanNode.getTable().getTransaction()); + TableStatistics statistics = metadata.getTableStatistics(connectorSession, tableHandle, alwaysTrue(), true); + if (statistics.getRowCount().isUnknown() || statistics.getRowCount().getValue() < HiveSessionProperties.getMinOffloadRowNumber(connectorSession)) { + log.info("Aggregation:Table %s row number[%d], expect min row number[%d].", + tableHandle.getTableName(), + (long) statistics.getRowCount().getValue(), + HiveSessionProperties.getMinOffloadRowNumber(connectorSession)); + return AGGREGATION_FACTOR_MAX; + } + + if (aggregationNode.getGroupingKeys().isEmpty()) { + return AGGREGATION_FACTOR_MIN; + } + + double rowsCount = 1; + Map statisticsMap = statistics.getColumnStatistics() + .entrySet().stream().collect(Collectors.toMap(entry -> entry.getKey().getColumnName(), entry -> entry.getValue())); + for (Symbol symbol : aggregationNode.getGroupingKeys()) { + ColumnHandle columnHandle = tableScanNode.getAssignments().getOrDefault(symbol, null); + if (columnHandle == null) { + return AGGREGATION_FACTOR_MAX; + } + ColumnStatistics columnStatistics = statisticsMap.getOrDefault(columnHandle.getColumnName(), null); + if (columnStatistics == null || columnStatistics.getDistinctValuesCount().isUnknown()) { + return AGGREGATION_FACTOR_MAX; + } + int nullRow = (columnStatistics.getNullsFraction().getValue() == 0.0) ? 
0 : 1; + rowsCount *= columnStatistics.getDistinctValuesCount().getValue() + nullRow; + } + return rowsCount / statistics.getRowCount().getValue(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePlanOptimizerProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePlanOptimizerProvider.java new file mode 100644 index 00000000..871f757e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePlanOptimizerProvider.java @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.rule; + +import com.google.common.collect.ImmutableSet; +import com.google.inject.Inject; +import io.prestosql.plugin.hive.HivePartitionManager; +import io.prestosql.plugin.hive.HiveTransactionManager; +import io.prestosql.plugin.hive.TransactionalMetadata; +import io.prestosql.spi.ConnectorPlanOptimizer; +import io.prestosql.spi.connector.ConnectorPlanOptimizerProvider; +import io.prestosql.spi.function.FunctionMetadataManager; +import io.prestosql.spi.function.StandardFunctionResolution; +import io.prestosql.spi.plan.FilterStatsCalculatorService; +import io.prestosql.spi.relation.RowExpressionService; + +import java.util.Set; +import java.util.function.Supplier; + +import static java.util.Objects.requireNonNull; + +public class HivePlanOptimizerProvider + implements ConnectorPlanOptimizerProvider +{ + private final Set planOptimizers; + + @Inject + public HivePlanOptimizerProvider( + HiveTransactionManager transactionManager, + RowExpressionService rowExpressionService, + StandardFunctionResolution functionResolution, + HivePartitionManager partitionManager, + FunctionMetadataManager functionMetadataManager, + FilterStatsCalculatorService filterCalculatorService, + Supplier metadataFactory) + { + requireNonNull(transactionManager, "transactionManager is null"); + requireNonNull(rowExpressionService, "rowExpressionService is null"); + requireNonNull(functionResolution, "functionResolution is null"); + requireNonNull(partitionManager, "partitionManager is null"); + requireNonNull(functionMetadataManager, "functionMetadataManager is null"); + this.planOptimizers = ImmutableSet.of( + new HiveFilterPushdown(transactionManager, rowExpressionService, functionResolution, partitionManager, filterCalculatorService, functionMetadataManager), + new HivePartialAggregationPushdown(transactionManager, functionMetadataManager, functionResolution, metadataFactory), + new HiveLimitPushdown(transactionManager)); + } + + @Override + public Set getLogicalPlanOptimizers() + { + return planOptimizers; + } + + @Override + public Set getPhysicalPlanOptimizers() + { + // New filters may be created in between logical optimization and physical optimization. Push those newly created filters as well. 
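+ // Reusing the same optimizer set here allows filters introduced after logical planning to be offloaded too.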
+ return planOptimizers; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveProjectPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveProjectPushdown.java new file mode 100644 index 00000000..2084c5cf --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HiveProjectPushdown.java @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.rule; + +import com.google.common.collect.BiMap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.huawei.boostkit.omnidata.expression.OmniExpressionChecker; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveTypeTranslator; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.metadata.TableHandle; +import io.prestosql.spi.plan.ProjectNode; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.VariableReferenceExpression; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignature; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.collect.ImmutableBiMap.toImmutableBiMap; +import static io.prestosql.expressions.RowExpressionNodeInliner.replaceExpression; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.DUMMY_OFFLOADED; +import static io.prestosql.plugin.hive.HiveColumnHandle.DUMMY_OFFLOADED_COLUMN_INDEX; +import static io.prestosql.plugin.hive.rule.HivePushdownUtil.getDataSourceColumns; + +public class HiveProjectPushdown +{ + private HiveProjectPushdown() {} + + public static Optional tryProjectPushdown(ProjectNode plan, Map types) + { + if (!(plan.getSource() instanceof TableScanNode)) { + return Optional.empty(); + } + TableScanNode tableScanNode = (TableScanNode) plan.getSource(); + ConnectorTableHandle tableHandle = tableScanNode.getTable().getConnectorHandle(); + if (!(tableHandle instanceof HiveTableHandle) || + !(((HiveTableHandle) tableHandle).getOffloadExpression().getProjections().isEmpty())) { + return Optional.empty(); + } + + Map assignments = new HashMap<>(); + HiveTypeTranslator hiveTypeTranslator = new HiveTypeTranslator(); + for (Map.Entry entry : plan.getAssignments().entrySet()) { + String name = entry.getKey().getName(); + HiveType hiveType = HiveType.toHiveType(hiveTypeTranslator, entry.getValue().getType()); + TypeSignature typeSignature = entry.getValue().getType().getTypeSignature(); + 
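// Each projected symbol is mapped to a synthetic DUMMY_OFFLOADED column so the scan can return the pushed-down projection result. +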
HiveColumnHandle columnHandle = new HiveColumnHandle(name, hiveType, typeSignature, + DUMMY_OFFLOADED_COLUMN_INDEX, DUMMY_OFFLOADED, Optional.of("projections pushed down " + name)); + assignments.put(entry.getKey(), columnHandle); + } + + BiMap variableToColumnMapping = + tableScanNode.getAssignments().entrySet().stream().collect(toImmutableBiMap( + entry -> new VariableReferenceExpression(entry.getKey().getName(), types.get(entry.getKey().getName())), + entry -> new VariableReferenceExpression(entry.getValue().getColumnName(), types.get(entry.getKey().getName())))); + ImmutableMap.Builder projections = new ImmutableMap.Builder<>(); + for (Map.Entry entry : plan.getAssignments().getMap().entrySet()) { + RowExpression expression = replaceExpression(entry.getValue(), variableToColumnMapping); + if (!OmniExpressionChecker.checkExpression(expression)) { + return Optional.empty(); + } + projections.put(entry.getKey(), expression); + } + + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + HiveOffloadExpression offloadExpression = hiveTableHandle.getOffloadExpression().updateProjections(projections.build(), getDataSourceColumns(tableScanNode)); + HiveTableHandle newHiveTableHandle = hiveTableHandle.withOffloadExpression(offloadExpression); + TableHandle newTableHandle = new TableHandle( + tableScanNode.getTable().getCatalogName(), + newHiveTableHandle, + tableScanNode.getTable().getTransaction(), + tableScanNode.getTable().getLayout()); + return Optional.of(new TableScanNode(tableScanNode.getId(), newTableHandle, ImmutableList.copyOf(assignments.keySet()), assignments, + tableScanNode.getEnforcedConstraint(), tableScanNode.getPredicate(), tableScanNode.getStrategy(), tableScanNode.getReuseTableScanMappingId(), + tableScanNode.getConsumerTableScanNodeCount(), tableScanNode.isForDelete())); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePushdownUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePushdownUtil.java new file mode 100644 index 00000000..e86e53eb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/rule/HivePushdownUtil.java @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.rule; + +import com.google.common.collect.ImmutableSet; +import com.huawei.boostkit.omnidata.expression.OmniExpressionChecker; +import io.prestosql.expressions.DefaultRowExpressionTraversalVisitor; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveStorageFormat; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.omnidata.OmniDataNodeManager; +import io.prestosql.plugin.hive.omnidata.OmniDataNodeStatus; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.VariableReferenceExpression; +import io.prestosql.spi.type.Type; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.expressions.LogicalRowExpressions.TRUE_CONSTANT; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.DUMMY_OFFLOADED; +import static io.prestosql.plugin.hive.HiveTableProperties.getHiveStorageFormat; + +public class HivePushdownUtil +{ + private static final double OMNIDATA_BUSY_PERCENT = 0.8; + private static Optional omniDataNodeManager = Optional.empty(); + + private HivePushdownUtil() {} + + public static Set getDataSourceColumns(TableScanNode node) + { + ImmutableSet.Builder builder = new ImmutableSet.Builder<>(); + for (Map.Entry entry : node.getAssignments().entrySet()) { + HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) entry.getValue(); + if (hiveColumnHandle.getColumnType().equals(DUMMY_OFFLOADED)) { + continue; + } + builder.add(hiveColumnHandle); + } + + return builder.build(); + } + + public static Set extractAll(RowExpression expression) + { + ImmutableSet.Builder builder = ImmutableSet.builder(); + expression.accept(new VariableReferenceBuilderVisitor(), builder); + return builder.build(); + } + + public static class VariableReferenceBuilderVisitor + extends DefaultRowExpressionTraversalVisitor> + { + @Override + public Void visitVariableReference(VariableReferenceExpression variable, ImmutableSet.Builder builder) + { + builder.add(variable); + return null; + } + } + + public static boolean isColumnsCanOffload(ConnectorTableHandle tableHandle, List outputSymbols, Map typesMap) + { + // just for performance, avoid query types + if (tableHandle instanceof HiveTableHandle) { + HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle; + if (hiveTableHandle.getOffloadExpression().isPresent()) { + return true; + } + } + + for (Symbol symbol : outputSymbols) { + Type type = typesMap.get(symbol.getName()); + if (!OmniExpressionChecker.checkType(type)) { + return false; + } + } + return true; + } + + public static void setOmniDataNodeManager(OmniDataNodeManager manager) + { + omniDataNodeManager = Optional.of(manager); + } + + /** + * Check that all omniDatas nodes are in the normal state. 
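+ * A node is treated as overloaded once its running task number exceeds OMNIDATA_BUSY_PERCENT of its maximum task number.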
+ * + * @return true/false + */ + public static boolean isOmniDataNodesNormal() + { + if (!omniDataNodeManager.isPresent()) { + return false; + } + Map nodeStatusMap = omniDataNodeManager.get().getAllNodes(); + // TODO: check online node number + for (Map.Entry entry : nodeStatusMap.entrySet()) { + OmniDataNodeStatus omniDataNode = entry.getValue(); + if (omniDataNode.getRunningTaskNumber() > omniDataNode.getMaxTaskNumber() * OMNIDATA_BUSY_PERCENT) { + return false; + } + } + return true; + } + + public static boolean checkStorageFormat(ConnectorTableMetadata metadata) + { + HiveStorageFormat hiveStorageFormat = getHiveStorageFormat(metadata.getProperties()); + if (hiveStorageFormat == null) { + return false; + } + if (hiveStorageFormat != HiveStorageFormat.ORC && hiveStorageFormat != HiveStorageFormat.PARQUET + && hiveStorageFormat != HiveStorageFormat.CSV && hiveStorageFormat != HiveStorageFormat.TEXTFILE) { + return false; + } + return true; + } + + public static boolean checkTableCanOffload(TableScanNode tableScanNode, ConnectorTableMetadata metadata) + { + // if there are filter conditions added by other optimizers, then return false + if (tableScanNode.getPredicate().isPresent() && !tableScanNode.getPredicate().get().equals(TRUE_CONSTANT)) { + return false; + } + if (!tableScanNode.getEnforcedConstraint().isAll() && !tableScanNode.getEnforcedConstraint().isNone()) { + return false; + } + checkArgument(tableScanNode.getTable().getConnectorHandle() instanceof HiveTableHandle); + HiveTableHandle tableHandle = (HiveTableHandle) tableScanNode.getTable().getConnectorHandle(); + if (!tableHandle.getPredicateColumns().isEmpty() || tableHandle.getDisjunctCompactEffectivePredicate().isPresent()) { + return false; + } + if (!tableHandle.getEnforcedConstraint().isNone() && !tableHandle.getEnforcedConstraint().isAll()) { + return false; + } + if (!tableHandle.getCompactEffectivePredicate().isNone() && !tableHandle.getCompactEffectivePredicate().isAll()) { + return false; + } + + if (true != checkStorageFormat(metadata)) { + return false; + } + + return true; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/ConfigurationInitializer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/ConfigurationInitializer.java new file mode 100644 index 00000000..1a74d883 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/ConfigurationInitializer.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.s3; + +import org.apache.hadoop.conf.Configuration; + +public interface ConfigurationInitializer +{ + void initializeConfiguration(Configuration config); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Config.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Config.java new file mode 100644 index 00000000..12f7e5ac --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Config.java @@ -0,0 +1,419 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.google.common.base.StandardSystemProperty; +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import io.airlift.configuration.ConfigSecuritySensitive; +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import io.airlift.units.MinDataSize; +import io.airlift.units.MinDuration; + +import javax.validation.constraints.Min; +import javax.validation.constraints.NotNull; + +import java.io.File; +import java.util.concurrent.TimeUnit; + +import static io.airlift.units.DataSize.Unit.MEGABYTE; + +public class HiveS3Config +{ + private String s3AwsAccessKey; + private String s3AwsSecretKey; + private String s3Endpoint; + private PrestoS3SignerType s3SignerType; + private boolean s3PathStyleAccess; + private boolean s3UseInstanceCredentials = true; + private String s3IamRole; + private boolean s3SslEnabled = true; + private boolean s3SseEnabled; + private PrestoS3SseType s3SseType = PrestoS3SseType.S3; + private String s3EncryptionMaterialsProvider; + private String s3KmsKeyId; + private String s3SseKmsKeyId; + private int s3MaxClientRetries = 5; + private int s3MaxErrorRetries = 10; + private Duration s3MaxBackoffTime = new Duration(10, TimeUnit.MINUTES); + private Duration s3MaxRetryTime = new Duration(10, TimeUnit.MINUTES); + private Duration s3ConnectTimeout = new Duration(5, TimeUnit.SECONDS); + private Duration s3SocketTimeout = new Duration(5, TimeUnit.SECONDS); + private int s3MaxConnections = 500; + private File s3StagingDirectory = new File(StandardSystemProperty.JAVA_IO_TMPDIR.value()); + private DataSize s3MultipartMinFileSize = new DataSize(16, MEGABYTE); + private DataSize s3MultipartMinPartSize = new DataSize(5, MEGABYTE); + private boolean pinS3ClientToCurrentRegion; + private String s3UserAgentPrefix = ""; + private PrestoS3AclType s3AclType = PrestoS3AclType.PRIVATE; + private boolean skipGlacierObjects; + + public String getS3AwsAccessKey() + { + return s3AwsAccessKey; + } + + @Config("hive.s3.aws-access-key") + public HiveS3Config setS3AwsAccessKey(String s3AwsAccessKey) + { + this.s3AwsAccessKey = s3AwsAccessKey; + return this; + } + + public String getS3AwsSecretKey() + { + return s3AwsSecretKey; + } + + @Config("hive.s3.aws-secret-key") + @ConfigSecuritySensitive + public HiveS3Config 
setS3AwsSecretKey(String s3AwsSecretKey) + { + this.s3AwsSecretKey = s3AwsSecretKey; + return this; + } + + public String getS3Endpoint() + { + return s3Endpoint; + } + + @Config("hive.s3.endpoint") + public HiveS3Config setS3Endpoint(String s3Endpoint) + { + this.s3Endpoint = s3Endpoint; + return this; + } + + public PrestoS3SignerType getS3SignerType() + { + return s3SignerType; + } + + @Config("hive.s3.signer-type") + public HiveS3Config setS3SignerType(PrestoS3SignerType s3SignerType) + { + this.s3SignerType = s3SignerType; + return this; + } + + public boolean isS3PathStyleAccess() + { + return s3PathStyleAccess; + } + + @Config("hive.s3.path-style-access") + @ConfigDescription("Use path-style access for all request to S3") + public HiveS3Config setS3PathStyleAccess(boolean s3PathStyleAccess) + { + this.s3PathStyleAccess = s3PathStyleAccess; + return this; + } + + public boolean isS3UseInstanceCredentials() + { + return s3UseInstanceCredentials; + } + + @Config("hive.s3.use-instance-credentials") + public HiveS3Config setS3UseInstanceCredentials(boolean s3UseInstanceCredentials) + { + this.s3UseInstanceCredentials = s3UseInstanceCredentials; + return this; + } + + public String getS3IamRole() + { + return s3IamRole; + } + + @Config("hive.s3.iam-role") + @ConfigDescription("ARN of an IAM role to assume when connecting to the S3") + public HiveS3Config setS3IamRole(String s3IamRole) + { + this.s3IamRole = s3IamRole; + return this; + } + + public boolean isS3SslEnabled() + { + return s3SslEnabled; + } + + @Config("hive.s3.ssl.enabled") + public HiveS3Config setS3SslEnabled(boolean s3SslEnabled) + { + this.s3SslEnabled = s3SslEnabled; + return this; + } + + public String getS3EncryptionMaterialsProvider() + { + return s3EncryptionMaterialsProvider; + } + + @Config("hive.s3.encryption-materials-provider") + @ConfigDescription("Use a custom encryption materials provider for S3 data encryption") + public HiveS3Config setS3EncryptionMaterialsProvider(String s3EncryptionMaterialsProvider) + { + this.s3EncryptionMaterialsProvider = s3EncryptionMaterialsProvider; + return this; + } + + public String getS3KmsKeyId() + { + return s3KmsKeyId; + } + + @Config("hive.s3.kms-key-id") + @ConfigDescription("Use an AWS KMS key for S3 data encryption") + public HiveS3Config setS3KmsKeyId(String s3KmsKeyId) + { + this.s3KmsKeyId = s3KmsKeyId; + return this; + } + + public String getS3SseKmsKeyId() + { + return s3SseKmsKeyId; + } + + @Config("hive.s3.sse.kms-key-id") + @ConfigDescription("KMS Key ID to use for S3 server-side encryption with KMS-managed key") + public HiveS3Config setS3SseKmsKeyId(String s3SseKmsKeyId) + { + this.s3SseKmsKeyId = s3SseKmsKeyId; + return this; + } + + public boolean isS3SseEnabled() + { + return s3SseEnabled; + } + + @Config("hive.s3.sse.enabled") + @ConfigDescription("Enable S3 server side encryption") + public HiveS3Config setS3SseEnabled(boolean s3SseEnabled) + { + this.s3SseEnabled = s3SseEnabled; + return this; + } + + @NotNull + public PrestoS3SseType getS3SseType() + { + return s3SseType; + } + + @Config("hive.s3.sse.type") + @ConfigDescription("Key management type for S3 server-side encryption (S3 or KMS)") + public HiveS3Config setS3SseType(PrestoS3SseType s3SseType) + { + this.s3SseType = s3SseType; + return this; + } + + @Min(0) + public int getS3MaxClientRetries() + { + return s3MaxClientRetries; + } + + @Config("hive.s3.max-client-retries") + public HiveS3Config setS3MaxClientRetries(int s3MaxClientRetries) + { + this.s3MaxClientRetries = s3MaxClientRetries; + 
return this; + } + + @Min(0) + public int getS3MaxErrorRetries() + { + return s3MaxErrorRetries; + } + + @Config("hive.s3.max-error-retries") + public HiveS3Config setS3MaxErrorRetries(int s3MaxErrorRetries) + { + this.s3MaxErrorRetries = s3MaxErrorRetries; + return this; + } + + @MinDuration("1s") + @NotNull + public Duration getS3MaxBackoffTime() + { + return s3MaxBackoffTime; + } + + @Config("hive.s3.max-backoff-time") + public HiveS3Config setS3MaxBackoffTime(Duration s3MaxBackoffTime) + { + this.s3MaxBackoffTime = s3MaxBackoffTime; + return this; + } + + @MinDuration("1ms") + @NotNull + public Duration getS3MaxRetryTime() + { + return s3MaxRetryTime; + } + + @Config("hive.s3.max-retry-time") + public HiveS3Config setS3MaxRetryTime(Duration s3MaxRetryTime) + { + this.s3MaxRetryTime = s3MaxRetryTime; + return this; + } + + @MinDuration("1ms") + @NotNull + public Duration getS3ConnectTimeout() + { + return s3ConnectTimeout; + } + + @Config("hive.s3.connect-timeout") + public HiveS3Config setS3ConnectTimeout(Duration s3ConnectTimeout) + { + this.s3ConnectTimeout = s3ConnectTimeout; + return this; + } + + @MinDuration("1ms") + @NotNull + public Duration getS3SocketTimeout() + { + return s3SocketTimeout; + } + + @Config("hive.s3.socket-timeout") + public HiveS3Config setS3SocketTimeout(Duration s3SocketTimeout) + { + this.s3SocketTimeout = s3SocketTimeout; + return this; + } + + @Min(1) + public int getS3MaxConnections() + { + return s3MaxConnections; + } + + @Config("hive.s3.max-connections") + public HiveS3Config setS3MaxConnections(int s3MaxConnections) + { + this.s3MaxConnections = s3MaxConnections; + return this; + } + + @NotNull + public File getS3StagingDirectory() + { + return s3StagingDirectory; + } + + @Config("hive.s3.staging-directory") + @ConfigDescription("Temporary directory for staging files before uploading to S3") + public HiveS3Config setS3StagingDirectory(File s3StagingDirectory) + { + this.s3StagingDirectory = s3StagingDirectory; + return this; + } + + @NotNull + @MinDataSize("16MB") + public DataSize getS3MultipartMinFileSize() + { + return s3MultipartMinFileSize; + } + + @Config("hive.s3.multipart.min-file-size") + @ConfigDescription("Minimum file size for an S3 multipart upload") + public HiveS3Config setS3MultipartMinFileSize(DataSize size) + { + this.s3MultipartMinFileSize = size; + return this; + } + + @NotNull + @MinDataSize("5MB") + public DataSize getS3MultipartMinPartSize() + { + return s3MultipartMinPartSize; + } + + @Config("hive.s3.multipart.min-part-size") + @ConfigDescription("Minimum part size for an S3 multipart upload") + public HiveS3Config setS3MultipartMinPartSize(DataSize size) + { + this.s3MultipartMinPartSize = size; + return this; + } + + public boolean isPinS3ClientToCurrentRegion() + { + return pinS3ClientToCurrentRegion; + } + + @Config("hive.s3.pin-client-to-current-region") + @ConfigDescription("Should the S3 client be pinned to the current EC2 region") + public HiveS3Config setPinS3ClientToCurrentRegion(boolean pinS3ClientToCurrentRegion) + { + this.pinS3ClientToCurrentRegion = pinS3ClientToCurrentRegion; + return this; + } + + @NotNull + public String getS3UserAgentPrefix() + { + return s3UserAgentPrefix; + } + + @Config("hive.s3.user-agent-prefix") + @ConfigDescription("The user agent prefix to use for S3 calls") + public HiveS3Config setS3UserAgentPrefix(String s3UserAgentPrefix) + { + this.s3UserAgentPrefix = s3UserAgentPrefix; + return this; + } + + @NotNull + public PrestoS3AclType getS3AclType() + { + return s3AclType; + } + + 
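
Each @Config("hive.s3.*") annotation in this class names a catalog property; HiveS3Module (later in this patch) calls configBinder(binder).bindConfig(HiveS3Config.class), so airlift populates this bean from those properties when the catalog starts. A minimal sketch of the resulting fluent API follows; the class name and endpoint value are illustrative only and not part of this patch:

    import io.airlift.units.DataSize;
    import io.airlift.units.Duration;
    import io.prestosql.plugin.hive.s3.HiveS3Config;

    import java.util.concurrent.TimeUnit;

    import static io.airlift.units.DataSize.Unit.MEGABYTE;

    public final class HiveS3ConfigExample
    {
        private HiveS3ConfigExample()
        {
        }

        public static HiveS3Config exampleConfig()
        {
            // Every setter returns `this`, mirroring the catalog properties
            // hive.s3.endpoint, hive.s3.max-connections, hive.s3.connect-timeout
            // and hive.s3.multipart.min-part-size bound through @Config above.
            return new HiveS3Config()
                    .setS3Endpoint("https://s3.example.internal")  // illustrative endpoint
                    .setS3MaxConnections(200)
                    .setS3ConnectTimeout(new Duration(10, TimeUnit.SECONDS))
                    .setS3MultipartMinPartSize(new DataSize(8, MEGABYTE));
        }
    }
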
@Config("hive.s3.upload-acl-type") + @ConfigDescription("Canned ACL type for S3 uploads") + public HiveS3Config setS3AclType(PrestoS3AclType s3AclType) + { + this.s3AclType = s3AclType; + return this; + } + + public boolean isSkipGlacierObjects() + { + return skipGlacierObjects; + } + + @Config("hive.s3.skip-glacier-objects") + public HiveS3Config setSkipGlacierObjects(boolean skipGlacierObjects) + { + this.skipGlacierObjects = skipGlacierObjects; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Module.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Module.java new file mode 100644 index 00000000..848311d8 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/HiveS3Module.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.google.inject.Binder; +import com.google.inject.Scopes; +import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.prestosql.plugin.hive.HiveConfig; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.JavaUtils; + +import static com.google.inject.multibindings.Multibinder.newSetBinder; +import static io.airlift.configuration.ConfigBinder.configBinder; +import static org.weakref.jmx.guice.ExportBinder.newExporter; + +public class HiveS3Module + extends AbstractConfigurationAwareModule +{ + public static final String EMR_FS_CLASS_NAME = "com.amazon.ws.emr.hadoop.fs.EmrFileSystem"; + + @Override + protected void setup(Binder binder) + { + S3FileSystemType type = buildConfigObject(HiveConfig.class).getS3FileSystemType(); + if (type == S3FileSystemType.PRESTO) { + newSetBinder(binder, ConfigurationInitializer.class).addBinding().to(PrestoS3ConfigurationInitializer.class).in(Scopes.SINGLETON); + configBinder(binder).bindConfig(HiveS3Config.class); + + binder.bind(PrestoS3FileSystemStats.class).toInstance(PrestoS3FileSystem.getFileSystemStats()); + newExporter(binder).export(PrestoS3FileSystemStats.class) + .as(generator -> generator.generatedNameOf(PrestoS3FileSystem.class)); + } + else if (type == S3FileSystemType.EMRFS) { + validateEmrFsClass(); + newSetBinder(binder, ConfigurationInitializer.class).addBinding().to(EmrFsS3ConfigurationInitializer.class).in(Scopes.SINGLETON); + } + else { + throw new RuntimeException("Unknown file system type: " + type); + } + } + + private static void validateEmrFsClass() + { + // verify that the class exists + try { + Class.forName(EMR_FS_CLASS_NAME, true, JavaUtils.getClassLoader()); + } + catch (ClassNotFoundException e) { + throw new RuntimeException("EMR File System class not found: " + EMR_FS_CLASS_NAME, e); + } + } + + public static class EmrFsS3ConfigurationInitializer + implements ConfigurationInitializer + { + @Override + public void initializeConfiguration(Configuration config) + { + // re-map filesystem 
schemes to use the Amazon EMR file system + config.set("fs.s3.impl", EMR_FS_CLASS_NAME); + config.set("fs.s3a.impl", EMR_FS_CLASS_NAME); + config.set("fs.s3n.impl", EMR_FS_CLASS_NAME); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3AclType.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3AclType.java new file mode 100644 index 00000000..c261d97e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3AclType.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.services.s3.model.CannedAccessControlList; + +import static java.util.Objects.requireNonNull; + +public enum PrestoS3AclType +{ + AUTHENTICATED_READ(CannedAccessControlList.AuthenticatedRead), + AWS_EXEC_READ(CannedAccessControlList.AwsExecRead), + BUCKET_OWNER_FULL_CONTROL(CannedAccessControlList.BucketOwnerFullControl), + BUCKET_OWNER_READ(CannedAccessControlList.BucketOwnerRead), + LOG_DELIVERY_WRITE(CannedAccessControlList.LogDeliveryWrite), + PRIVATE(CannedAccessControlList.Private), + PUBLIC_READ(CannedAccessControlList.PublicRead), + PUBLIC_READ_WRITE(CannedAccessControlList.PublicReadWrite); + + private final CannedAccessControlList cannedACL; + + PrestoS3AclType(CannedAccessControlList cannedACL) + { + this.cannedACL = requireNonNull(cannedACL, "cannedACL is null"); + } + + CannedAccessControlList getCannedACL() + { + return cannedACL; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ClientFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ClientFactory.java new file mode 100644 index 00000000..dac94847 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ClientFactory.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.ClientConfiguration; +import com.amazonaws.Protocol; +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; +import com.amazonaws.auth.InstanceProfileCredentialsProvider; +import com.amazonaws.metrics.RequestMetricCollector; +import com.amazonaws.regions.Region; +import com.amazonaws.regions.Regions; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3Builder; +import com.amazonaws.services.s3.AmazonS3Client; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.HiveConfig; +import org.apache.hadoop.conf.Configuration; + +import javax.annotation.concurrent.GuardedBy; + +import java.net.URI; +import java.util.Optional; + +import static com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration; +import static com.amazonaws.regions.Regions.US_EAST_1; +import static com.google.common.base.Strings.isNullOrEmpty; +import static com.google.common.base.Verify.verify; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ENDPOINT; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_PIN_CLIENT_TO_CURRENT_REGION; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; + +/** + * This factory provides AmazonS3 client required for executing S3SelectPushdown requests. + * Normal S3 GET requests use AmazonS3 clients initialized in PrestoS3FileSystem or EMRFS. + * The ideal state will be to merge this logic with the two file systems and get rid of this + * factory class. + * Please do not use the client provided by this factory for any other use cases. 
+ */ +public class PrestoS3ClientFactory +{ + private static final String S3_ACCESS_KEY = "presto.s3.access-key"; + private static final String S3_SECRET_KEY = "presto.s3.secret-key"; + private static final String S3_CREDENTIALS_PROVIDER = "presto.s3.credentials-provider"; + private static final String S3_USE_INSTANCE_CREDENTIALS = "presto.s3.use-instance-credentials"; + private static final String S3_CONNECT_TIMEOUT = "presto.s3.connect-timeout"; + private static final String S3_SOCKET_TIMEOUT = "presto.s3.socket-timeout"; + private static final String S3_SSL_ENABLED = "presto.s3.ssl.enabled"; + private static final String S3_MAX_ERROR_RETRIES = "presto.s3.max-error-retries"; + private static final String S3_USER_AGENT_PREFIX = "presto.s3.user-agent-prefix"; + private static final String S3_SELECT_PUSHDOWN_MAX_CONNECTIONS = "hive.s3select-pushdown.max-connections"; + private static final String S3_USER_AGENT_SUFFIX = "presto"; + private static final String S3_USER_AGENT_SUFFIX_SELECT = "presto-select"; + + @GuardedBy("this") + private AmazonS3 s3Client; + + synchronized AmazonS3 getS3Client(Configuration config, HiveConfig hiveConfig) + { + if (s3Client != null) { + return s3Client; + } + + HiveS3Config defaults = new HiveS3Config(); + String userAgentPrefix = config.get(S3_USER_AGENT_PREFIX, defaults.getS3UserAgentPrefix()); + int maxErrorRetries = config.getInt(S3_MAX_ERROR_RETRIES, defaults.getS3MaxErrorRetries()); + boolean sslEnabled = config.getBoolean(S3_SSL_ENABLED, defaults.isS3SslEnabled()); + Duration connectTimeout = Duration.valueOf(config.get(S3_CONNECT_TIMEOUT, defaults.getS3ConnectTimeout().toString())); + Duration socketTimeout = Duration.valueOf(config.get(S3_SOCKET_TIMEOUT, defaults.getS3SocketTimeout().toString())); + int maxConnections = config.getInt(S3_SELECT_PUSHDOWN_MAX_CONNECTIONS, hiveConfig.getS3SelectPushdownMaxConnections()); + + ClientConfiguration clientConfiguration = new ClientConfiguration() + .withMaxErrorRetry(maxErrorRetries) + .withProtocol(sslEnabled ? Protocol.HTTPS : Protocol.HTTP) + .withConnectionTimeout(toIntExact(connectTimeout.toMillis())) + .withSocketTimeout(toIntExact(socketTimeout.toMillis())) + .withMaxConnections(maxConnections) + .withUserAgentPrefix(userAgentPrefix) + .withUserAgentSuffix(hiveConfig.isS3SelectPushdownEnabled() ? S3_USER_AGENT_SUFFIX_SELECT : S3_USER_AGENT_SUFFIX); + + PrestoS3FileSystemStats stats = new PrestoS3FileSystemStats(); + RequestMetricCollector metricCollector = new PrestoS3FileSystemMetricCollector(stats); + AWSCredentialsProvider awsCredentialsProvider = getAwsCredentialsProvider(config, defaults); + AmazonS3Builder, ? 
extends AmazonS3> clientBuilder = AmazonS3Client.builder() + .withCredentials(awsCredentialsProvider) + .withClientConfiguration(clientConfiguration) + .withMetricsCollector(metricCollector) + .enablePathStyleAccess(); + + boolean regionOrEndpointSet = false; + + String endpoint = config.get(S3_ENDPOINT); + boolean pinS3ClientToCurrentRegion = config.getBoolean(S3_PIN_CLIENT_TO_CURRENT_REGION, defaults.isPinS3ClientToCurrentRegion()); + verify(!pinS3ClientToCurrentRegion || endpoint == null, + "Invalid configuration: either endpoint can be set or S3 client can be pinned to the current region"); + + // use local region when running inside of EC2 + if (pinS3ClientToCurrentRegion) { + Region region = Regions.getCurrentRegion(); + if (region != null) { + clientBuilder.withRegion(region.getName()); + regionOrEndpointSet = true; + } + } + + if (!isNullOrEmpty(endpoint)) { + clientBuilder.withEndpointConfiguration(new EndpointConfiguration(endpoint, null)); + regionOrEndpointSet = true; + } + + if (!regionOrEndpointSet) { + clientBuilder.withRegion(US_EAST_1); + clientBuilder.setForceGlobalBucketAccessEnabled(true); + } + + s3Client = clientBuilder.build(); + return s3Client; + } + + private AWSCredentialsProvider getAwsCredentialsProvider(Configuration conf, HiveS3Config defaults) + { + Optional credentials = getAwsCredentials(conf); + if (credentials.isPresent()) { + return new AWSStaticCredentialsProvider(credentials.get()); + } + + boolean useInstanceCredentials = conf.getBoolean(S3_USE_INSTANCE_CREDENTIALS, defaults.isS3UseInstanceCredentials()); + if (useInstanceCredentials) { + return InstanceProfileCredentialsProvider.getInstance(); + } + + String providerClass = conf.get(S3_CREDENTIALS_PROVIDER); + if (!isNullOrEmpty(providerClass)) { + return getCustomAWSCredentialsProvider(conf, providerClass); + } + + return DefaultAWSCredentialsProviderChain.getInstance(); + } + + private static AWSCredentialsProvider getCustomAWSCredentialsProvider(Configuration conf, String providerClass) + { + try { + return conf.getClassByName(providerClass) + .asSubclass(AWSCredentialsProvider.class) + .getConstructor(URI.class, Configuration.class) + .newInstance(null, conf); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException(format("Error creating an instance of %s", providerClass), e); + } + } + + private static Optional getAwsCredentials(Configuration conf) + { + String accessKey = conf.get(S3_ACCESS_KEY); + String secretKey = conf.get(S3_SECRET_KEY); + + if (isNullOrEmpty(accessKey) || isNullOrEmpty(secretKey)) { + return Optional.empty(); + } + return Optional.of(new BasicAWSCredentials(accessKey, secretKey)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ConfigurationInitializer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ConfigurationInitializer.java new file mode 100644 index 00000000..8212c693 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3ConfigurationInitializer.java @@ -0,0 +1,167 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import org.apache.hadoop.conf.Configuration; + +import javax.inject.Inject; + +import java.io.File; + +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ACCESS_KEY; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ACL_TYPE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_CONNECT_TIMEOUT; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ENCRYPTION_MATERIALS_PROVIDER; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ENDPOINT; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_IAM_ROLE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_KMS_KEY_ID; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_BACKOFF_TIME; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_CLIENT_RETRIES; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_CONNECTIONS; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_ERROR_RETRIES; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_RETRY_TIME; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MULTIPART_MIN_FILE_SIZE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MULTIPART_MIN_PART_SIZE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_PATH_STYLE_ACCESS; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_PIN_CLIENT_TO_CURRENT_REGION; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SECRET_KEY; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SIGNER_TYPE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SKIP_GLACIER_OBJECTS; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SOCKET_TIMEOUT; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SSE_ENABLED; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SSE_KMS_KEY_ID; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SSE_TYPE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SSL_ENABLED; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_STAGING_DIRECTORY; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USER_AGENT_PREFIX; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USE_INSTANCE_CREDENTIALS; + +public class PrestoS3ConfigurationInitializer + implements ConfigurationInitializer +{ + private final String awsAccessKey; + private final String awsSecretKey; + private final String endpoint; + private final PrestoS3SignerType signerType; + private final boolean pathStyleAccess; + private final boolean useInstanceCredentials; + private final String iamRole; + private final boolean sslEnabled; + private final boolean sseEnabled; + private final PrestoS3SseType sseType; + private final String encryptionMaterialsProvider; + private final String kmsKeyId; + private final String sseKmsKeyId; + private final int maxClientRetries; + private final int maxErrorRetries; + private 
final Duration maxBackoffTime; + private final Duration maxRetryTime; + private final Duration connectTimeout; + private final Duration socketTimeout; + private final int maxConnections; + private final DataSize multipartMinFileSize; + private final DataSize multipartMinPartSize; + private final File stagingDirectory; + private final boolean pinClientToCurrentRegion; + private final String userAgentPrefix; + private final PrestoS3AclType aclType; + private boolean skipGlacierObjects; + + @Inject + public PrestoS3ConfigurationInitializer(HiveS3Config config) + { + this.awsAccessKey = config.getS3AwsAccessKey(); + this.awsSecretKey = config.getS3AwsSecretKey(); + this.endpoint = config.getS3Endpoint(); + this.signerType = config.getS3SignerType(); + this.pathStyleAccess = config.isS3PathStyleAccess(); + this.useInstanceCredentials = config.isS3UseInstanceCredentials(); + this.iamRole = config.getS3IamRole(); + this.sslEnabled = config.isS3SslEnabled(); + this.sseEnabled = config.isS3SseEnabled(); + this.sseType = config.getS3SseType(); + this.encryptionMaterialsProvider = config.getS3EncryptionMaterialsProvider(); + this.kmsKeyId = config.getS3KmsKeyId(); + this.sseKmsKeyId = config.getS3SseKmsKeyId(); + this.maxClientRetries = config.getS3MaxClientRetries(); + this.maxErrorRetries = config.getS3MaxErrorRetries(); + this.maxBackoffTime = config.getS3MaxBackoffTime(); + this.maxRetryTime = config.getS3MaxRetryTime(); + this.connectTimeout = config.getS3ConnectTimeout(); + this.socketTimeout = config.getS3SocketTimeout(); + this.maxConnections = config.getS3MaxConnections(); + this.multipartMinFileSize = config.getS3MultipartMinFileSize(); + this.multipartMinPartSize = config.getS3MultipartMinPartSize(); + this.stagingDirectory = config.getS3StagingDirectory(); + this.pinClientToCurrentRegion = config.isPinS3ClientToCurrentRegion(); + this.userAgentPrefix = config.getS3UserAgentPrefix(); + this.aclType = config.getS3AclType(); + this.skipGlacierObjects = config.isSkipGlacierObjects(); + } + + @Override + public void initializeConfiguration(Configuration config) + { + // re-map filesystem schemes to match Amazon Elastic MapReduce + config.set("fs.s3.impl", PrestoS3FileSystem.class.getName()); + config.set("fs.s3a.impl", PrestoS3FileSystem.class.getName()); + config.set("fs.s3n.impl", PrestoS3FileSystem.class.getName()); + + if (awsAccessKey != null) { + config.set(S3_ACCESS_KEY, awsAccessKey); + } + if (awsSecretKey != null) { + config.set(S3_SECRET_KEY, awsSecretKey); + } + if (endpoint != null) { + config.set(S3_ENDPOINT, endpoint); + } + if (signerType != null) { + config.set(S3_SIGNER_TYPE, signerType.name()); + } + config.setBoolean(S3_PATH_STYLE_ACCESS, pathStyleAccess); + config.setBoolean(S3_USE_INSTANCE_CREDENTIALS, useInstanceCredentials); + if (iamRole != null) { + config.set(S3_IAM_ROLE, iamRole); + } + config.setBoolean(S3_SSL_ENABLED, sslEnabled); + config.setBoolean(S3_SSE_ENABLED, sseEnabled); + config.set(S3_SSE_TYPE, sseType.name()); + if (encryptionMaterialsProvider != null) { + config.set(S3_ENCRYPTION_MATERIALS_PROVIDER, encryptionMaterialsProvider); + } + if (kmsKeyId != null) { + config.set(S3_KMS_KEY_ID, kmsKeyId); + } + if (sseKmsKeyId != null) { + config.set(S3_SSE_KMS_KEY_ID, sseKmsKeyId); + } + config.setInt(S3_MAX_CLIENT_RETRIES, maxClientRetries); + config.setInt(S3_MAX_ERROR_RETRIES, maxErrorRetries); + config.set(S3_MAX_BACKOFF_TIME, maxBackoffTime.toString()); + config.set(S3_MAX_RETRY_TIME, maxRetryTime.toString()); + config.set(S3_CONNECT_TIMEOUT, 
connectTimeout.toString()); + config.set(S3_SOCKET_TIMEOUT, socketTimeout.toString()); + config.set(S3_STAGING_DIRECTORY, stagingDirectory.toString()); + config.setInt(S3_MAX_CONNECTIONS, maxConnections); + config.setLong(S3_MULTIPART_MIN_FILE_SIZE, multipartMinFileSize.toBytes()); + config.setLong(S3_MULTIPART_MIN_PART_SIZE, multipartMinPartSize.toBytes()); + config.setBoolean(S3_PIN_CLIENT_TO_CURRENT_REGION, pinClientToCurrentRegion); + config.set(S3_USER_AGENT_PREFIX, userAgentPrefix); + config.set(S3_ACL_TYPE, aclType.name()); + config.setBoolean(S3_SKIP_GLACIER_OBJECTS, skipGlacierObjects); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3Constants.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3Constants.java new file mode 100644 index 00000000..ea52b76b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3Constants.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.services.s3.model.KMSEncryptionMaterialsProvider; +import com.amazonaws.services.s3.model.SimpleMaterialProvider; +import com.amazonaws.services.s3.model.StaticEncryptionMaterialsProvider; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class PrestoS3Constants +{ + /** + * EncryptionMaterialsProvider Implementation List + */ + public static final List ENCRYPTIONMATERIALSPROVIDER_IMPL_LIST = Collections.unmodifiableList(new ArrayList() { + { + this.add("io.prestosql.plugin.hive.s3.TestPrestoS3FileSystem$TestEncryptionMaterialsProvider"); + this.add(KMSEncryptionMaterialsProvider.class.getName()); + this.add(SimpleMaterialProvider.class.getName()); + this.add(StaticEncryptionMaterialsProvider.class.getName()); + } + }); + + private PrestoS3Constants() + { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystem.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystem.java new file mode 100644 index 00000000..861e7642 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystem.java @@ -0,0 +1,1285 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.AbortedException; +import com.amazonaws.AmazonClientException; +import com.amazonaws.ClientConfiguration; +import com.amazonaws.Protocol; +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; +import com.amazonaws.auth.InstanceProfileCredentialsProvider; +import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; +import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration; +import com.amazonaws.event.ProgressEvent; +import com.amazonaws.event.ProgressEventType; +import com.amazonaws.event.ProgressListener; +import com.amazonaws.regions.Region; +import com.amazonaws.regions.Regions; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.AmazonS3Builder; +import com.amazonaws.services.s3.AmazonS3Client; +import com.amazonaws.services.s3.AmazonS3EncryptionClient; +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.CannedAccessControlList; +import com.amazonaws.services.s3.model.EncryptionMaterialsProvider; +import com.amazonaws.services.s3.model.GetObjectRequest; +import com.amazonaws.services.s3.model.KMSEncryptionMaterialsProvider; +import com.amazonaws.services.s3.model.ListObjectsRequest; +import com.amazonaws.services.s3.model.ObjectListing; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.PutObjectRequest; +import com.amazonaws.services.s3.model.S3ObjectInputStream; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.amazonaws.services.s3.model.SSEAwsKeyManagementParams; +import com.amazonaws.services.s3.transfer.Transfer; +import com.amazonaws.services.s3.transfer.TransferManager; +import com.amazonaws.services.s3.transfer.TransferManagerBuilder; +import com.amazonaws.services.s3.transfer.Upload; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.AbstractSequentialIterator; +import com.google.common.collect.Iterators; +import com.google.common.io.Closer; +import io.airlift.log.Logger; +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.util.RetryDriver; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.BufferedFSInputStream; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FSInputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.Progressable; + +import java.io.BufferedOutputStream; +import java.io.ByteArrayInputStream; +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FilterOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InterruptedIOException; +import 
java.io.UncheckedIOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; + +import static com.amazonaws.regions.Regions.US_EAST_1; +import static com.amazonaws.services.s3.Headers.SERVER_SIDE_ENCRYPTION; +import static com.amazonaws.services.s3.Headers.UNENCRYPTED_CONTENT_LENGTH; +import static com.amazonaws.services.s3.model.StorageClass.Glacier; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkPositionIndexes; +import static com.google.common.base.Strings.isNullOrEmpty; +import static com.google.common.base.Strings.nullToEmpty; +import static com.google.common.base.Throwables.throwIfInstanceOf; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.Iterables.toArray; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.plugin.hive.s3.PrestoS3Constants.ENCRYPTIONMATERIALSPROVIDER_IMPL_LIST; +import static java.lang.Math.max; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.net.HttpURLConnection.HTTP_BAD_REQUEST; +import static java.net.HttpURLConnection.HTTP_FORBIDDEN; +import static java.net.HttpURLConnection.HTTP_NOT_FOUND; +import static java.nio.file.Files.createDirectories; +import static java.nio.file.Files.createTempFile; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.hadoop.fs.FSExceptionMessages.CANNOT_SEEK_PAST_EOF; +import static org.apache.hadoop.fs.FSExceptionMessages.NEGATIVE_SEEK; +import static org.apache.hadoop.fs.FSExceptionMessages.STREAM_IS_CLOSED; + +public class PrestoS3FileSystem + extends FileSystem +{ + public static final String S3_USER_AGENT_SUFFIX = "presto"; + public static final String S3_USER_AGENT_PREFIX = "presto.s3.user-agent-prefix"; + public static final String S3_CREDENTIALS_PROVIDER = "presto.s3.credentials-provider"; + public static final String S3_SSE_TYPE = "presto.s3.sse.type"; + public static final String S3_SSE_ENABLED = "presto.s3.sse.enabled"; + public static final String S3_SSE_KMS_KEY_ID = "presto.s3.sse.kms-key-id"; + public static final String S3_KMS_KEY_ID = "presto.s3.kms-key-id"; + public static final String S3_ENCRYPTION_MATERIALS_PROVIDER = "presto.s3.encryption-materials-provider"; + public static final String S3_PIN_CLIENT_TO_CURRENT_REGION = "presto.s3.pin-client-to-current-region"; + public static final String S3_USE_INSTANCE_CREDENTIALS = "presto.s3.use-instance-credentials"; + public static final String S3_MULTIPART_MIN_PART_SIZE = "presto.s3.multipart.min-part-size"; + public static final String S3_MULTIPART_MIN_FILE_SIZE = "presto.s3.multipart.min-file-size"; + public static final String S3_STAGING_DIRECTORY = "presto.s3.staging-directory"; + public static final String S3_MAX_CONNECTIONS = "presto.s3.max-connections"; + public static final String S3_SOCKET_TIMEOUT = "presto.s3.socket-timeout"; + public static final String S3_CONNECT_TIMEOUT = "presto.s3.connect-timeout"; + public static final String S3_MAX_RETRY_TIME = "presto.s3.max-retry-time"; + public static final String S3_MAX_BACKOFF_TIME = "presto.s3.max-backoff-time"; + public static final String S3_MAX_CLIENT_RETRIES = 
"presto.s3.max-client-retries"; + public static final String S3_MAX_ERROR_RETRIES = "presto.s3.max-error-retries"; + public static final String S3_SSL_ENABLED = "presto.s3.ssl.enabled"; + public static final String S3_PATH_STYLE_ACCESS = "presto.s3.path-style-access"; + public static final String S3_SIGNER_TYPE = "presto.s3.signer-type"; + public static final String S3_ENDPOINT = "presto.s3.endpoint"; + public static final String S3_SECRET_KEY = "presto.s3.secret-key"; + public static final String S3_ACCESS_KEY = "presto.s3.access-key"; + public static final String S3_IAM_ROLE = "presto.s3.iam-role"; + public static final String S3_ACL_TYPE = "presto.s3.upload-acl-type"; + public static final String S3_SKIP_GLACIER_OBJECTS = "presto.s3.skip-glacier-objects"; + + static final String S3_DIRECTORY_OBJECT_CONTENT_TYPE = "application/x-directory"; + + private static final Logger log = Logger.get(PrestoS3FileSystem.class); + private static final PrestoS3FileSystemStats STATS = new PrestoS3FileSystemStats(); + private static final PrestoS3FileSystemMetricCollector METRIC_COLLECTOR = new PrestoS3FileSystemMetricCollector(STATS); + private static final String DIRECTORY_SUFFIX = "_$folder$"; + private static final DataSize BLOCK_SIZE = new DataSize(32, MEGABYTE); + private static final DataSize MAX_SKIP_SIZE = new DataSize(1, MEGABYTE); + private static final String PATH_SEPARATOR = "/"; + private static final Duration BACKOFF_MIN_SLEEP = new Duration(1, SECONDS); + private static final int HTTP_RANGE_NOT_SATISFIABLE = 416; + + private URI uri; + private Path workingDirectory; + private AmazonS3 s3; + private AWSCredentialsProvider credentialsProvider; + private File stagingDirectory; + private int maxAttempts; + private Duration maxBackoffTime; + private Duration maxRetryTime; + private boolean useInstanceCredentials; + private String iamRole; + private boolean pinS3ClientToCurrentRegion; + private boolean sseEnabled; + private PrestoS3SseType sseType; + private String sseKmsKeyId; + private boolean isPathStyleAccess; + private long multiPartUploadMinFileSize; + private long multiPartUploadMinPartSize; + private PrestoS3AclType s3AclType; + private boolean skipGlacierObjects; + + @Override + public void initialize(URI uri, Configuration conf) + throws IOException + { + requireNonNull(uri, "uri is null"); + requireNonNull(conf, "conf is null"); + super.initialize(uri, conf); + setConf(conf); + + this.uri = URI.create(uri.getScheme() + "://" + uri.getAuthority()); + this.workingDirectory = new Path(PATH_SEPARATOR).makeQualified(this.uri, new Path(PATH_SEPARATOR)); + + HiveS3Config defaults = new HiveS3Config(); + this.stagingDirectory = new File(conf.get(S3_STAGING_DIRECTORY, defaults.getS3StagingDirectory().toString())); + this.maxAttempts = conf.getInt(S3_MAX_CLIENT_RETRIES, defaults.getS3MaxClientRetries()) + 1; + this.maxBackoffTime = Duration.valueOf(conf.get(S3_MAX_BACKOFF_TIME, defaults.getS3MaxBackoffTime().toString())); + this.maxRetryTime = Duration.valueOf(conf.get(S3_MAX_RETRY_TIME, defaults.getS3MaxRetryTime().toString())); + int maxErrorRetries = conf.getInt(S3_MAX_ERROR_RETRIES, defaults.getS3MaxErrorRetries()); + boolean sslEnabled = conf.getBoolean(S3_SSL_ENABLED, defaults.isS3SslEnabled()); + Duration connectTimeout = Duration.valueOf(conf.get(S3_CONNECT_TIMEOUT, defaults.getS3ConnectTimeout().toString())); + Duration socketTimeout = Duration.valueOf(conf.get(S3_SOCKET_TIMEOUT, defaults.getS3SocketTimeout().toString())); + int maxConnections = conf.getInt(S3_MAX_CONNECTIONS, 
defaults.getS3MaxConnections()); + this.multiPartUploadMinFileSize = conf.getLong(S3_MULTIPART_MIN_FILE_SIZE, defaults.getS3MultipartMinFileSize().toBytes()); + this.multiPartUploadMinPartSize = conf.getLong(S3_MULTIPART_MIN_PART_SIZE, defaults.getS3MultipartMinPartSize().toBytes()); + this.isPathStyleAccess = conf.getBoolean(S3_PATH_STYLE_ACCESS, defaults.isS3PathStyleAccess()); + this.useInstanceCredentials = conf.getBoolean(S3_USE_INSTANCE_CREDENTIALS, defaults.isS3UseInstanceCredentials()); + this.iamRole = conf.get(S3_IAM_ROLE, defaults.getS3IamRole()); + verify(!(useInstanceCredentials && this.iamRole != null), + "Invalid configuration: either use instance credentials or specify an iam role"); + this.pinS3ClientToCurrentRegion = conf.getBoolean(S3_PIN_CLIENT_TO_CURRENT_REGION, defaults.isPinS3ClientToCurrentRegion()); + verify((pinS3ClientToCurrentRegion && conf.get(S3_ENDPOINT) == null) || !pinS3ClientToCurrentRegion, + "Invalid configuration: either endpoint can be set or S3 client can be pinned to the current region"); + this.sseEnabled = conf.getBoolean(S3_SSE_ENABLED, defaults.isS3SseEnabled()); + this.sseType = PrestoS3SseType.valueOf(conf.get(S3_SSE_TYPE, defaults.getS3SseType().name())); + this.sseKmsKeyId = conf.get(S3_SSE_KMS_KEY_ID, defaults.getS3SseKmsKeyId()); + this.s3AclType = PrestoS3AclType.valueOf(conf.get(S3_ACL_TYPE, defaults.getS3AclType().name())); + String userAgentPrefix = conf.get(S3_USER_AGENT_PREFIX, defaults.getS3UserAgentPrefix()); + this.skipGlacierObjects = conf.getBoolean(S3_SKIP_GLACIER_OBJECTS, defaults.isSkipGlacierObjects()); + + ClientConfiguration configuration = new ClientConfiguration() + .withMaxErrorRetry(maxErrorRetries) + .withProtocol(sslEnabled ? Protocol.HTTPS : Protocol.HTTP) + .withConnectionTimeout(toIntExact(connectTimeout.toMillis())) + .withSocketTimeout(toIntExact(socketTimeout.toMillis())) + .withMaxConnections(maxConnections) + .withUserAgentPrefix(userAgentPrefix) + .withUserAgentSuffix(S3_USER_AGENT_SUFFIX); + + this.credentialsProvider = createAwsCredentialsProvider(uri, conf); + this.s3 = createAmazonS3Client(conf, configuration); + } + + @Override + public void close() + throws IOException + { + try (Closer closer = Closer.create()) { + closer.register(super::close); + if (credentialsProvider instanceof Closeable) { + closer.register((Closeable) credentialsProvider); + } + closer.register(s3::shutdown); + } + } + + @Override + public URI getUri() + { + return uri; + } + + @Override + public Path getWorkingDirectory() + { + return workingDirectory; + } + + @Override + public void setWorkingDirectory(Path path) + { + workingDirectory = path; + } + + @Override + public FileStatus[] listStatus(Path path) + throws IOException + { + STATS.newListStatusCall(); + List list = new ArrayList<>(); + RemoteIterator iterator = listLocatedStatus(path); + while (iterator.hasNext()) { + list.add(iterator.next()); + } + return toArray(list, LocatedFileStatus.class); + } + + @Override + public RemoteIterator listLocatedStatus(Path path) + { + STATS.newListLocatedStatusCall(); + return new RemoteIterator() + { + private final Iterator iterator = listPrefix(path); + + @Override + public boolean hasNext() + throws IOException + { + try { + return iterator.hasNext(); + } + catch (AmazonClientException e) { + throw new IOException(e); + } + } + + @Override + public LocatedFileStatus next() + throws IOException + { + try { + return iterator.next(); + } + catch (AmazonClientException e) { + throw new IOException(e); + } + } + }; + } + + @Override + 
public FileStatus getFileStatus(Path path) + throws IOException + { + if (path.getName().isEmpty()) { + // the bucket root requires special handling + if (getS3ObjectMetadata(path) != null) { + return new FileStatus(0, true, 1, 0, 0, qualifiedPath(path)); + } + throw new FileNotFoundException("File does not exist: " + path); + } + + ObjectMetadata metadata = getS3ObjectMetadata(path); + + if (metadata == null) { + // check if this path is a directory + Iterator iterator = listPrefix(path); + if (iterator.hasNext()) { + return new FileStatus(0, true, 1, 0, 0, qualifiedPath(path)); + } + throw new FileNotFoundException("File does not exist: " + path); + } + + return new FileStatus( + getObjectSize(path, metadata), + S3_DIRECTORY_OBJECT_CONTENT_TYPE.equals(metadata.getContentType()), + 1, + BLOCK_SIZE.toBytes(), + lastModifiedTime(metadata), + qualifiedPath(path)); + } + + private static long getObjectSize(Path path, ObjectMetadata metadata) + throws IOException + { + Map userMetadata = metadata.getUserMetadata(); + String length = userMetadata.get(UNENCRYPTED_CONTENT_LENGTH); + if (userMetadata.containsKey(SERVER_SIDE_ENCRYPTION) && length == null) { + throw new IOException(format("%s header is not set on an encrypted object: %s", UNENCRYPTED_CONTENT_LENGTH, path)); + } + return (length != null) ? Long.parseLong(length) : metadata.getContentLength(); + } + + @Override + public FSDataInputStream open(Path path, int bufferSize) + { + return new FSDataInputStream( + new BufferedFSInputStream( + new PrestoS3InputStream(s3, getBucketName(uri), path, maxAttempts, maxBackoffTime, maxRetryTime), + bufferSize)); + } + + @Override + public FSDataOutputStream create(Path path, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) + throws IOException + { + // Ignore the overwrite flag, since Presto always writes to unique file names. + // Checking for file existence can break read-after-write consistency. 
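+        // Data is first written to a temporary file under the staging directory and is
+        // uploaded to S3 only when the output stream is closed (see PrestoS3OutputStream below).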
+ + if (!stagingDirectory.exists()) { + createDirectories(stagingDirectory.toPath()); + } + if (!stagingDirectory.isDirectory()) { + throw new IOException("Configured staging path is not a directory: " + stagingDirectory); + } + File tempFile = createTempFile(stagingDirectory.toPath(), "presto-s3-", ".tmp").toFile(); + + String key = keyFromPath(qualifiedPath(path)); + return new FSDataOutputStream( + new PrestoS3OutputStream(s3, getBucketName(uri), key, tempFile, sseEnabled, sseType, sseKmsKeyId, multiPartUploadMinFileSize, multiPartUploadMinPartSize, s3AclType), + statistics); + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) + { + throw new UnsupportedOperationException("append"); + } + + @Override + public boolean rename(Path src, Path dst) + throws IOException + { + boolean srcDirectory; + try { + srcDirectory = directory(src); + } + catch (FileNotFoundException e) { + return false; + } + + try { + if (!directory(dst)) { + // cannot copy a file to an existing file + return false; + } + // move source under destination directory + dst = new Path(dst, src.getName()); + } + catch (FileNotFoundException e) { + // destination does not exist + } + + if (keysEqual(src, dst)) { + return false; + } + + if (srcDirectory) { + for (FileStatus file : listStatus(src)) { + rename(file.getPath(), new Path(dst, file.getPath().getName())); + } + deleteObject(keyFromPath(src) + DIRECTORY_SUFFIX); + } + else { + s3.copyObject(getBucketName(uri), keyFromPath(src), getBucketName(uri), keyFromPath(dst)); + delete(src, true); + } + + return true; + } + + @Override + public boolean delete(Path path, boolean recursive) + throws IOException + { + try { + if (!directory(path)) { + return deleteObject(keyFromPath(path)); + } + } + catch (FileNotFoundException e) { + return false; + } + + if (!recursive) { + throw new IOException("Directory " + path + " is not empty"); + } + + for (FileStatus file : listStatus(path)) { + delete(file.getPath(), true); + } + deleteObject(keyFromPath(path) + DIRECTORY_SUFFIX); + + return true; + } + + private boolean directory(Path path) + throws IOException + { + return getFileStatus(path).isDirectory(); + } + + private boolean deleteObject(String key) + { + try { + s3.deleteObject(getBucketName(uri), key); + return true; + } + catch (AmazonClientException e) { + return false; + } + } + + @Override + public boolean mkdirs(Path f, FsPermission permission) + { + // no need to do anything for S3 + return true; + } + + private Iterator listPrefix(Path path) + { + String key = keyFromPath(path); + if (!key.isEmpty()) { + key += PATH_SEPARATOR; + } + + ListObjectsRequest request = new ListObjectsRequest() + .withBucketName(getBucketName(uri)) + .withPrefix(key) + .withDelimiter(PATH_SEPARATOR); + + STATS.newListObjectsCall(); + Iterator listings = new AbstractSequentialIterator(s3.listObjects(request)) + { + @Override + protected ObjectListing computeNext(ObjectListing previous) + { + if (!previous.isTruncated()) { + return null; + } + return s3.listNextBatchOfObjects(previous); + } + }; + + return Iterators.concat(Iterators.transform(listings, this::statusFromListing)); + } + + private Iterator statusFromListing(ObjectListing listing) + { + return Iterators.concat( + statusFromPrefixes(listing.getCommonPrefixes()), + statusFromObjects(listing.getObjectSummaries())); + } + + private Iterator statusFromPrefixes(List prefixes) + { + List list = new ArrayList<>(); + for (String prefix : prefixes) { + Path path = qualifiedPath(new 
Path(PATH_SEPARATOR + prefix)); + FileStatus status = new FileStatus(0, true, 1, 0, 0, path); + list.add(createLocatedFileStatus(status)); + } + return list.iterator(); + } + + private Iterator statusFromObjects(List objects) + { + // NOTE: for encrypted objects, S3ObjectSummary.size() used below is NOT correct, + // however, to get the correct size we'd need to make an additional request to get + // user metadata, and in this case it doesn't matter. + return objects.stream() + .filter(object -> !object.getKey().endsWith(PATH_SEPARATOR)) + .filter(object -> !skipGlacierObjects || !isGlacierObject(object)) + .map(object -> new FileStatus( + object.getSize(), + false, + 1, + BLOCK_SIZE.toBytes(), + object.getLastModified().getTime(), + qualifiedPath(new Path(PATH_SEPARATOR + object.getKey())))) + .map(this::createLocatedFileStatus) + .iterator(); + } + + private boolean isGlacierObject(S3ObjectSummary object) + { + return Glacier.toString().equals(object.getStorageClass()); + } + + /** + * This exception is for stopping retries for S3 calls that shouldn't be retried. + * For example, "Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403 ..." + */ + @VisibleForTesting + static class UnrecoverableS3OperationException + extends IOException + { + public UnrecoverableS3OperationException(Path path, Throwable cause) + { + // append the path info to the message + super(format("%s (Path: %s)", cause, path), cause); + } + } + + @VisibleForTesting + ObjectMetadata getS3ObjectMetadata(Path path) + throws IOException + { + String bucketName = getBucketName(uri); + String key = keyFromPath(path); + ObjectMetadata s3ObjectMetadata = getS3ObjectMetadata(path, bucketName, key); + if (s3ObjectMetadata == null && !key.isEmpty()) { + return getS3ObjectMetadata(path, bucketName, key + PATH_SEPARATOR); + } + return s3ObjectMetadata; + } + + private ObjectMetadata getS3ObjectMetadata(Path path, String bucketName, String key) + throws IOException + { + try { + return RetryDriver.retry() + .maxAttempts(maxAttempts) + .exponentialBackoff(BACKOFF_MIN_SLEEP, maxBackoffTime, maxRetryTime, 2.0) + .stopOn(InterruptedException.class, UnrecoverableS3OperationException.class) + .onRetry(STATS::newGetMetadataRetry) + .run("getS3ObjectMetadata", () -> { + try { + STATS.newMetadataCall(); + return s3.getObjectMetadata(bucketName, key); + } + catch (RuntimeException e) { + STATS.newGetMetadataError(); + if (e instanceof AmazonS3Exception) { + switch (((AmazonS3Exception) e).getStatusCode()) { + case HTTP_NOT_FOUND: + return null; + case HTTP_FORBIDDEN: + case HTTP_BAD_REQUEST: + throw new UnrecoverableS3OperationException(path, e); + } + } + throw e; + } + }); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException(e); + } + catch (Exception e) { + throwIfInstanceOf(e, IOException.class); + throwIfUnchecked(e); + throw new RuntimeException(e); + } + } + + private Path qualifiedPath(Path path) + { + return path.makeQualified(this.uri, getWorkingDirectory()); + } + + private LocatedFileStatus createLocatedFileStatus(FileStatus status) + { + try { + BlockLocation[] fakeLocation = getFileBlockLocations(status, 0, status.getLen()); + return new LocatedFileStatus(status, fakeLocation); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static long lastModifiedTime(ObjectMetadata metadata) + { + Date date = metadata.getLastModified(); + return (date != null) ? 
date.getTime() : 0; + } + + private static boolean keysEqual(Path p1, Path p2) + { + return keyFromPath(p1).equals(keyFromPath(p2)); + } + + public static String keyFromPath(Path path) + { + checkArgument(path.isAbsolute(), "Path is not absolute: %s", path); + String key = nullToEmpty(path.toUri().getPath()); + if (key.startsWith(PATH_SEPARATOR)) { + key = key.substring(PATH_SEPARATOR.length()); + } + if (key.endsWith(PATH_SEPARATOR)) { + key = key.substring(0, key.length() - PATH_SEPARATOR.length()); + } + return key; + } + + private AmazonS3 createAmazonS3Client(Configuration hadoopConfig, ClientConfiguration clientConfig) + { + Optional encryptionMaterialsProvider = createEncryptionMaterialsProvider(hadoopConfig); + AmazonS3Builder clientBuilder; + + String signerType = hadoopConfig.get(S3_SIGNER_TYPE); + if (signerType != null) { + clientConfig.withSignerOverride(signerType); + } + + if (encryptionMaterialsProvider.isPresent()) { + clientBuilder = AmazonS3EncryptionClient.encryptionBuilder() + .withCredentials(credentialsProvider) + .withEncryptionMaterials(encryptionMaterialsProvider.get()) + .withClientConfiguration(clientConfig) + .withMetricsCollector(METRIC_COLLECTOR); + } + else { + clientBuilder = AmazonS3Client.builder() + .withCredentials(credentialsProvider) + .withClientConfiguration(clientConfig) + .withMetricsCollector(METRIC_COLLECTOR); + } + + boolean regionOrEndpointSet = false; + + // use local region when running inside of EC2 + if (pinS3ClientToCurrentRegion) { + Region region = Regions.getCurrentRegion(); + if (region != null) { + clientBuilder = clientBuilder.withRegion(region.getName()); + regionOrEndpointSet = true; + } + } + + String endpoint = hadoopConfig.get(S3_ENDPOINT); + if (endpoint != null) { + clientBuilder = clientBuilder.withEndpointConfiguration(new EndpointConfiguration(endpoint, null)); + regionOrEndpointSet = true; + } + + if (isPathStyleAccess) { + clientBuilder = clientBuilder.enablePathStyleAccess(); + } + + if (!regionOrEndpointSet) { + clientBuilder = clientBuilder.withRegion(US_EAST_1); + clientBuilder.setForceGlobalBucketAccessEnabled(true); + } + + return clientBuilder.build(); + } + + private static Optional createEncryptionMaterialsProvider(Configuration hadoopConfig) + { + String kmsKeyId = hadoopConfig.get(S3_KMS_KEY_ID); + if (kmsKeyId != null) { + return Optional.of(new KMSEncryptionMaterialsProvider(kmsKeyId)); + } + + String empClassName = hadoopConfig.get(S3_ENCRYPTION_MATERIALS_PROVIDER); + if (empClassName == null) { + return Optional.empty(); + } + + try { + if (!ENCRYPTIONMATERIALSPROVIDER_IMPL_LIST.contains(empClassName)) { + throw new RuntimeException("Invalid provider class: " + empClassName); + } + Object instance = Class.forName(empClassName).getConstructor().newInstance(); + if (!(instance instanceof EncryptionMaterialsProvider)) { + throw new RuntimeException("Invalid encryption materials provider class: " + instance.getClass().getName()); + } + EncryptionMaterialsProvider emp = (EncryptionMaterialsProvider) instance; + if (emp instanceof Configurable) { + ((Configurable) emp).setConf(hadoopConfig); + } + return Optional.of(emp); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException("Unable to load or create S3 encryption materials provider: " + empClassName, e); + } + } + + private AWSCredentialsProvider createAwsCredentialsProvider(URI uri, Configuration conf) + { + Optional credentials = getAwsCredentials(uri, conf); + if (credentials.isPresent()) { + return new 
AWSStaticCredentialsProvider(credentials.get()); + } + + if (useInstanceCredentials) { + return InstanceProfileCredentialsProvider.getInstance(); + } + + if (iamRole != null) { + return new STSAssumeRoleSessionCredentialsProvider.Builder(this.iamRole, "presto-session").build(); + } + + String providerClass = conf.get(S3_CREDENTIALS_PROVIDER); + if (!isNullOrEmpty(providerClass)) { + return getCustomAWSCredentialsProvider(uri, conf, providerClass); + } + + return DefaultAWSCredentialsProviderChain.getInstance(); + } + + private static AWSCredentialsProvider getCustomAWSCredentialsProvider(URI uri, Configuration conf, String providerClass) + { + try { + log.debug("Using AWS credential provider %s for URI %s", providerClass, uri); + return conf.getClassByName(providerClass) + .asSubclass(AWSCredentialsProvider.class) + .getConstructor(URI.class, Configuration.class) + .newInstance(uri, conf); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException(format("Error creating an instance of %s for URI %s", providerClass, uri), e); + } + } + + private static Optional getAwsCredentials(URI uri, Configuration conf) + { + String accessKey = conf.get(S3_ACCESS_KEY); + String secretKey = conf.get(S3_SECRET_KEY); + + String userInfo = uri.getUserInfo(); + if (userInfo != null) { + int index = userInfo.indexOf(':'); + if (index < 0) { + accessKey = userInfo; + } + else { + accessKey = userInfo.substring(0, index); + secretKey = userInfo.substring(index + 1); + } + } + + if (isNullOrEmpty(accessKey) || isNullOrEmpty(secretKey)) { + return Optional.empty(); + } + return Optional.of(new BasicAWSCredentials(accessKey, secretKey)); + } + + private static class PrestoS3InputStream + extends FSInputStream + { + private final AmazonS3 s3; + private final String host; + private final Path path; + private final int maxAttempts; + private final Duration maxBackoffTime; + private final Duration maxRetryTime; + + private final AtomicBoolean closed = new AtomicBoolean(); + + private InputStream in; + private long streamPosition; + private long nextReadPosition; + + public PrestoS3InputStream(AmazonS3 s3, String host, Path path, int maxAttempts, Duration maxBackoffTime, Duration maxRetryTime) + { + this.s3 = requireNonNull(s3, "s3 is null"); + this.host = requireNonNull(host, "host is null"); + this.path = requireNonNull(path, "path is null"); + + checkArgument(maxAttempts >= 0, "maxAttempts cannot be negative"); + this.maxAttempts = maxAttempts; + this.maxBackoffTime = requireNonNull(maxBackoffTime, "maxBackoffTime is null"); + this.maxRetryTime = requireNonNull(maxRetryTime, "maxRetryTime is null"); + } + + @Override + public void close() + { + closed.set(true); + closeStream(); + } + + @Override + public int read(long position, byte[] buffer, int offset, int length) + throws IOException + { + checkClosed(); + if (position < 0) { + throw new EOFException(NEGATIVE_SEEK); + } + checkPositionIndexes(offset, offset + length, buffer.length); + if (length == 0) { + return 0; + } + + try { + return RetryDriver.retry() + .maxAttempts(maxAttempts) + .exponentialBackoff(BACKOFF_MIN_SLEEP, maxBackoffTime, maxRetryTime, 2.0) + .stopOn(InterruptedException.class, UnrecoverableS3OperationException.class, EOFException.class) + .onRetry(STATS::newGetObjectRetry) + .run("getS3Object", () -> { + InputStream stream; + try { + GetObjectRequest request = new GetObjectRequest(host, keyFromPath(path)) + .withRange(position, (position + length) - 1); + stream = s3.getObject(request).getObjectContent(); + } + catch 
(RuntimeException e) { + STATS.newGetObjectError(); + if (e instanceof AmazonS3Exception) { + switch (((AmazonS3Exception) e).getStatusCode()) { + case HTTP_RANGE_NOT_SATISFIABLE: + throw new EOFException(CANNOT_SEEK_PAST_EOF); + case HTTP_FORBIDDEN: + case HTTP_NOT_FOUND: + case HTTP_BAD_REQUEST: + throw new UnrecoverableS3OperationException(path, e); + } + } + throw e; + } + + STATS.connectionOpened(); + try { + int read = 0; + while (read < length) { + int n = stream.read(buffer, offset + read, length - read); + if (n <= 0) { + break; + } + read += n; + } + return read; + } + catch (Throwable t) { + STATS.newReadError(t); + abortStream(stream); + throw t; + } + finally { + STATS.connectionReleased(); + stream.close(); + } + }); + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public void seek(long pos) + throws IOException + { + checkClosed(); + if (pos < 0) { + throw new EOFException(NEGATIVE_SEEK); + } + + // this allows a seek beyond the end of the stream but the next read will fail + nextReadPosition = pos; + } + + @Override + public long getPos() + { + return nextReadPosition; + } + + @Override + public int read() + { + // This stream is wrapped with BufferedInputStream, so this method should never be called + throw new UnsupportedOperationException(); + } + + @Override + public int read(byte[] buffer, int offset, int length) + throws IOException + { + checkClosed(); + try { + int bytesRead = RetryDriver.retry() + .maxAttempts(maxAttempts) + .exponentialBackoff(BACKOFF_MIN_SLEEP, maxBackoffTime, maxRetryTime, 2.0) + .stopOn(InterruptedException.class, UnrecoverableS3OperationException.class, AbortedException.class) + .onRetry(STATS::newReadRetry) + .run("readStream", () -> { + seekStream(); + try { + return in.read(buffer, offset, length); + } + catch (Exception e) { + STATS.newReadError(e); + closeStream(); + throw e; + } + }); + + if (bytesRead != -1) { + streamPosition += bytesRead; + nextReadPosition += bytesRead; + } + return bytesRead; + } + catch (Exception e) { + throw propagate(e); + } + } + + @Override + public boolean seekToNewSource(long targetPos) + { + return false; + } + + private void seekStream() + throws IOException + { + if ((in != null) && (nextReadPosition == streamPosition)) { + // already at specified position + return; + } + + if ((in != null) && (nextReadPosition > streamPosition)) { + // seeking forwards + long skip = nextReadPosition - streamPosition; + if (skip <= max(in.available(), MAX_SKIP_SIZE.toBytes())) { + // already buffered or seek is small enough + try { + if (in.skip(skip) == skip) { + streamPosition = nextReadPosition; + return; + } + } + catch (IOException ignored) { + // will retry by re-opening the stream + } + } + } + + // close the stream and open at desired position + streamPosition = nextReadPosition; + closeStream(); + openStream(); + } + + private void openStream() + throws IOException + { + if (in == null) { + in = openStream(path, nextReadPosition); + streamPosition = nextReadPosition; + STATS.connectionOpened(); + } + } + + private InputStream openStream(Path path, long start) + throws IOException + { + try { + return RetryDriver.retry() + .maxAttempts(maxAttempts) + .exponentialBackoff(BACKOFF_MIN_SLEEP, maxBackoffTime, maxRetryTime, 2.0) + .stopOn(InterruptedException.class, UnrecoverableS3OperationException.class) + .onRetry(STATS::newGetObjectRetry) + .run("getS3Object", () -> { + try { + GetObjectRequest request = new GetObjectRequest(host, keyFromPath(path)).withRange(start); + return 
s3.getObject(request).getObjectContent(); + } + catch (RuntimeException e) { + STATS.newGetObjectError(); + if (e instanceof AmazonS3Exception) { + switch (((AmazonS3Exception) e).getStatusCode()) { + case HTTP_RANGE_NOT_SATISFIABLE: + // ignore request for start past end of object + return new ByteArrayInputStream(new byte[0]); + case HTTP_FORBIDDEN: + case HTTP_NOT_FOUND: + case HTTP_BAD_REQUEST: + throw new UnrecoverableS3OperationException(path, e); + } + } + throw e; + } + }); + } + catch (Exception e) { + throw propagate(e); + } + } + + private void closeStream() + { + if (in != null) { + abortStream(in); + in = null; + STATS.connectionReleased(); + } + } + + private void checkClosed() + throws IOException + { + if (closed.get()) { + throw new IOException(STREAM_IS_CLOSED); + } + } + + private static void abortStream(InputStream in) + { + try { + if (in instanceof S3ObjectInputStream) { + ((S3ObjectInputStream) in).abort(); + } + else { + in.close(); + } + } + catch (IOException | AbortedException ignored) { + // thrown if the current thread is in the interrupted state + } + } + + private static RuntimeException propagate(Exception e) + throws IOException + { + if (e instanceof InterruptedException) { + Thread.currentThread().interrupt(); + throw new InterruptedIOException(); + } + throwIfInstanceOf(e, IOException.class); + throwIfUnchecked(e); + throw new IOException(e); + } + } + + private static class PrestoS3OutputStream + extends FilterOutputStream + { + private final TransferManager transferManager; + private final String host; + private final String key; + private final File tempFile; + private final boolean sseEnabled; + private final PrestoS3SseType sseType; + private final String sseKmsKeyId; + private final CannedAccessControlList aclType; + + private boolean closed; + + public PrestoS3OutputStream( + AmazonS3 s3, + String host, + String key, + File tempFile, + boolean sseEnabled, + PrestoS3SseType sseType, + String sseKmsKeyId, + long multiPartUploadMinFileSize, + long multiPartUploadMinPartSize, + PrestoS3AclType aclType) + throws IOException + { + super(new BufferedOutputStream(new FileOutputStream(requireNonNull(tempFile, "tempFile is null")))); + + transferManager = TransferManagerBuilder.standard() + .withS3Client(requireNonNull(s3, "s3 is null")) + .withMinimumUploadPartSize(multiPartUploadMinPartSize) + .withMultipartUploadThreshold(multiPartUploadMinFileSize).build(); + + requireNonNull(aclType, "aclType is null"); + this.aclType = aclType.getCannedACL(); + this.host = requireNonNull(host, "host is null"); + this.key = requireNonNull(key, "key is null"); + this.tempFile = tempFile; + this.sseEnabled = sseEnabled; + this.sseType = requireNonNull(sseType, "sseType is null"); + this.sseKmsKeyId = sseKmsKeyId; + + log.debug("OutputStream for key '%s' using file: %s", key, tempFile); + } + + @Override + public void close() + throws IOException + { + if (closed) { + return; + } + closed = true; + + try { + super.close(); + uploadObject(); + } + finally { + if (!tempFile.delete()) { + log.warn("Could not delete temporary file: %s", tempFile); + } + // close transfer manager but keep underlying S3 client open + transferManager.shutdownNow(false); + } + } + + private void uploadObject() + throws IOException + { + try { + log.debug("Starting upload for host: %s, key: %s, file: %s, size: %s", host, key, tempFile, tempFile.length()); + STATS.uploadStarted(); + + PutObjectRequest request = new PutObjectRequest(host, key, tempFile); + if (sseEnabled) { + switch (sseType) { + 
case KMS: + if (sseKmsKeyId != null) { + request.withSSEAwsKeyManagementParams(new SSEAwsKeyManagementParams(sseKmsKeyId)); + } + else { + request.withSSEAwsKeyManagementParams(new SSEAwsKeyManagementParams()); + } + break; + case S3: + ObjectMetadata metadata = new ObjectMetadata(); + metadata.setSSEAlgorithm(ObjectMetadata.AES_256_SERVER_SIDE_ENCRYPTION); + request.setMetadata(metadata); + break; + } + } + + request.withCannedAcl(aclType); + + Upload upload = transferManager.upload(request); + + if (log.isDebugEnabled()) { + upload.addProgressListener(createProgressListener(upload)); + } + + upload.waitForCompletion(); + STATS.uploadSuccessful(); + log.debug("Completed upload for host: %s, key: %s", host, key); + } + catch (AmazonClientException e) { + STATS.uploadFailed(); + throw new IOException(e); + } + catch (InterruptedException e) { + STATS.uploadFailed(); + Thread.currentThread().interrupt(); + throw new InterruptedIOException(); + } + } + + private ProgressListener createProgressListener(Transfer transfer) + { + return new ProgressListener() + { + private ProgressEventType previousType; + private double previousTransferred; + + @Override + public synchronized void progressChanged(ProgressEvent progressEvent) + { + ProgressEventType eventType = progressEvent.getEventType(); + if (previousType != eventType) { + log.debug("Upload progress event (%s/%s): %s", host, key, eventType); + previousType = eventType; + } + + double transferred = transfer.getProgress().getPercentTransferred(); + if (transferred >= (previousTransferred + 10.0)) { + log.debug("Upload percentage (%s/%s): %.0f%%", host, key, transferred); + previousTransferred = transferred; + } + } + }; + } + } + + @VisibleForTesting + AmazonS3 getS3Client() + { + return s3; + } + + @VisibleForTesting + void setS3Client(AmazonS3 client) + { + s3 = client; + } + + /** + * Helper function used to work around the fact that if you use an S3 bucket with an '_' that java.net.URI + * behaves differently and sets the host value to null whereas S3 buckets without '_' have a properly + * set host field. '_' is only allowed in S3 bucket names in us-east-1. + * + * @param uri The URI from which to extract a host value. + * @return The host value where uri.getAuthority() is used when uri.getHost() returns null as long as no UserInfo is present. + * @throws IllegalArgumentException If the bucket can not be determined from the URI. + */ + public static String getBucketName(URI uri) + { + if (uri.getHost() != null) { + return uri.getHost(); + } + + if (uri.getUserInfo() == null) { + return uri.getAuthority(); + } + + throw new IllegalArgumentException("Unable to determine S3 bucket from URI."); + } + + public static PrestoS3FileSystemStats getFileSystemStats() + { + return STATS; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemMetricCollector.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemMetricCollector.java new file mode 100644 index 00000000..43294a67 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemMetricCollector.java @@ -0,0 +1,73 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.Request; +import com.amazonaws.Response; +import com.amazonaws.metrics.RequestMetricCollector; +import com.amazonaws.util.AWSRequestMetrics; +import com.amazonaws.util.TimingInfo; +import io.airlift.units.Duration; + +import static com.amazonaws.util.AWSRequestMetrics.Field.ClientExecuteTime; +import static com.amazonaws.util.AWSRequestMetrics.Field.HttpClientRetryCount; +import static com.amazonaws.util.AWSRequestMetrics.Field.HttpRequestTime; +import static com.amazonaws.util.AWSRequestMetrics.Field.RequestCount; +import static com.amazonaws.util.AWSRequestMetrics.Field.ThrottleException; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +public class PrestoS3FileSystemMetricCollector + extends RequestMetricCollector +{ + private final PrestoS3FileSystemStats stats; + + public PrestoS3FileSystemMetricCollector(PrestoS3FileSystemStats stats) + { + this.stats = requireNonNull(stats, "stats is null"); + } + + @Override + public void collectMetrics(Request request, Response response) + { + AWSRequestMetrics metrics = request.getAWSRequestMetrics(); + + TimingInfo timingInfo = metrics.getTimingInfo(); + Number requestCounts = timingInfo.getCounter(RequestCount.name()); + Number retryCounts = timingInfo.getCounter(HttpClientRetryCount.name()); + Number throttleExceptions = timingInfo.getCounter(ThrottleException.name()); + TimingInfo requestTime = timingInfo.getSubMeasurement(HttpRequestTime.name()); + TimingInfo clientExecuteTime = timingInfo.getSubMeasurement(ClientExecuteTime.name()); + + if (requestCounts != null) { + stats.updateAwsRequestCount(requestCounts.longValue()); + } + + if (retryCounts != null) { + stats.updateAwsRetryCount(retryCounts.longValue()); + } + + if (throttleExceptions != null) { + stats.updateAwsThrottleExceptionsCount(throttleExceptions.longValue()); + } + + if (requestTime != null && requestTime.getTimeTakenMillisIfKnown() != null) { + stats.addAwsRequestTime(new Duration(requestTime.getTimeTakenMillisIfKnown(), MILLISECONDS)); + } + + if (clientExecuteTime != null && clientExecuteTime.getTimeTakenMillisIfKnown() != null) { + stats.addAwsClientExecuteTime(new Duration(clientExecuteTime.getTimeTakenMillisIfKnown(), MILLISECONDS)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemStats.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemStats.java new file mode 100644 index 00000000..e82825f4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3FileSystemStats.java @@ -0,0 +1,319 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.AbortedException; +import io.airlift.stats.CounterStat; +import io.airlift.stats.TimeStat; +import io.airlift.units.Duration; +import org.weakref.jmx.Managed; +import org.weakref.jmx.Nested; + +import java.net.SocketException; +import java.net.SocketTimeoutException; + +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +public class PrestoS3FileSystemStats +{ + private final CounterStat activeConnections = new CounterStat(); + private final CounterStat startedUploads = new CounterStat(); + private final CounterStat failedUploads = new CounterStat(); + private final CounterStat successfulUploads = new CounterStat(); + private final CounterStat metadataCalls = new CounterStat(); + private final CounterStat listStatusCalls = new CounterStat(); + private final CounterStat listLocatedStatusCalls = new CounterStat(); + private final CounterStat listObjectsCalls = new CounterStat(); + private final CounterStat otherReadErrors = new CounterStat(); + private final CounterStat awsAbortedExceptions = new CounterStat(); + private final CounterStat socketExceptions = new CounterStat(); + private final CounterStat socketTimeoutExceptions = new CounterStat(); + private final CounterStat getObjectErrors = new CounterStat(); + private final CounterStat getMetadataErrors = new CounterStat(); + private final CounterStat getObjectRetries = new CounterStat(); + private final CounterStat getMetadataRetries = new CounterStat(); + private final CounterStat readRetries = new CounterStat(); + + // see AWSRequestMetrics + private final CounterStat awsRequestCount = new CounterStat(); + private final CounterStat awsRetryCount = new CounterStat(); + private final CounterStat awsThrottleExceptions = new CounterStat(); + private final TimeStat awsRequestTime = new TimeStat(MILLISECONDS); + private final TimeStat awsClientExecuteTime = new TimeStat(MILLISECONDS); + + @Managed + @Nested + public CounterStat getActiveConnections() + { + return activeConnections; + } + + @Managed + @Nested + public CounterStat getStartedUploads() + { + return startedUploads; + } + + @Managed + @Nested + public CounterStat getFailedUploads() + { + return failedUploads; + } + + @Managed + @Nested + public CounterStat getSuccessfulUploads() + { + return successfulUploads; + } + + @Managed + @Nested + public CounterStat getMetadataCalls() + { + return metadataCalls; + } + + @Managed + @Nested + public CounterStat getListStatusCalls() + { + return listStatusCalls; + } + + @Managed + @Nested + public CounterStat getListLocatedStatusCalls() + { + return listLocatedStatusCalls; + } + + @Managed + @Nested + public CounterStat getListObjectsCalls() + { + return listObjectsCalls; + } + + @Managed + @Nested + public CounterStat getGetObjectErrors() + { + return getObjectErrors; + } + + @Managed + @Nested + public CounterStat getGetMetadataErrors() + { + return getMetadataErrors; + } + + @Managed + @Nested + public CounterStat getOtherReadErrors() + { + return otherReadErrors; + } + + @Managed + @Nested + public CounterStat getSocketExceptions() + { + return 
socketExceptions; + } + + @Managed + @Nested + public CounterStat getSocketTimeoutExceptions() + { + return socketTimeoutExceptions; + } + + @Managed + @Nested + public CounterStat getAwsAbortedExceptions() + { + return awsAbortedExceptions; + } + + @Managed + @Nested + public CounterStat getAwsRequestCount() + { + return awsRequestCount; + } + + @Managed + @Nested + public CounterStat getAwsRetryCount() + { + return awsRetryCount; + } + + @Managed + @Nested + public CounterStat getAwsThrottleExceptions() + { + return awsThrottleExceptions; + } + + @Managed + @Nested + public TimeStat getAwsRequestTime() + { + return awsRequestTime; + } + + @Managed + @Nested + public TimeStat getAwsClientExecuteTime() + { + return awsClientExecuteTime; + } + + @Managed + @Nested + public CounterStat getGetObjectRetries() + { + return getObjectRetries; + } + + @Managed + @Nested + public CounterStat getGetMetadataRetries() + { + return getMetadataRetries; + } + + @Managed + @Nested + public CounterStat getReadRetries() + { + return readRetries; + } + + public void connectionOpened() + { + activeConnections.update(1); + } + + public void connectionReleased() + { + activeConnections.update(-1); + } + + public void uploadStarted() + { + startedUploads.update(1); + } + + public void uploadFailed() + { + failedUploads.update(1); + } + + public void uploadSuccessful() + { + successfulUploads.update(1); + } + + public void newMetadataCall() + { + metadataCalls.update(1); + } + + public void newListStatusCall() + { + listStatusCalls.update(1); + } + + public void newListLocatedStatusCall() + { + listLocatedStatusCalls.update(1); + } + + public void newListObjectsCall() + { + listObjectsCalls.update(1); + } + + public void newReadError(Throwable t) + { + if (t instanceof SocketException) { + socketExceptions.update(1); + } + else if (t instanceof SocketTimeoutException) { + socketTimeoutExceptions.update(1); + } + else if (t instanceof AbortedException) { + awsAbortedExceptions.update(1); + } + else { + otherReadErrors.update(1); + } + } + + public void newGetObjectError() + { + getObjectErrors.update(1); + } + + public void newGetMetadataError() + { + getMetadataErrors.update(1); + } + + public void updateAwsRequestCount(long requestCount) + { + awsRequestCount.update(requestCount); + } + + public void updateAwsRetryCount(long retryCount) + { + awsRetryCount.update(retryCount); + } + + public void updateAwsThrottleExceptionsCount(long throttleExceptionsCount) + { + awsThrottleExceptions.update(throttleExceptionsCount); + } + + public void addAwsRequestTime(Duration duration) + { + awsRequestTime.add(duration); + } + + public void addAwsClientExecuteTime(Duration duration) + { + awsClientExecuteTime.add(duration); + } + + public void newGetObjectRetry() + { + getObjectRetries.update(1); + } + + public void newGetMetadataRetry() + { + getMetadataRetries.update(1); + } + + public void newReadRetry() + { + readRetries.update(1); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SelectClient.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SelectClient.java new file mode 100644 index 00000000..bad58a40 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SelectClient.java @@ -0,0 +1,87 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import com.amazonaws.services.s3.model.SelectObjectContentResult; +import io.prestosql.plugin.hive.HiveConfig; +import org.apache.hadoop.conf.Configuration; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; + +import static com.amazonaws.services.s3.model.SelectObjectContentEvent.EndEvent; +import static java.util.Objects.requireNonNull; + +public class PrestoS3SelectClient + implements Closeable +{ + private final AmazonS3 s3Client; + private boolean requestComplete; + private SelectObjectContentRequest selectObjectRequest; + private SelectObjectContentResult selectObjectContentResult; + + public PrestoS3SelectClient(Configuration configuration, HiveConfig hiveConfig, PrestoS3ClientFactory s3ClientFactory) + { + requireNonNull(configuration, "configuration is null"); + requireNonNull(hiveConfig, "hiveConfig is null"); + requireNonNull(s3ClientFactory, "s3ClientFactory is null"); + this.s3Client = s3ClientFactory.getS3Client(configuration, hiveConfig); + } + + public InputStream getRecordsContent(SelectObjectContentRequest selectObjectRequest) + { + this.selectObjectRequest = requireNonNull(selectObjectRequest, "selectObjectRequest is null"); + this.selectObjectContentResult = s3Client.selectObjectContent(selectObjectRequest); + return selectObjectContentResult.getPayload() + .getRecordsInputStream( + new SelectObjectContentEventVisitor() + { + @Override + public void visit(EndEvent endEvent) + { + requestComplete = true; + } + }); + } + + @Override + public void close() + throws IOException + { + selectObjectContentResult.close(); + } + + public String getKeyName() + { + return selectObjectRequest.getKey(); + } + + public String getBucketName() + { + return selectObjectRequest.getBucketName(); + } + + /** + * The End Event indicates all matching records have been transmitted. + * If the End Event is not received, the results may be incomplete. + */ + public boolean isRequestComplete() + { + return requestComplete; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SignerType.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SignerType.java new file mode 100644 index 00000000..f69b8163 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SignerType.java @@ -0,0 +1,27 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +// These are the exact names used by SignerFactory in the AWS library +// and thus cannot be renamed or use the normal naming convention. +@SuppressWarnings("EnumeratedConstantNamingConvention") +public enum PrestoS3SignerType +{ + S3SignerType, + AWS3SignerType, + AWS4SignerType, + AWSS3V4SignerType, + CloudFrontSignerType, + QueryStringSignerType, +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SseType.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SseType.java new file mode 100644 index 00000000..bae2d4a8 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/PrestoS3SseType.java @@ -0,0 +1,20 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +public enum PrestoS3SseType +{ + KMS, + S3; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/S3FileSystemType.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/S3FileSystemType.java new file mode 100644 index 00000000..c1556b77 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/s3/S3FileSystemType.java @@ -0,0 +1,20 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +public enum S3FileSystemType +{ + PRESTO, + EMRFS, +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadata.java new file mode 100644 index 00000000..829dcb4a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadata.java @@ -0,0 +1,125 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.security.GrantInfo; +import io.prestosql.spi.security.Privilege; +import io.prestosql.spi.security.RoleGrant; + +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; + +public interface AccessControlMetadata +{ + /** + * Creates the specified role. + * + * @param grantor represents the principal specified by WITH ADMIN statement + */ + default void createRole(ConnectorSession session, String role, Optional grantor) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support create role"); + } + + /** + * Drops the specified role. + */ + default void dropRole(ConnectorSession session, String role) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support drop role"); + } + + /** + * List available roles. + */ + default Set listRoles(ConnectorSession session) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support roles"); + } + + /** + * List role grants for a given principal, not recursively. + */ + default Set listRoleGrants(ConnectorSession session, HivePrincipal principal) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support roles"); + } + + /** + * Grants the specified roles to the specified grantees + * + * @param grantor represents the principal specified by GRANTED BY statement + */ + default void grantRoles(ConnectorSession connectorSession, Set roles, Set grantees, boolean withAdminOption, Optional grantor) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support roles"); + } + + /** + * Revokes the specified roles from the specified grantees + * + * @param grantor represents the principal specified by GRANTED BY statement + */ + default void revokeRoles(ConnectorSession connectorSession, Set roles, Set grantees, boolean adminOptionFor, Optional grantor) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support roles"); + } + + /** + * List applicable roles, including the transitive grants, for the specified principal + */ + default Set listApplicableRoles(ConnectorSession session, HivePrincipal principal) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support roles"); + } + + /** + * List applicable roles, including the transitive grants, in given session + */ + default Set listEnabledRoles(ConnectorSession session) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support roles"); + } + + /** + * Grants the specified privilege to the specified user on the specified table + */ + default void grantTablePrivileges(ConnectorSession session, SchemaTableName tableName, Set privileges, HivePrincipal grantee, boolean grantOption) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support grants"); + } + + 
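    // A minimal sketch (not part of this patch) of an AccessControlMetadata implementation:
    // it overrides only the role-listing methods and leaves every other operation at its
    // NOT_SUPPORTED default. The class name and the fixed role set are assumptions made
    // purely for illustration; a real implementation would consult the Hive metastore.
    class StaticRolesAccessControlMetadata implements AccessControlMetadata
    {
        // Hypothetical fixed role set used only by this sketch.
        private static final Set<String> ROLES = java.util.Collections.singleton("public");

        @Override
        public Set<String> listRoles(ConnectorSession session)
        {
            return ROLES;
        }

        @Override
        public Set<String> listEnabledRoles(ConnectorSession session)
        {
            return ROLES;
        }
    }
    // With this pattern, role listing succeeds while grants, revokes and table privileges
    // still fail with NOT_SUPPORTED, because those defaults are left untouched.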
/** + * Revokes the specified privilege on the specified table from the specified user + */ + default void revokeTablePrivileges(ConnectorSession session, SchemaTableName tableName, Set privileges, HivePrincipal grantee, boolean grantOption) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support revokes"); + } + + /** + * List the table privileges granted to the specified grantee for the tables that have the specified prefix considering the selected session role + */ + default List listTablePrivileges(ConnectorSession session, List tableName) + { + throw new PrestoException(NOT_SUPPORTED, "This connector does not support table privileges"); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadataFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadataFactory.java new file mode 100644 index 00000000..1f64a9bc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/AccessControlMetadataFactory.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; + +public interface AccessControlMetadataFactory +{ + AccessControlMetadata create(SemiTransactionalHiveMetastore metastore); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/HiveSecurityModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/HiveSecurityModule.java new file mode 100644 index 00000000..b2ed9970 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/HiveSecurityModule.java @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.security; + +import com.google.inject.Binder; +import com.google.inject.Module; +import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.prestosql.plugin.base.security.FileBasedAccessControlModule; +import io.prestosql.plugin.base.security.ReadOnlySecurityModule; + +import static io.airlift.configuration.ConditionalModule.installModuleIf; +import static io.airlift.configuration.ConfigurationModule.installModules; + +public class HiveSecurityModule + extends AbstractConfigurationAwareModule +{ + @Override + protected void setup(Binder binder) + { + bindSecurityModule( + "legacy", + installModules( + new LegacySecurityModule(), + new StaticAccessControlMetadataModule())); + bindSecurityModule( + "file", + installModules( + new FileBasedAccessControlModule(), + new StaticAccessControlMetadataModule())); + bindSecurityModule( + "read-only", + installModules( + new ReadOnlySecurityModule(), + new StaticAccessControlMetadataModule())); + bindSecurityModule("sql-standard", new SqlStandardSecurityModule( + buildConfigObject(SecurityConfig.class).getSqlStandardAccessControlImp())); + } + + private void bindSecurityModule(String name, Module module) + { + install(installModuleIf( + SecurityConfig.class, + security -> name.equalsIgnoreCase(security.getSecuritySystem()), + module)); + } + + private static class StaticAccessControlMetadataModule + implements Module + { + @Override + public void configure(Binder binder) + { + binder.bind(AccessControlMetadataFactory.class).toInstance(metastore -> new AccessControlMetadata() {}); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacyAccessControl.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacyAccessControl.java new file mode 100644 index 00000000..ce24ae8f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacyAccessControl.java @@ -0,0 +1,310 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.security; + +import io.prestosql.plugin.hive.HiveTransactionHandle; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorAccessControl; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.security.ConnectorIdentity; +import io.prestosql.spi.security.Identity; +import io.prestosql.spi.security.PrestoPrincipal; +import io.prestosql.spi.security.Privilege; +import io.prestosql.spi.security.ViewExpression; +import io.prestosql.spi.type.Type; + +import javax.inject.Inject; + +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; + +import static io.prestosql.spi.security.AccessDeniedException.denyAddColumn; +import static io.prestosql.spi.security.AccessDeniedException.denyCommentTable; +import static io.prestosql.spi.security.AccessDeniedException.denyDropColumn; +import static io.prestosql.spi.security.AccessDeniedException.denyDropTable; +import static io.prestosql.spi.security.AccessDeniedException.denyRenameColumn; +import static io.prestosql.spi.security.AccessDeniedException.denyRenameTable; +import static java.util.Objects.requireNonNull; + +public class LegacyAccessControl + implements ConnectorAccessControl +{ + private final Function metastoreProvider; + private final boolean allowDropTable; + private final boolean allowRenameTable; + private final boolean allowCommentTable; + private final boolean allowAddColumn; + private final boolean allowDropColumn; + private final boolean allowRenameColumn; + + @Inject + public LegacyAccessControl( + Function metastoreProvider, + LegacySecurityConfig securityConfig) + { + this.metastoreProvider = requireNonNull(metastoreProvider, "metastoreProvider is null"); + + requireNonNull(securityConfig, "securityConfig is null"); + allowDropTable = securityConfig.getAllowDropTable(); + allowRenameTable = securityConfig.getAllowRenameTable(); + allowCommentTable = securityConfig.getAllowCommentTable(); + allowAddColumn = securityConfig.getAllowAddColumn(); + allowDropColumn = securityConfig.getAllowDropColumn(); + allowRenameColumn = securityConfig.getAllowRenameColumn(); + } + + @Override + public void checkCanCreateSchema(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String schemaName) + { + } + + @Override + public void checkCanDropSchema(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String schemaName) + { + } + + @Override + public void checkCanRenameSchema(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String schemaName, String newSchemaName) + { + } + + @Override + public void checkCanShowSchemas(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity) + { + } + + @Override + public Set filterSchemas(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set schemaNames) + { + return schemaNames; + } + + @Override + public void checkCanCreateTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public void checkCanDropTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!allowDropTable) { + 
denyDropTable(tableName.toString()); + } + + Optional<Table>
target = metastoreProvider.apply(((HiveTransactionHandle) transaction)).getTable(new HiveIdentity(identity), tableName.getSchemaName(), tableName.getTableName()); + + if (!target.isPresent()) { + denyDropTable(tableName.toString(), "Table not found"); + } + + if (!identity.getUser().equals(target.get().getOwner())) { + denyDropTable(tableName.toString(), "Owner of the table is different from session user"); + } + } + + @Override + public void checkCanRenameTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName, SchemaTableName newTableName) + { + if (!allowRenameTable) { + denyRenameTable(tableName.toString(), newTableName.toString()); + } + } + + @Override + public void checkCanSetTableComment(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!allowCommentTable) { + denyCommentTable(tableName.toString()); + } + } + + @Override + public void checkCanShowTablesMetadata(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String schemaName) + { + } + + @Override + public Set filterTables(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set tableNames) + { + return tableNames; + } + + @Override + public void checkCanShowColumnsMetadata(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public List filterColumns(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName, List columns) + { + return columns; + } + + @Override + public void checkCanAddColumn(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!allowAddColumn) { + denyAddColumn(tableName.toString()); + } + } + + @Override + public void checkCanDropColumn(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!allowDropColumn) { + denyDropColumn(tableName.toString()); + } + } + + @Override + public void checkCanRenameColumn(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!allowRenameColumn) { + denyRenameColumn(tableName.toString()); + } + } + + @Override + public void checkCanSelectFromColumns(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName, Set columnNames) + { + } + + @Override + public void checkCanInsertIntoTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public void checkCanDeleteFromTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public void checkCanCreateView(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName viewName) + { + } + + @Override + public void checkCanDropView(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName viewName) + { + } + + @Override + public void checkCanCreateViewWithSelectFromColumns(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName, Set columnNames) + { + } + + @Override + public void checkCanSetCatalogSessionProperty(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String propertyName) + { + } + + @Override + public void checkCanGrantTablePrivilege(ConnectorTransactionHandle transaction, ConnectorIdentity identity, Privilege privilege, 
SchemaTableName tableName, PrestoPrincipal grantee, boolean withGrantOption) + { + } + + @Override + public void checkCanRevokeTablePrivilege(ConnectorTransactionHandle transaction, ConnectorIdentity identity, Privilege privilege, SchemaTableName tableName, PrestoPrincipal revokee, boolean grantOptionFor) + { + } + + @Override + public void checkCanCreateRole(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String role, Optional grantor) + { + } + + @Override + public void checkCanDropRole(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String role) + { + } + + @Override + public void checkCanGrantRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set roles, Set grantees, boolean withAdminOption, Optional grantor, String catalogName) + { + } + + @Override + public void checkCanRevokeRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set roles, Set grantees, boolean adminOptionFor, Optional grantor, String catalogName) + { + } + + @Override + public void checkCanSetRole(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String role, String catalogName) + { + } + + @Override + public void checkCanShowRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String catalogName) + { + } + + @Override + public void checkCanShowCurrentRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String catalogName) + { + } + + @Override + public void checkCanShowRoleGrants(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String catalogName) + { + } + + @Override + public void checkCanUpdateTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public Optional getRowFilter(ConnectorTransactionHandle transactionHandle, Identity identity, SchemaTableName tableName) + { + return Optional.empty(); + } + + @Override + public Optional getColumnMask(ConnectorTransactionHandle transactionHandle, Identity identity, SchemaTableName tableName, String columnName, Type type) + { + return Optional.empty(); + } + + @Override + public void checkCanCreateIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public void checkCanDropIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public void checkCanRenameIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName indexName, SchemaTableName newIndexName) + { + } + + @Override + public void checkCanUpdateIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName indexName) + { + } + + @Override + public void checkCanShowIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName indexName) + { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityConfig.java new file mode 100644 index 00000000..446b6aff --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityConfig.java @@ -0,0 +1,105 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you 
may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; + +public class LegacySecurityConfig +{ + private boolean allowAddColumn; + private boolean allowDropColumn; + private boolean allowDropTable; + private boolean allowRenameTable; + private boolean allowCommentTable; + private boolean allowRenameColumn; + + public boolean getAllowAddColumn() + { + return this.allowAddColumn; + } + + @Config("hive.allow-add-column") + @ConfigDescription("Allow Hive connector to add column") + public LegacySecurityConfig setAllowAddColumn(boolean allowAddColumn) + { + this.allowAddColumn = allowAddColumn; + return this; + } + + public boolean getAllowDropColumn() + { + return this.allowDropColumn; + } + + @Config("hive.allow-drop-column") + @ConfigDescription("Allow Hive connector to drop column") + public LegacySecurityConfig setAllowDropColumn(boolean allowDropColumn) + { + this.allowDropColumn = allowDropColumn; + return this; + } + + public boolean getAllowDropTable() + { + return this.allowDropTable; + } + + @Config("hive.allow-drop-table") + @ConfigDescription("Allow Hive connector to drop table") + public LegacySecurityConfig setAllowDropTable(boolean allowDropTable) + { + this.allowDropTable = allowDropTable; + return this; + } + + public boolean getAllowRenameTable() + { + return this.allowRenameTable; + } + + @Config("hive.allow-rename-table") + @ConfigDescription("Allow Hive connector to rename table") + public LegacySecurityConfig setAllowRenameTable(boolean allowRenameTable) + { + this.allowRenameTable = allowRenameTable; + return this; + } + + public boolean getAllowCommentTable() + { + return this.allowCommentTable; + } + + @Config("hive.allow-comment-table") + @ConfigDescription("Allow Hive connector to set comment for a table") + public LegacySecurityConfig setAllowCommentTable(boolean allowCommentTable) + { + this.allowCommentTable = allowCommentTable; + return this; + } + + public boolean getAllowRenameColumn() + { + return this.allowRenameColumn; + } + + @Config("hive.allow-rename-column") + @ConfigDescription("Allow Hive connector to rename column") + public LegacySecurityConfig setAllowRenameColumn(boolean allowRenameColumn) + { + this.allowRenameColumn = allowRenameColumn; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityModule.java new file mode 100644 index 00000000..968a92fa --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/LegacySecurityModule.java @@ -0,0 +1,32 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Scopes; +import io.prestosql.spi.connector.ConnectorAccessControl; + +import static io.airlift.configuration.ConfigBinder.configBinder; + +public class LegacySecurityModule + implements Module +{ + @Override + public void configure(Binder binder) + { + configBinder(binder).bindConfig(LegacySecurityConfig.class); + binder.bind(ConnectorAccessControl.class).to(LegacyAccessControl.class).in(Scopes.SINGLETON); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConfig.java new file mode 100644 index 00000000..1bfdeb7d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConfig.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import io.airlift.configuration.Config; + +import javax.validation.constraints.NotNull; + +public class SecurityConfig +{ + private String securitySystem = "legacy"; + private String sqlStandardAccessControlImp = ""; + + @NotNull + public String getSecuritySystem() + { + return securitySystem; + } + + @Config("hive.security") + public SecurityConfig setSecuritySystem(String securitySystem) + { + this.securitySystem = securitySystem; + return this; + } + + @NotNull + public String getSqlStandardAccessControlImp() + { + return sqlStandardAccessControlImp; + } + + @Config("hive.security.sql-standard-imp") + public SecurityConfig setSqlStandardAccessControlImp(String sqlStandardAccessControlImp) + { + this.sqlStandardAccessControlImp = sqlStandardAccessControlImp; + return this; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConstants.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConstants.java new file mode 100644 index 00000000..906b5845 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SecurityConstants.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import io.prestosql.plugin.base.security.AllowAllAccessControl; +import io.prestosql.plugin.base.security.FileBasedAccessControl; +import io.prestosql.plugin.base.security.ForwardingConnectorAccessControl; +import io.prestosql.plugin.base.security.ReadOnlyAccessControl; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class SecurityConstants +{ + /** + * Whitelist of allowed SqlStandardAccessControl implementations + */ + public static final List<String> WHITE_LIST_SQLSTANDARDACCESSCONTROL_IMPL = Collections.unmodifiableList(new ArrayList<String>() { + { + // The fully qualified class name is split into two strings to avoid a maven-dependency-plugin issue + String classPackage = "io.prestosql.security"; + String className = ".TestAccessControlManager$DenyConnectorAccessControl"; + this.add(classPackage + className); + this.add(AllowAllAccessControl.class.getName()); + this.add(ForwardingConnectorAccessControl.class.getName()); + this.add(FileBasedAccessControl.class.getName()); + this.add(LegacyAccessControl.class.getName()); + this.add(ReadOnlyAccessControl.class.getName()); + this.add(SqlStandardAccessControl.class.getName()); + this.add(SystemTableAwareAccessControl.class.getName()); + } + }); + + private SecurityConstants() + { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControl.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControl.java new file mode 100644 index 00000000..2d124221 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControl.java @@ -0,0 +1,537 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package io.prestosql.plugin.hive.security; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveCatalogName; +import io.prestosql.plugin.hive.HiveTransactionHandle; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorAccessControl; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.security.AccessDeniedException; +import io.prestosql.spi.security.ConnectorIdentity; +import io.prestosql.spi.security.Identity; +import io.prestosql.spi.security.PrestoPrincipal; +import io.prestosql.spi.security.Privilege; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.security.ViewExpression; +import io.prestosql.spi.type.Type; + +import javax.inject.Inject; + +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; + +import static io.prestosql.plugin.hive.metastore.Database.DEFAULT_DATABASE_NAME; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.DELETE; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.INSERT; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.OWNERSHIP; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.SELECT; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.UPDATE; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.toHivePrivilege; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.isRoleApplicable; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.isRoleEnabled; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.listApplicableRoles; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.listApplicableTablePrivileges; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.listEnabledTablePrivileges; +import static io.prestosql.spi.security.AccessDeniedException.denyAddColumn; +import static io.prestosql.spi.security.AccessDeniedException.denyCommentTable; +import static io.prestosql.spi.security.AccessDeniedException.denyCreateRole; +import static io.prestosql.spi.security.AccessDeniedException.denyCreateSchema; +import static io.prestosql.spi.security.AccessDeniedException.denyCreateTable; +import static io.prestosql.spi.security.AccessDeniedException.denyCreateView; +import static io.prestosql.spi.security.AccessDeniedException.denyCreateViewWithSelect; +import static io.prestosql.spi.security.AccessDeniedException.denyDeleteTable; +import static io.prestosql.spi.security.AccessDeniedException.denyDropColumn; +import static io.prestosql.spi.security.AccessDeniedException.denyDropRole; +import static io.prestosql.spi.security.AccessDeniedException.denyDropSchema; +import static io.prestosql.spi.security.AccessDeniedException.denyDropTable; +import static io.prestosql.spi.security.AccessDeniedException.denyDropView; +import static io.prestosql.spi.security.AccessDeniedException.denyGrantRoles; +import static io.prestosql.spi.security.AccessDeniedException.denyGrantTablePrivilege; 
+import static io.prestosql.spi.security.AccessDeniedException.denyInsertTable; +import static io.prestosql.spi.security.AccessDeniedException.denyRenameColumn; +import static io.prestosql.spi.security.AccessDeniedException.denyRenameSchema; +import static io.prestosql.spi.security.AccessDeniedException.denyRenameTable; +import static io.prestosql.spi.security.AccessDeniedException.denyRevokeRoles; +import static io.prestosql.spi.security.AccessDeniedException.denyRevokeTablePrivilege; +import static io.prestosql.spi.security.AccessDeniedException.denySelectTable; +import static io.prestosql.spi.security.AccessDeniedException.denySetCatalogSessionProperty; +import static io.prestosql.spi.security.AccessDeniedException.denySetRole; +import static io.prestosql.spi.security.AccessDeniedException.denyShowColumnsMetadata; +import static io.prestosql.spi.security.AccessDeniedException.denyShowRoles; +import static io.prestosql.spi.security.AccessDeniedException.denyUpdateTable; +import static io.prestosql.spi.security.PrincipalType.ROLE; +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toSet; + +public class SqlStandardAccessControl + implements ConnectorAccessControl +{ + public static final String ADMIN_ROLE_NAME = "admin"; + private static final String INFORMATION_SCHEMA_NAME = "information_schema"; + private static final SchemaTableName ROLES = new SchemaTableName(INFORMATION_SCHEMA_NAME, "roles"); + + private final String catalogName; + private final Function metastoreProvider; + + @Inject + public SqlStandardAccessControl( + HiveCatalogName catalogName, + Function metastoreProvider) + { + this.catalogName = requireNonNull(catalogName, "catalogName is null").toString(); + this.metastoreProvider = requireNonNull(metastoreProvider, "metastoreProvider is null"); + } + + @Override + public void checkCanCreateSchema(ConnectorTransactionHandle transaction, ConnectorIdentity identity, String schemaName) + { + if (!isAdmin(transaction, identity)) { + denyCreateSchema(schemaName); + } + } + + @Override + public void checkCanDropSchema(ConnectorTransactionHandle transaction, ConnectorIdentity identity, String schemaName) + { + if (!isDatabaseOwner(transaction, identity, schemaName)) { + denyDropSchema(schemaName); + } + } + + @Override + public void checkCanRenameSchema(ConnectorTransactionHandle transaction, ConnectorIdentity identity, String schemaName, String newSchemaName) + { + if (!isDatabaseOwner(transaction, identity, schemaName)) { + denyRenameSchema(schemaName, newSchemaName); + } + } + + @Override + public void checkCanShowSchemas(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity) + { + } + + @Override + public Set filterSchemas(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set schemaNames) + { + return schemaNames; + } + + @Override + public void checkCanCreateTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!isDatabaseOwner(transaction, identity, tableName.getSchemaName())) { + denyCreateTable(tableName.toString()); + } + } + + @Override + public void checkCanDropTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!isTableOwner(transaction, identity, tableName)) { + denyDropTable(tableName.toString()); + } + } + + @Override + public void checkCanRenameTable(ConnectorTransactionHandle transaction, ConnectorIdentity 
identity, SchemaTableName tableName, SchemaTableName newTableName) + { + if (!isTableOwner(transaction, identity, tableName)) { + denyRenameTable(tableName.toString(), newTableName.toString()); + } + } + + @Override + public void checkCanSetTableComment(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!isTableOwner(transaction, identity, tableName)) { + denyCommentTable(tableName.toString()); + } + } + + @Override + public void checkCanShowTablesMetadata(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String schemaName) + { + } + + @Override + public Set filterTables(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set tableNames) + { + return tableNames; + } + + @Override + public void checkCanShowColumnsMetadata(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!hasAnyTablePermission(transactionHandle, identity, tableName)) { + denyShowColumnsMetadata(tableName.toString()); + } + } + + @Override + public List filterColumns(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName, List columns) + { + if (!hasAnyTablePermission(transactionHandle, identity, tableName)) { + return ImmutableList.of(); + } + return columns; + } + + @Override + public void checkCanAddColumn(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!isTableOwner(transaction, identity, tableName)) { + denyAddColumn(tableName.toString()); + } + } + + @Override + public void checkCanDropColumn(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!isTableOwner(transaction, identity, tableName)) { + denyDropColumn(tableName.toString()); + } + } + + @Override + public void checkCanRenameColumn(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!isTableOwner(transaction, identity, tableName)) { + denyRenameColumn(tableName.toString()); + } + } + + @Override + public void checkCanSelectFromColumns(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName, Set columnNames) + { + // TODO: Implement column level access control + if (!checkTablePermission(transaction, identity, tableName, SELECT, false)) { + denySelectTable(tableName.toString()); + } + } + + @Override + public void checkCanInsertIntoTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!checkTablePermission(transaction, identity, tableName, INSERT, false)) { + denyInsertTable(tableName.toString()); + } + } + + @Override + public void checkCanDeleteFromTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!checkTablePermission(transaction, identity, tableName, DELETE, false)) { + denyDeleteTable(tableName.toString()); + } + } + + @Override + public void checkCanCreateView(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName viewName) + { + if (!isDatabaseOwner(transaction, identity, viewName.getSchemaName())) { + denyCreateView(viewName.toString()); + } + } + + @Override + public void checkCanDropView(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName viewName) + { + if (!isTableOwner(transaction, identity, viewName)) { + denyDropView(viewName.toString()); + } + } + + @Override + public void 
checkCanCreateViewWithSelectFromColumns(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName, Set columnNames) + { + checkCanSelectFromColumns(transaction, identity, tableName, columnNames); + + // TODO implement column level access control + if (!checkTablePermission(transaction, identity, tableName, SELECT, true)) { + denyCreateViewWithSelect(tableName.toString(), identity); + } + } + + @Override + public void checkCanSetCatalogSessionProperty(ConnectorTransactionHandle transaction, ConnectorIdentity identity, String propertyName) + { + if (!isAdmin(transaction, identity)) { + denySetCatalogSessionProperty(catalogName, propertyName); + } + } + + @Override + public void checkCanGrantTablePrivilege(ConnectorTransactionHandle transaction, ConnectorIdentity identity, Privilege privilege, SchemaTableName tableName, PrestoPrincipal grantee, boolean withGrantOption) + { + if (isTableOwner(transaction, identity, tableName)) { + return; + } + + if (!hasGrantOptionForPrivilege(transaction, identity, privilege, tableName)) { + denyGrantTablePrivilege(privilege.name(), tableName.toString()); + } + } + + @Override + public void checkCanRevokeTablePrivilege(ConnectorTransactionHandle transaction, ConnectorIdentity identity, Privilege privilege, SchemaTableName tableName, PrestoPrincipal revokee, boolean grantOptionFor) + { + if (isTableOwner(transaction, identity, tableName)) { + return; + } + + if (!hasGrantOptionForPrivilege(transaction, identity, privilege, tableName)) { + denyRevokeTablePrivilege(privilege.name(), tableName.toString()); + } + } + + @Override + public void checkCanCreateRole(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String role, Optional grantor) + { + // currently specifying grantor is supported by metastore, but it is not supported by Hive itself + if (grantor.isPresent()) { + throw new AccessDeniedException("Hive Connector does not support WITH ADMIN statement"); + } + if (!isAdmin(transactionHandle, identity)) { + denyCreateRole(role); + } + } + + @Override + public void checkCanDropRole(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String role) + { + if (!isAdmin(transactionHandle, identity)) { + denyDropRole(role); + } + } + + @Override + public void checkCanGrantRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set roles, Set grantees, boolean withAdminOption, Optional grantor, String catalogName) + { + // currently specifying grantor is supported by metastore, but it is not supported by Hive itself + if (grantor.isPresent()) { + throw new AccessDeniedException("Hive Connector does not support GRANTED BY statement"); + } + if (!hasAdminOptionForRoles(transactionHandle, identity, roles)) { + denyGrantRoles(roles, grantees); + } + } + + @Override + public void checkCanRevokeRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, Set roles, Set grantees, boolean adminOptionFor, Optional grantor, String catalogName) + { + // currently specifying grantor is supported by metastore, but it is not supported by Hive itself + if (grantor.isPresent()) { + throw new AccessDeniedException("Hive Connector does not support GRANTED BY statement"); + } + if (!hasAdminOptionForRoles(transactionHandle, identity, roles)) { + denyRevokeRoles(roles, grantees); + } + } + + @Override + public void checkCanSetRole(ConnectorTransactionHandle transaction, ConnectorIdentity identity, String role, String catalogName) + { + 
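// SET ROLE is allowed only if the role is applicable to the current user (granted directly or transitively).
 +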
SemiTransactionalHiveMetastore metastore = metastoreProvider.apply(((HiveTransactionHandle) transaction)); + if (!isRoleApplicable(metastore, new HivePrincipal(USER, identity.getUser()), role)) { + denySetRole(role); + } + } + + @Override + public void checkCanShowRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String catalogName) + { + if (!isAdmin(transactionHandle, identity)) { + denyShowRoles(catalogName); + } + } + + @Override + public void checkCanShowCurrentRoles(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String catalogName) + { + } + + @Override + public void checkCanShowRoleGrants(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, String catalogName) + { + } + + @Override + public void checkCanUpdateTable(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (!checkTablePermission(transaction, identity, tableName, UPDATE, false)) { + denyUpdateTable(tableName.toString()); + } + } + + private boolean isAdmin(ConnectorTransactionHandle transaction, ConnectorIdentity identity) + { + SemiTransactionalHiveMetastore metastore = metastoreProvider.apply(((HiveTransactionHandle) transaction)); + return isRoleEnabled(identity, metastore::listRoleGrants, ADMIN_ROLE_NAME); + } + + private boolean isDatabaseOwner(ConnectorTransactionHandle transaction, ConnectorIdentity identity, String databaseName) + { + // all users are "owners" of the default database + if (DEFAULT_DATABASE_NAME.equalsIgnoreCase(databaseName)) { + return true; + } + + if (isAdmin(transaction, identity)) { + return true; + } + + SemiTransactionalHiveMetastore metastore = metastoreProvider.apply(((HiveTransactionHandle) transaction)); + Optional databaseMetadata = metastore.getDatabase(databaseName); + if (!databaseMetadata.isPresent()) { + return false; + } + + Database database = databaseMetadata.get(); + + // a database can be owned by a user or role + if (database.getOwnerType() == USER && identity.getUser().equals(database.getOwnerName())) { + return true; + } + if (database.getOwnerType() == ROLE && isRoleEnabled(identity, metastore::listRoleGrants, database.getOwnerName())) { + return true; + } + return false; + } + + private boolean isTableOwner(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + return checkTablePermission(transaction, identity, tableName, OWNERSHIP, false); + } + + private boolean checkTablePermission( + ConnectorTransactionHandle transaction, + ConnectorIdentity identity, + SchemaTableName tableName, + HivePrivilege requiredPrivilege, + boolean grantOptionRequired) + { + if (isAdmin(transaction, identity)) { + return true; + } + + if (tableName.equals(ROLES)) { + return false; + } + + if (INFORMATION_SCHEMA_NAME.equals(tableName.getSchemaName())) { + return true; + } + + SemiTransactionalHiveMetastore metastore = metastoreProvider.apply(((HiveTransactionHandle) transaction)); + return listEnabledTablePrivileges(metastore, tableName.getSchemaName(), tableName.getTableName(), identity) + .filter(privilegeInfo -> !grantOptionRequired || privilegeInfo.isGrantOption()) + .anyMatch(privilegeInfo -> privilegeInfo.getHivePrivilege().equals(requiredPrivilege)); + } + + private boolean hasGrantOptionForPrivilege(ConnectorTransactionHandle transaction, ConnectorIdentity identity, Privilege privilege, SchemaTableName tableName) + { + if (isAdmin(transaction, identity)) { + return true; + } + + SemiTransactionalHiveMetastore 
metastore = metastoreProvider.apply(((HiveTransactionHandle) transaction)); + return listApplicableTablePrivileges( + metastore, + tableName.getSchemaName(), + tableName.getTableName(), + identity.getUser()) + .anyMatch(privilegeInfo -> privilegeInfo.getHivePrivilege().equals(toHivePrivilege(privilege)) && privilegeInfo.isGrantOption()); + } + + private boolean hasAdminOptionForRoles(ConnectorTransactionHandle transaction, ConnectorIdentity identity, Set roles) + { + if (isAdmin(transaction, identity)) { + return true; + } + + SemiTransactionalHiveMetastore metastore = metastoreProvider.apply(((HiveTransactionHandle) transaction)); + Set rolesWithGrantOption = listApplicableRoles(new HivePrincipal(USER, identity.getUser()), metastore::listRoleGrants) + .filter(RoleGrant::isGrantable) + .map(RoleGrant::getRoleName) + .collect(toSet()); + return rolesWithGrantOption.containsAll(roles); + } + + private boolean hasAnyTablePermission(ConnectorTransactionHandle transaction, ConnectorIdentity identity, SchemaTableName tableName) + { + if (isAdmin(transaction, identity)) { + return true; + } + + if (tableName.equals(ROLES)) { + return false; + } + + if (INFORMATION_SCHEMA_NAME.equals(tableName.getSchemaName())) { + return true; + } + + SemiTransactionalHiveMetastore metastore = metastoreProvider.apply(((HiveTransactionHandle) transaction)); + return listEnabledTablePrivileges(metastore, tableName.getSchemaName(), tableName.getTableName(), identity) + .anyMatch(privilegeInfo -> true); + } + + @Override + public Optional getRowFilter(ConnectorTransactionHandle transactionHandle, Identity identity, SchemaTableName tableName) + { + return Optional.empty(); + } + + @Override + public Optional getColumnMask(ConnectorTransactionHandle transactionHandle, Identity identity, SchemaTableName tableName, String columnName, Type type) + { + return Optional.empty(); + } + + @Override + public void checkCanCreateIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public void checkCanDropIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + } + + @Override + public void checkCanRenameIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName indexName, SchemaTableName newIndexName) + { + } + + @Override + public void checkCanUpdateIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName indexName) + { + } + + @Override + public void checkCanShowIndex(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName indexName) + { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControlMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControlMetadata.java new file mode 100644 index 00000000..9363c67f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardAccessControlMetadata.java @@ -0,0 +1,184 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.security.GrantInfo; +import io.prestosql.spi.security.Privilege; +import io.prestosql.spi.security.PrivilegeInfo; +import io.prestosql.spi.security.RoleGrant; + +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.prestosql.plugin.hive.metastore.HivePrivilegeInfo.toHivePrivilege; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.listEnabledPrincipals; +import static io.prestosql.spi.StandardErrorCode.ALREADY_EXISTS; +import static io.prestosql.spi.security.PrincipalType.USER; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toSet; + +public class SqlStandardAccessControlMetadata + implements AccessControlMetadata +{ + private static final Set RESERVED_ROLES = ImmutableSet.of("all", "default", "none"); + + private final SemiTransactionalHiveMetastore metastore; + + public SqlStandardAccessControlMetadata(SemiTransactionalHiveMetastore metastore) + { + this.metastore = requireNonNull(metastore, "metastore is null"); + } + + @Override + public void createRole(ConnectorSession session, String role, Optional grantor) + { + checkRoleIsNotReserved(role); + metastore.createRole(role, null); + } + + @Override + public void dropRole(ConnectorSession session, String role) + { + checkRoleIsNotReserved(role); + metastore.dropRole(role); + } + + private static void checkRoleIsNotReserved(String role) + { + // can not change the reserved role, case insensitive + if (RESERVED_ROLES.contains(role.toLowerCase(ENGLISH))) { + throw new PrestoException(ALREADY_EXISTS, "Role name cannot be one of the reserved roles (case insensitive): " + RESERVED_ROLES); + } + } + + @Override + public Set listRoles(ConnectorSession session) + { + return ImmutableSet.copyOf(metastore.listRoles()); + } + + @Override + public Set listRoleGrants(ConnectorSession session, HivePrincipal principal) + { + return ImmutableSet.copyOf(metastore.listRoleGrants(principal)); + } + + @Override + public void grantRoles(ConnectorSession session, Set roles, Set grantees, boolean withAdminOption, Optional grantor) + { + metastore.grantRoles(roles, grantees, withAdminOption, grantor.orElse(new HivePrincipal(USER, session.getUser()))); + } + + @Override + public void revokeRoles(ConnectorSession session, Set roles, Set grantees, boolean adminOptionFor, Optional grantor) + { + metastore.revokeRoles(roles, grantees, adminOptionFor, 
grantor.orElse(new HivePrincipal(USER, session.getUser()))); + } + + @Override + public Set listApplicableRoles(ConnectorSession session, HivePrincipal principal) + { + return ThriftMetastoreUtil.listApplicableRoles(principal, metastore::listRoleGrants) + .collect(toImmutableSet()); + } + + @Override + public Set listEnabledRoles(ConnectorSession session) + { + return ThriftMetastoreUtil.listEnabledRoles(session.getIdentity(), metastore::listRoleGrants) + .collect(toImmutableSet()); + } + + @Override + public void grantTablePrivileges(ConnectorSession session, SchemaTableName schemaTableName, Set privileges, HivePrincipal grantee, boolean grantOption) + { + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + + Set hivePrivilegeInfos = privileges.stream() + .map(privilege -> new HivePrivilegeInfo(toHivePrivilege(privilege), grantOption, new HivePrincipal(USER, session.getUser()), new HivePrincipal(USER, session.getUser()))) + .collect(toSet()); + + metastore.grantTablePrivileges(schemaName, tableName, grantee, hivePrivilegeInfos); + } + + @Override + public void revokeTablePrivileges(ConnectorSession session, SchemaTableName schemaTableName, Set privileges, HivePrincipal grantee, boolean grantOption) + { + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + + Set hivePrivilegeInfos = privileges.stream() + .map(privilege -> new HivePrivilegeInfo(toHivePrivilege(privilege), grantOption, new HivePrincipal(USER, session.getUser()), new HivePrincipal(USER, session.getUser()))) + .collect(toSet()); + + metastore.revokeTablePrivileges(schemaName, tableName, grantee, hivePrivilegeInfos); + } + + @Override + public List listTablePrivileges(ConnectorSession session, List tableNames) + { + Set principals = listEnabledPrincipals(metastore, session.getIdentity()) + .collect(toImmutableSet()); + boolean isAdminRoleSet = hasAdminRole(principals); + ImmutableList.Builder result = ImmutableList.builder(); + for (SchemaTableName tableName : tableNames) { + if (isAdminRoleSet) { + result.addAll(buildGrants(tableName, null)); + } + else { + for (HivePrincipal grantee : principals) { + result.addAll(buildGrants(tableName, grantee)); + } + } + } + return result.build(); + } + + private List buildGrants(SchemaTableName tableName, HivePrincipal principal) + { + ImmutableList.Builder result = ImmutableList.builder(); + Set hivePrivileges = metastore.listTablePrivileges(tableName.getSchemaName(), tableName.getTableName(), principal); + for (HivePrivilegeInfo hivePrivilege : hivePrivileges) { + Set prestoPrivileges = hivePrivilege.toPrivilegeInfo(); + for (PrivilegeInfo prestoPrivilege : prestoPrivileges) { + GrantInfo grant = new GrantInfo( + prestoPrivilege, + hivePrivilege.getGrantee().toPrestoPrincipal(), + tableName, + Optional.of(hivePrivilege.getGrantor().toPrestoPrincipal()), + Optional.empty()); + result.add(grant); + } + } + return result.build(); + } + + private static boolean hasAdminRole(Set roles) + { + return roles.stream().anyMatch(principal -> principal.getName().equalsIgnoreCase(SqlStandardAccessControl.ADMIN_ROLE_NAME)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardSecurityModule.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardSecurityModule.java new file mode 100644 index 00000000..6cf6165b --- /dev/null +++ 
b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SqlStandardSecurityModule.java @@ -0,0 +1,74 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import com.google.inject.Binder; +import com.google.inject.Module; +import com.google.inject.Scopes; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorAccessControl; + +import static io.prestosql.plugin.hive.security.SecurityConstants.WHITE_LIST_SQLSTANDARDACCESSCONTROL_IMPL; + +public class SqlStandardSecurityModule + implements Module +{ + private static final Logger log = Logger.get(SqlStandardSecurityModule.class); + + private String sqlStandardAccessControlImp; + + public SqlStandardSecurityModule(String sqlStandardAccessControlImp) + { + this.sqlStandardAccessControlImp = sqlStandardAccessControlImp; + } + + @Override + public void configure(Binder binder) + { + if (sqlStandardAccessControlImp.isEmpty()) { + binder.bind(ConnectorAccessControl.class).to(SqlStandardAccessControl.class).in(Scopes.SINGLETON); + } + else { + try { + if (!WHITE_LIST_SQLSTANDARDACCESSCONTROL_IMPL.contains(sqlStandardAccessControlImp)) { + throw new PrestoException(HiveErrorCode.HIVE_FILE_NOT_FOUND, "Found illegal class when binding ConnectorAccessControl."); + } + log.info("Binding ConnectorAccessControl.class to %s", sqlStandardAccessControlImp); + binder.bind(ConnectorAccessControl.class) + .to((Class) Class.forName(this.sqlStandardAccessControlImp)) + .in(Scopes.SINGLETON); + } + catch (ClassNotFoundException e) { + log.error("Failed to bind ConnectorAccessControl to a specified class. 
Error: %s", e.getLocalizedMessage()); + throw new PrestoException(HiveErrorCode.HIVE_FILE_NOT_FOUND, "Class not found when binding ConnectorAccessControl."); + } + } + binder.bind(AccessControlMetadataFactory.class).to(SqlStandardAccessControlMetadataFactory.class); + } + + private static final class SqlStandardAccessControlMetadataFactory + implements AccessControlMetadataFactory + { + public SqlStandardAccessControlMetadataFactory() {} + + @Override + public AccessControlMetadata create(SemiTransactionalHiveMetastore metastore) + { + return new SqlStandardAccessControlMetadata(metastore); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SystemTableAwareAccessControl.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SystemTableAwareAccessControl.java new file mode 100644 index 00000000..30fd8e79 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/security/SystemTableAwareAccessControl.java @@ -0,0 +1,108 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive.security; + +import io.prestosql.plugin.base.security.ForwardingConnectorAccessControl; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorAccessControl; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.security.AccessDeniedException; +import io.prestosql.spi.security.ConnectorIdentity; +import io.prestosql.spi.security.Identity; +import io.prestosql.spi.security.ViewExpression; +import io.prestosql.spi.type.Type; + +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import static io.prestosql.plugin.hive.HiveMetadata.getSourceTableNameFromSystemTable; +import static io.prestosql.spi.security.AccessDeniedException.denySelectTable; +import static io.prestosql.spi.security.AccessDeniedException.denyShowColumnsMetadata; +import static java.util.Objects.requireNonNull; + +public class SystemTableAwareAccessControl + extends ForwardingConnectorAccessControl +{ + private final ConnectorAccessControl delegate; + + public SystemTableAwareAccessControl(ConnectorAccessControl delegate) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + protected ConnectorAccessControl delegate() + { + return delegate; + } + + @Override + public void checkCanShowColumnsMetadata(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName) + { + Optional sourceTableName = getSourceTableNameFromSystemTable(tableName); + if (sourceTableName.isPresent()) { + try { + checkCanShowColumnsMetadata(transactionHandle, identity, sourceTableName.get()); + return; + } + catch (AccessDeniedException e) { + denyShowColumnsMetadata(tableName.toString()); + } + } + + 
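// Not a system table: fall through to the wrapped access control.
 +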
delegate.checkCanShowColumnsMetadata(transactionHandle, identity, tableName); + } + + @Override + public List filterColumns(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName, List columns) + { + Optional sourceTableName = getSourceTableNameFromSystemTable(tableName); + if (sourceTableName.isPresent()) { + return filterColumns(transactionHandle, identity, sourceTableName.get(), columns); + } + return delegate.filterColumns(transactionHandle, identity, tableName, columns); + } + + @Override + public void checkCanSelectFromColumns(ConnectorTransactionHandle transactionHandle, ConnectorIdentity identity, SchemaTableName tableName, Set columnNames) + { + Optional sourceTableName = getSourceTableNameFromSystemTable(tableName); + if (sourceTableName.isPresent()) { + try { + checkCanSelectFromColumns(transactionHandle, identity, sourceTableName.get(), columnNames); + return; + } + catch (AccessDeniedException e) { + denySelectTable(tableName.toString()); + } + } + + delegate.checkCanSelectFromColumns(transactionHandle, identity, tableName, columnNames); + } + + @Override + public Optional getRowFilter(ConnectorTransactionHandle transactionHandle, Identity identity, SchemaTableName tableName) + { + return Optional.empty(); + } + + @Override + public Optional getColumnMask(ConnectorTransactionHandle transactionHandle, Identity identity, SchemaTableName tableName, String columnName, Type type) + { + return Optional.empty(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/HiveStatisticsProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/HiveStatisticsProvider.java new file mode 100644 index 00000000..6198e01a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/HiveStatisticsProvider.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.statistics; + +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.type.Type; + +import java.util.List; +import java.util.Map; + +public interface HiveStatisticsProvider +{ + /** + * @param columns must be Hive columns, not hidden (Presto-internal) columns + */ + TableStatistics getTableStatistics( + ConnectorSession session, + SchemaTableName schemaTableName, + Map columns, + Map columnTypes, + List partitions, + boolean includeColumnStatistics, + Table table); +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/MetastoreHiveStatisticsProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/MetastoreHiveStatisticsProvider.java new file mode 100644 index 00000000..9cfe135b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/MetastoreHiveStatisticsProvider.java @@ -0,0 +1,935 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.statistics; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.VerifyException; +import com.google.common.collect.ImmutableMap; +import com.google.common.hash.HashFunction; +import com.google.common.primitives.Ints; +import com.google.common.primitives.Shorts; +import com.google.common.primitives.SignedBytes; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.DateStatistics; +import io.prestosql.plugin.hive.metastore.DecimalStatistics; +import io.prestosql.plugin.hive.metastore.DoubleStatistics; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.IntegerStatistics; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.statistics.ColumnStatistics; +import io.prestosql.spi.statistics.DoubleRange; +import io.prestosql.spi.statistics.Estimate; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Decimals; +import io.prestosql.spi.type.Type; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.Maps.immutableEntry; +import static com.google.common.hash.Hashing.murmur3_128; +import static io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID; +import static io.prestosql.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize; +import static io.prestosql.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics; +import static io.prestosql.plugin.hive.HiveSessionProperties.isStatisticsEnabled; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.Chars.isCharType; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.Decimals.isLongDecimal; +import static io.prestosql.spi.type.Decimals.isShortDecimal; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static java.lang.Double.isFinite; +import static java.lang.Double.isNaN; 
+import static java.lang.Double.parseDouble; +import static java.lang.Float.intBitsToFloat; +import static java.lang.String.format; +import static java.util.Collections.unmodifiableList; +import static java.util.Objects.requireNonNull; + +public class MetastoreHiveStatisticsProvider + implements HiveStatisticsProvider +{ + private static final Logger log = Logger.get(MetastoreHiveStatisticsProvider.class); + + private final PartitionsStatisticsProvider statisticsProvider; + private static Map statsCache; + private static Map samplePartitionCache; + + public MetastoreHiveStatisticsProvider(SemiTransactionalHiveMetastore metastore, Map statsCache, Map samplePartitionCache) + { + requireNonNull(metastore, "metastore is null"); + this.statsCache = requireNonNull(statsCache, "statsCache is null"); + this.samplePartitionCache = requireNonNull(samplePartitionCache, "samplePartitionCache is null"); + this.statisticsProvider = (session, schemaTableName, hivePartitions, table) -> getPartitionsStatistics(session, metastore, schemaTableName, hivePartitions, table); + } + + @VisibleForTesting + MetastoreHiveStatisticsProvider(PartitionsStatisticsProvider statisticsProvider) + { + this.statisticsProvider = requireNonNull(statisticsProvider, "statisticsProvider is null"); + } + + private static Map getPartitionsStatistics(ConnectorSession session, SemiTransactionalHiveMetastore metastore, SchemaTableName schemaTableName, List hivePartitions, Table table) + { + if (hivePartitions.isEmpty()) { + return ImmutableMap.of(); + } + boolean unpartitioned = hivePartitions.stream().anyMatch(partition -> partition.getPartitionId().equals(UNPARTITIONED_ID)); + if (unpartitioned) { + checkArgument(hivePartitions.size() == 1, "expected only one hive partition"); + return ImmutableMap.of(UNPARTITIONED_ID, metastore.getTableStatistics(new HiveIdentity(session), schemaTableName.getSchemaName(), schemaTableName.getTableName())); + } + Set partitionNames = hivePartitions.stream() + .map(HivePartition::getPartitionId) + .collect(toImmutableSet()); + return metastore.getPartitionStatistics(new HiveIdentity(session), schemaTableName.getSchemaName(), schemaTableName.getTableName(), partitionNames, Optional.of(table)); + } + + @Override + public TableStatistics getTableStatistics( + ConnectorSession session, + SchemaTableName schemaTableName, + Map columns, + Map columnTypes, + List partitions, + boolean includeColumnStatistics, + Table table) + { + if (!isStatisticsEnabled(session)) { + return TableStatistics.empty(); + } + if (partitions.isEmpty()) { + return createZeroStatistics(columns, columnTypes); + } + int sampleSize = getPartitionStatisticsSampleSize(session); + List partitionsSample = null; + SamplePartition sample = samplePartitionCache.get(table); + if (includeColumnStatistics || sample == null || sample.partitionCount != partitions.size()) { + partitionsSample = getPartitionsSample(partitions, sampleSize); + samplePartitionCache.put(table, new SamplePartition(partitions.size(), partitionsSample)); + } + else if (sample != null) { + partitionsSample = sample.partitionsSample; + } + try { + Map statisticsSample = statisticsProvider.getPartitionsStatistics(session, schemaTableName, partitionsSample, table); + if (!includeColumnStatistics) { + OptionalDouble averageRows = calculateAverageRowsPerPartition(statisticsSample.values()); + TableStatistics.Builder result = TableStatistics.builder(); + if (averageRows.isPresent()) { + result.setRowCount(Estimate.of(averageRows.getAsDouble() * partitions.size())); + } + 
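// The row count above is extrapolated from the sampled partitions; the file count and on-disk size below are summed over the sample only.
 +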
result.setFileCount(calulateFileCount(statisticsSample.values())); + result.setOnDiskDataSizeInBytes(calculateTotalOnDiskSizeInBytes(statisticsSample.values())); + return result.build(); + } + else { + validatePartitionStatistics(schemaTableName, statisticsSample); + return getTableStatistics(columns, columnTypes, partitions, statisticsSample); + } + } + catch (PrestoException e) { + if (e.getErrorCode().equals(HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode()) && isIgnoreCorruptedStatistics(session)) { + log.error(e); + return TableStatistics.empty(); + } + throw e; + } + } + + public class SamplePartition + { + long partitionCount; + List partitionsSample; + + public SamplePartition(long partitionCount, List partitionsSample) + { + this.partitionCount = partitionCount; + this.partitionsSample = partitionsSample; + } + } + + private TableStatistics createZeroStatistics(Map columns, Map columnTypes) + { + TableStatistics.Builder result = TableStatistics.builder(); + result.setRowCount(Estimate.of(0)); + columns.forEach((columnName, columnHandle) -> { + Type columnType = columnTypes.get(columnName); + verify(columnType != null, "columnType is missing for column: %s", columnName); + ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder(); + columnStatistics.setNullsFraction(Estimate.of(0)); + columnStatistics.setDistinctValuesCount(Estimate.of(0)); + if (hasDataSize(columnType)) { + columnStatistics.setDataSize(Estimate.of(0)); + } + result.setColumnStatistics(columnHandle, columnStatistics.build()); + }); + return result.build(); + } + + @VisibleForTesting + static List getPartitionsSample(List partitions, int sampleSize) + { + checkArgument(sampleSize > 0, "sampleSize is expected to be greater than zero"); + + if (partitions.size() <= sampleSize) { + return partitions; + } + + List result = new ArrayList<>(); + + int samplesLeft = sampleSize; + + HivePartition min = partitions.get(0); + HivePartition max = partitions.get(0); + for (HivePartition partition : partitions) { + if (partition.getPartitionId().compareTo(min.getPartitionId()) < 0) { + min = partition; + } + else if (partition.getPartitionId().compareTo(max.getPartitionId()) > 0) { + max = partition; + } + } + + result.add(min); + samplesLeft--; + if (samplesLeft > 0) { + result.add(max); + samplesLeft--; + } + + if (samplesLeft > 0) { + HashFunction hashFunction = murmur3_128(); + Comparator> hashComparator = Comparator + ., Long>comparing(Map.Entry::getValue) + .thenComparing(entry -> entry.getKey().getPartitionId()); + partitions.stream() + .filter(partition -> !result.contains(partition)) + .map(partition -> immutableEntry(partition, hashFunction.hashUnencodedChars(partition.getPartitionId()).asLong())) + .sorted(hashComparator) + .limit(samplesLeft) + .forEachOrdered(entry -> result.add(entry.getKey())); + } + + return unmodifiableList(result); + } + + @VisibleForTesting + static void validatePartitionStatistics(SchemaTableName table, Map partitionStatistics) + { + partitionStatistics.forEach((partition, statistics) -> { + HiveBasicStatistics basicStatistics = statistics.getBasicStatistics(); + OptionalLong rowCount = basicStatistics.getRowCount(); + rowCount.ifPresent(count -> checkStatistics(count >= 0, table, partition, "rowCount must be greater than or equal to zero: %s", count)); + basicStatistics.getFileCount().ifPresent(count -> checkStatistics(count >= 0, table, partition, "fileCount must be greater than or equal to zero: %s", count)); + 
basicStatistics.getInMemoryDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, partition, "inMemoryDataSizeInBytes must be greater than or equal to zero: %s", size)); + basicStatistics.getOnDiskDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, partition, "onDiskDataSizeInBytes must be greater than or equal to zero: %s", size)); + statistics.getColumnStatistics().forEach((column, columnStatistics) -> validateColumnStatistics(table, partition, column, rowCount, columnStatistics)); + }); + } + + private static void validateColumnStatistics(SchemaTableName table, String partition, String column, OptionalLong rowCount, HiveColumnStatistics columnStatistics) + { + columnStatistics.getMaxValueSizeInBytes().ifPresent(maxValueSizeInBytes -> + checkStatistics(maxValueSizeInBytes >= 0, table, partition, column, "maxValueSizeInBytes must be greater than or equal to zero: %s", maxValueSizeInBytes)); + columnStatistics.getTotalSizeInBytes().ifPresent(totalSizeInBytes -> + checkStatistics(totalSizeInBytes >= 0, table, partition, column, "totalSizeInBytes must be greater than or equal to zero: %s", totalSizeInBytes)); + columnStatistics.getNullsCount().ifPresent(nullsCount -> { + checkStatistics(nullsCount >= 0, table, partition, column, "nullsCount must be greater than or equal to zero: %s", nullsCount); + if (rowCount.isPresent()) { + checkStatistics( + nullsCount <= rowCount.getAsLong(), + table, + partition, + column, + "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", + nullsCount, + rowCount.getAsLong()); + } + }); + columnStatistics.getDistinctValuesCount().ifPresent(distinctValuesCount -> { + checkStatistics(distinctValuesCount >= 0, table, partition, column, "distinctValuesCount must be greater than or equal to zero: %s", distinctValuesCount); + if (rowCount.isPresent()) { + checkStatistics( + distinctValuesCount <= rowCount.getAsLong(), + table, + partition, + column, + "distinctValuesCount must be less than or equal to rowCount. distinctValuesCount: %s. rowCount: %s.", + distinctValuesCount, + rowCount.getAsLong()); + } + if (rowCount.isPresent() && columnStatistics.getNullsCount().isPresent()) { + long nonNullsCount = rowCount.getAsLong() - columnStatistics.getNullsCount().getAsLong(); + checkStatistics( + distinctValuesCount <= nonNullsCount, + table, + partition, + column, + "distinctValuesCount must be less than or equal to nonNullsCount. distinctValuesCount: %s. nonNullsCount: %s.", + distinctValuesCount, + nonNullsCount); + } + }); + + columnStatistics.getIntegerStatistics().ifPresent(integerStatistics -> { + OptionalLong min = integerStatistics.getMin(); + OptionalLong max = integerStatistics.getMax(); + if (min.isPresent() && max.isPresent()) { + checkStatistics( + min.getAsLong() <= max.getAsLong(), + table, + partition, + column, + "integerStatistics.min must be less than or equal to integerStatistics.max. integerStatistics.min: %s. integerStatistics.max: %s.", + min.getAsLong(), + max.getAsLong()); + } + }); + columnStatistics.getDoubleStatistics().ifPresent(doubleStatistics -> { + OptionalDouble min = doubleStatistics.getMin(); + OptionalDouble max = doubleStatistics.getMax(); + if (min.isPresent() && max.isPresent() && !isNaN(min.getAsDouble()) && !isNaN(max.getAsDouble())) { + checkStatistics( + min.getAsDouble() <= max.getAsDouble(), + table, + partition, + column, + "doubleStatistics.min must be less than or equal to doubleStatistics.max. doubleStatistics.min: %s. 
doubleStatistics.max: %s.", + min.getAsDouble(), + max.getAsDouble()); + } + }); + columnStatistics.getDecimalStatistics().ifPresent(decimalStatistics -> { + Optional min = decimalStatistics.getMin(); + Optional max = decimalStatistics.getMax(); + if (min.isPresent() && max.isPresent()) { + checkStatistics( + min.get().compareTo(max.get()) <= 0, + table, + partition, + column, + "decimalStatistics.min must be less than or equal to decimalStatistics.max. decimalStatistics.min: %s. decimalStatistics.max: %s.", + min.get(), + max.get()); + } + }); + columnStatistics.getDateStatistics().ifPresent(dateStatistics -> { + Optional min = dateStatistics.getMin(); + Optional max = dateStatistics.getMax(); + if (min.isPresent() && max.isPresent()) { + checkStatistics( + min.get().compareTo(max.get()) <= 0, + table, + partition, + column, + "dateStatistics.min must be less than or equal to dateStatistics.max. dateStatistics.min: %s. dateStatistics.max: %s.", + min.get(), + max.get()); + } + }); + columnStatistics.getBooleanStatistics().ifPresent(booleanStatistics -> { + OptionalLong falseCount = booleanStatistics.getFalseCount(); + OptionalLong trueCount = booleanStatistics.getTrueCount(); + falseCount.ifPresent(count -> + checkStatistics(count >= 0, table, partition, column, "falseCount must be greater than or equal to zero: %s", count)); + trueCount.ifPresent(count -> + checkStatistics(count >= 0, table, partition, column, "trueCount must be greater than or equal to zero: %s", count)); + if (rowCount.isPresent() && falseCount.isPresent()) { + checkStatistics( + falseCount.getAsLong() <= rowCount.getAsLong(), + table, + partition, + column, + "booleanStatistics.falseCount must be less than or equal to rowCount. booleanStatistics.falseCount: %s. rowCount: %s.", + falseCount.getAsLong(), + rowCount.getAsLong()); + } + if (rowCount.isPresent() && trueCount.isPresent()) { + checkStatistics( + trueCount.getAsLong() <= rowCount.getAsLong(), + table, + partition, + column, + "booleanStatistics.trueCount must be less than or equal to rowCount. booleanStatistics.trueCount: %s. rowCount: %s.", + trueCount.getAsLong(), + rowCount.getAsLong()); + } + }); + } + + private static void checkStatistics(boolean expression, SchemaTableName table, String partition, String column, String message, Object... args) + { + if (!expression) { + throw new PrestoException( + HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS, + format("Corrupted partition statistics (Table: %s Partition: [%s] Column: %s): %s", table, partition, column, format(message, args))); + } + } + + private static void checkStatistics(boolean expression, SchemaTableName table, String partition, String message, Object... 
args) + { + if (!expression) { + throw new PrestoException( + HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS, + format("Corrupted partition statistics (Table: %s Partition: [%s]): %s", table, partition, format(message, args))); + } + } + + private static TableStatistics getTableStatistics( + Map columns, + Map columnTypes, + List partitions, + Map statistics) + { + if (statistics.isEmpty()) { + return TableStatistics.empty(); + } + + checkArgument(!partitions.isEmpty(), "partitions is empty"); + + OptionalDouble optionalAverageRowsPerPartition = calculateAverageRowsPerPartition(statistics.values()); + if (!optionalAverageRowsPerPartition.isPresent()) { + return TableStatistics.empty(); + } + double averageRowsPerPartition = optionalAverageRowsPerPartition.getAsDouble(); + verify(averageRowsPerPartition >= 0, "averageRowsPerPartition must be greater than or equal to zero"); + int queriedPartitionsCount = partitions.size(); + double rowCount = averageRowsPerPartition * queriedPartitionsCount; + + TableStatistics.Builder result = TableStatistics.builder(); + long fileCount = calulateFileCount(statistics.values()); + long totalOnDiskSize = calculateTotalOnDiskSizeInBytes(statistics.values()); + result.setRowCount(Estimate.of(rowCount)); + result.setFileCount(fileCount); + result.setOnDiskDataSizeInBytes(totalOnDiskSize); + for (Map.Entry column : columns.entrySet()) { + String columnName = column.getKey(); + HiveColumnHandle columnHandle = (HiveColumnHandle) column.getValue(); + Type columnType = columnTypes.get(columnName); + ColumnStatistics columnStatistics; + TableColumnStatistics tableColumnStatistics; + if (columnHandle.isPartitionKey()) { + tableColumnStatistics = statsCache.get(partitions.get(0).getTableName().getTableName() + columnName); + if (tableColumnStatistics == null || invalidateStatsCache(tableColumnStatistics, Estimate.of(rowCount), fileCount, totalOnDiskSize)) { + columnStatistics = createPartitionColumnStatistics(columnHandle, columnType, partitions, statistics, averageRowsPerPartition, rowCount); + TableStatistics tableStatistics = new TableStatistics(Estimate.of(rowCount), fileCount, totalOnDiskSize, ImmutableMap.of()); + tableColumnStatistics = new TableColumnStatistics(tableStatistics, columnStatistics); + statsCache.put(partitions.get(0).getTableName().getTableName() + columnName, tableColumnStatistics); + } + else { + columnStatistics = tableColumnStatistics.columnStatistics; + } + } + else { + columnStatistics = createDataColumnStatistics(columnName, columnType, rowCount, statistics.values()); + } + result.setColumnStatistics(columnHandle, columnStatistics); + } + return result.build(); + } + + private static boolean invalidateStatsCache(TableColumnStatistics tableColumnStatistics, Estimate rowCount, long fileCount, long totalOnDisk) + { + if (tableColumnStatistics.tableStatistics.getOnDiskDataSizeInBytes() != totalOnDisk + || tableColumnStatistics.tableStatistics.getFileCount() != fileCount + || !tableColumnStatistics.tableStatistics.getRowCount().equals(rowCount)) { + return true; + } + return false; + } + + @VisibleForTesting + static OptionalDouble calculateAverageRowsPerPartition(Collection statistics) + { + return statistics.stream() + .map(PartitionStatistics::getBasicStatistics) + .map(HiveBasicStatistics::getRowCount) + .filter(OptionalLong::isPresent) + .mapToLong(OptionalLong::getAsLong) + .peek(count -> verify(count >= 0, "count must be greater than or equal to zero")) + .average(); + } + + static long calulateFileCount(Collection statistics) + { + 
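+ // Sum the file counts reported by each partition's basic statistics; partitions that do not report a file count are skipped.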
return statistics.stream() + .map(PartitionStatistics::getBasicStatistics) + .map(HiveBasicStatistics::getFileCount) + .filter(OptionalLong::isPresent) + .mapToLong(OptionalLong::getAsLong) + .sum(); + } + + static long calculateTotalOnDiskSizeInBytes(Collection statistics) + { + return statistics.stream() + .map(PartitionStatistics::getBasicStatistics) + .map(HiveBasicStatistics::getOnDiskDataSizeInBytes) + .filter(OptionalLong::isPresent) + .mapToLong(OptionalLong::getAsLong) + .sum(); + } + + private static ColumnStatistics createPartitionColumnStatistics( + HiveColumnHandle column, + Type type, + List partitions, + Map statistics, + double averageRowsPerPartition, + double rowCount) + { + List nonEmptyPartitions = partitions.stream() + .filter(partition -> getPartitionRowCount(partition.getPartitionId(), statistics).orElse(averageRowsPerPartition) != 0) + .collect(toImmutableList()); + + return ColumnStatistics.builder() + .setDistinctValuesCount(Estimate.of(calculateDistinctPartitionKeys(column, nonEmptyPartitions))) + .setNullsFraction(Estimate.of(calculateNullsFractionForPartitioningKey(column, partitions, statistics, averageRowsPerPartition, rowCount))) + .setRange(calculateRangeForPartitioningKey(column, type, nonEmptyPartitions)) + .setDataSize(calculateDataSizeForPartitioningKey(column, type, partitions, statistics, averageRowsPerPartition)) + .build(); + } + + @VisibleForTesting + static long calculateDistinctPartitionKeys( + HiveColumnHandle column, + List partitions) + { + return partitions.stream() + .map(partition -> partition.getKeys().get(column)) + .filter(value -> !value.isNull()) + .distinct() + .count(); + } + + @VisibleForTesting + static double calculateNullsFractionForPartitioningKey( + HiveColumnHandle column, + List partitions, + Map statistics, + double averageRowsPerPartition, + double rowCount) + { + if (rowCount == 0) { + return 0; + } + double estimatedNullsCount = partitions.stream() + .filter(partition -> partition.getKeys().get(column).isNull()) + .map(HivePartition::getPartitionId) + .mapToDouble(partitionName -> getPartitionRowCount(partitionName, statistics).orElse(averageRowsPerPartition)) + .sum(); + return normalizeFraction(estimatedNullsCount / rowCount); + } + + private static double normalizeFraction(double fraction) + { + checkArgument(!isNaN(fraction), "fraction is NaN"); + checkArgument(isFinite(fraction), "fraction must be finite"); + if (fraction < 0) { + return 0; + } + if (fraction > 1) { + return 1; + } + return fraction; + } + + @VisibleForTesting + static Estimate calculateDataSizeForPartitioningKey( + HiveColumnHandle column, + Type type, + List partitions, + Map statistics, + double averageRowsPerPartition) + { + if (!hasDataSize(type)) { + return Estimate.unknown(); + } + double dataSize = 0; + for (HivePartition partition : partitions) { + int length = getSize(partition.getKeys().get(column)); + double rowCount = getPartitionRowCount(partition.getPartitionId(), statistics).orElse(averageRowsPerPartition); + dataSize += length * rowCount; + } + return Estimate.of(dataSize); + } + + private static boolean hasDataSize(Type type) + { + return isVarcharType(type) || isCharType(type); + } + + private static int getSize(NullableValue nullableValue) + { + if (nullableValue.isNull()) { + return 0; + } + Object value = nullableValue.getValue(); + checkArgument(value instanceof Slice, "value is expected to be of Slice type"); + return ((Slice) value).length(); + } + + private static OptionalDouble getPartitionRowCount(String partitionName, Map 
statistics) + { + PartitionStatistics partitionStatistics = statistics.get(partitionName); + if (partitionStatistics == null) { + return OptionalDouble.empty(); + } + OptionalLong rowCount = partitionStatistics.getBasicStatistics().getRowCount(); + if (rowCount.isPresent()) { + verify(rowCount.getAsLong() >= 0, "rowCount must be greater than or equal to zero"); + return OptionalDouble.of(rowCount.getAsLong()); + } + return OptionalDouble.empty(); + } + + @VisibleForTesting + static Optional calculateRangeForPartitioningKey(HiveColumnHandle column, Type type, List partitions) + { + if (!isRangeSupported(type)) { + return Optional.empty(); + } + + List values = partitions.stream() + .map(HivePartition::getKeys) + .map(keys -> keys.get(column)) + .filter(value -> !value.isNull()) + .map(NullableValue::getValue) + .map(value -> convertPartitionValueToDouble(type, value)) + .collect(toImmutableList()); + + if (values.isEmpty()) { + return Optional.empty(); + } + + double min = values.get(0); + double max = values.get(0); + + for (Double value : values) { + if (value > max) { + max = value; + } + if (value < min) { + min = value; + } + } + + return Optional.of(new DoubleRange(min, max)); + } + + @VisibleForTesting + static double convertPartitionValueToDouble(Type type, Object value) + { + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + return (Long) value; + } + if (type.equals(DOUBLE)) { + return (Double) value; + } + if (type.equals(REAL)) { + return intBitsToFloat(((Long) value).intValue()); + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + if (isShortDecimal(decimalType)) { + return parseDouble(Decimals.toString((Long) value, decimalType.getScale())); + } + if (isLongDecimal(decimalType)) { + return parseDouble(Decimals.toString((Slice) value, decimalType.getScale())); + } + throw new IllegalArgumentException("Unexpected decimal type: " + decimalType); + } + if (type.equals(DATE)) { + return (Long) value; + } + throw new IllegalArgumentException("Unexpected type: " + type); + } + + @VisibleForTesting + static ColumnStatistics createDataColumnStatistics(String column, Type type, double rowsCount, Collection partitionStatistics) + { + List columnStatistics = partitionStatistics.stream() + .map(PartitionStatistics::getColumnStatistics) + .map(statistics -> statistics.get(column)) + .filter(Objects::nonNull) + .collect(toImmutableList()); + + if (columnStatistics.isEmpty()) { + return ColumnStatistics.empty(); + } + + return ColumnStatistics.builder() + .setDistinctValuesCount(calculateDistinctValuesCount(columnStatistics)) + .setNullsFraction(calculateNullsFraction(column, partitionStatistics)) + .setDataSize(calculateDataSize(column, partitionStatistics, rowsCount)) + .setRange(calculateRange(type, columnStatistics)) + .build(); + } + + @VisibleForTesting + static Estimate calculateDistinctValuesCount(List columnStatistics) + { + return columnStatistics.stream() + .map(MetastoreHiveStatisticsProvider::getDistinctValuesCount) + .filter(OptionalLong::isPresent) + .map(OptionalLong::getAsLong) + .peek(distinctValuesCount -> verify(distinctValuesCount >= 0, "distinctValuesCount must be greater than or equal to zero")) + .max(Long::compare) + .map(Estimate::of) + .orElse(Estimate.unknown()); + } + + private static OptionalLong getDistinctValuesCount(HiveColumnStatistics statistics) + { + if (statistics.getBooleanStatistics().isPresent() && + 
statistics.getBooleanStatistics().get().getFalseCount().isPresent() && + statistics.getBooleanStatistics().get().getTrueCount().isPresent()) { + long falseCount = statistics.getBooleanStatistics().get().getFalseCount().getAsLong(); + long trueCount = statistics.getBooleanStatistics().get().getTrueCount().getAsLong(); + return OptionalLong.of((falseCount > 0 ? 1 : 0) + (trueCount > 0 ? 1 : 0)); + } + if (statistics.getDistinctValuesCount().isPresent()) { + return statistics.getDistinctValuesCount(); + } + return OptionalLong.empty(); + } + + @VisibleForTesting + static Estimate calculateNullsFraction(String column, Collection partitionStatistics) + { + List statisticsWithKnownRowCountAndNullsCount = partitionStatistics.stream() + .filter(statistics -> { + if (!statistics.getBasicStatistics().getRowCount().isPresent()) { + return false; + } + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + if (columnStatistics == null) { + return false; + } + return columnStatistics.getNullsCount().isPresent(); + }) + .collect(toImmutableList()); + + if (statisticsWithKnownRowCountAndNullsCount.isEmpty()) { + return Estimate.unknown(); + } + + long totalNullsCount = 0; + long totalRowCount = 0; + for (PartitionStatistics statistics : statisticsWithKnownRowCountAndNullsCount) { + long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present")); + verify(rowCount >= 0, "rowCount must be greater than or equal to zero"); + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + verify(columnStatistics != null, "columnStatistics is null"); + long nullsCount = columnStatistics.getNullsCount().orElseThrow(() -> new VerifyException("nullsCount is not present")); + verify(nullsCount >= 0, "nullsCount must be greater than or equal to zero"); + verify(nullsCount <= rowCount, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", nullsCount, rowCount); + totalNullsCount += nullsCount; + totalRowCount += rowCount; + } + + if (totalRowCount == 0) { + return Estimate.zero(); + } + + verify( + totalNullsCount <= totalRowCount, + "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. 
totalRowCount: %s.", + totalNullsCount, + totalRowCount); + return Estimate.of(((double) totalNullsCount) / totalRowCount); + } + + @VisibleForTesting + static Estimate calculateDataSize(String column, Collection partitionStatistics, double totalRowCount) + { + List statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream() + .filter(statistics -> { + if (!statistics.getBasicStatistics().getRowCount().isPresent()) { + return false; + } + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + if (columnStatistics == null) { + return false; + } + return columnStatistics.getTotalSizeInBytes().isPresent(); + }) + .collect(toImmutableList()); + + if (statisticsWithKnownRowCountAndDataSize.isEmpty()) { + return Estimate.unknown(); + } + + long knownRowCount = 0; + long knownDataSize = 0; + for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) { + long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present")); + verify(rowCount >= 0, "rowCount must be greater than or equal to zero"); + HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column); + verify(columnStatistics != null, "columnStatistics is null"); + long dataSize = columnStatistics.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present")); + verify(dataSize >= 0, "dataSize must be greater than or equal to zero"); + knownRowCount += rowCount; + knownDataSize += dataSize; + } + + if (totalRowCount == 0) { + return Estimate.zero(); + } + + if (knownRowCount == 0) { + return Estimate.unknown(); + } + + double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount; + return Estimate.of(averageValueDataSizeInBytes * totalRowCount); + } + + @VisibleForTesting + static Optional calculateRange(Type type, List columnStatistics) + { + if (!isRangeSupported(type)) { + return Optional.empty(); + } + return columnStatistics.stream() + .map(statistics -> createRange(type, statistics)) + .filter(Optional::isPresent) + .map(Optional::get) + .reduce(DoubleRange::union); + } + + private static boolean isRangeSupported(Type type) + { + return type.equals(TINYINT) + || type.equals(SMALLINT) + || type.equals(INTEGER) + || type.equals(BIGINT) + || type.equals(REAL) + || type.equals(DOUBLE) + || type.equals(DATE) + || type instanceof DecimalType; + } + + private static Optional createRange(Type type, HiveColumnStatistics statistics) + { + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + return statistics.getIntegerStatistics().flatMap(integerStatistics -> createIntegerRange(type, integerStatistics)); + } + if (type.equals(DOUBLE) || type.equals(REAL)) { + return statistics.getDoubleStatistics().flatMap(MetastoreHiveStatisticsProvider::createDoubleRange); + } + if (type.equals(DATE)) { + return statistics.getDateStatistics().flatMap(MetastoreHiveStatisticsProvider::createDateRange); + } + if (type instanceof DecimalType) { + return statistics.getDecimalStatistics().flatMap(MetastoreHiveStatisticsProvider::createDecimalRange); + } + throw new IllegalArgumentException("Unexpected type: " + type); + } + + private static Optional createIntegerRange(Type type, IntegerStatistics statistics) + { + if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) { + return Optional.of(createIntegerRange(type, statistics.getMin().getAsLong(), statistics.getMax().getAsLong())); + } + return 
Optional.empty(); + } + + private static DoubleRange createIntegerRange(Type type, long min, long max) + { + return new DoubleRange(normalizeIntegerValue(type, min), normalizeIntegerValue(type, max)); + } + + private static long normalizeIntegerValue(Type type, long value) + { + if (type.equals(BIGINT)) { + return value; + } + if (type.equals(INTEGER)) { + return Ints.saturatedCast(value); + } + if (type.equals(SMALLINT)) { + return Shorts.saturatedCast(value); + } + if (type.equals(TINYINT)) { + return SignedBytes.saturatedCast(value); + } + throw new IllegalArgumentException("Unexpected type: " + type); + } + + private static Optional createDoubleRange(DoubleStatistics statistics) + { + if (statistics.getMin().isPresent() && statistics.getMax().isPresent() && !isNaN(statistics.getMin().getAsDouble()) && !isNaN(statistics.getMax().getAsDouble())) { + return Optional.of(new DoubleRange(statistics.getMin().getAsDouble(), statistics.getMax().getAsDouble())); + } + return Optional.empty(); + } + + private static Optional createDateRange(DateStatistics statistics) + { + if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) { + return Optional.of(new DoubleRange(statistics.getMin().get().toEpochDay(), statistics.getMax().get().toEpochDay())); + } + return Optional.empty(); + } + + private static Optional createDecimalRange(DecimalStatistics statistics) + { + if (statistics.getMin().isPresent() && statistics.getMax().isPresent()) { + return Optional.of(new DoubleRange(statistics.getMin().get().doubleValue(), statistics.getMax().get().doubleValue())); + } + return Optional.empty(); + } + + @VisibleForTesting + interface PartitionsStatisticsProvider + { + Map getPartitionsStatistics(ConnectorSession session, SchemaTableName schemaTableName, List hivePartitions, Table table); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/TableColumnStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/TableColumnStatistics.java new file mode 100644 index 00000000..43ec1211 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/statistics/TableColumnStatistics.java @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.statistics; + +import io.prestosql.spi.statistics.ColumnStatistics; +import io.prestosql.spi.statistics.TableStatistics; + +public class TableColumnStatistics +{ + TableStatistics tableStatistics; + ColumnStatistics columnStatistics; + + public TableColumnStatistics(TableStatistics tableStatistics, ColumnStatistics columnStatistics) + { + this.tableStatistics = tableStatistics; + this.columnStatistics = columnStatistics; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/AsyncQueue.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/AsyncQueue.java new file mode 100644 index 00000000..cda432d5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/AsyncQueue.java @@ -0,0 +1,242 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.SettableFuture; + +import javax.annotation.concurrent.GuardedBy; +import javax.annotation.concurrent.ThreadSafe; + +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.Executor; +import java.util.function.Function; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.util.concurrent.Futures.immediateFuture; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static java.util.Objects.requireNonNull; + +@ThreadSafe +public class AsyncQueue +{ + private final int targetQueueSize; + + @GuardedBy("this") + private final Queue elements; + // This future is completed when the queue transitions from full to not. But it will be replaced by a new instance of future immediately. + @GuardedBy("this") + private SettableFuture notFullSignal = SettableFuture.create(); + // This future is completed when the queue transitions from empty to not. But it will be replaced by a new instance of future immediately. + @GuardedBy("this") + private SettableFuture notEmptySignal = SettableFuture.create(); + @GuardedBy("this") + private boolean finishing; + @GuardedBy("this") + private int borrowerCount; + + private final Executor executor; + + public AsyncQueue(int targetQueueSize, Executor executor) + { + checkArgument(targetQueueSize >= 1, "targetQueueSize must be at least 1"); + this.targetQueueSize = targetQueueSize; + this.elements = new ArrayDeque<>(targetQueueSize * 2); + this.executor = requireNonNull(executor); + } + + /** + * Returns true if all future attempts to retrieve elements from this queue + * are guaranteed to return empty. 
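+ * The queue is finished only once {@link #finish()} has been called, every borrowed batch has been returned, and all remaining elements have been drained.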
+ */ + public synchronized boolean isFinished() + { + return finishing && borrowerCount == 0 && elements.size() == 0; + } + + public synchronized void finish() + { + if (finishing) { + return; + } + finishing = true; + + signalIfFinishing(); + } + + private synchronized void signalIfFinishing() + { + if (finishing && borrowerCount == 0) { + if (elements.size() == 0) { + notEmptySignal.set(null); + notEmptySignal = SettableFuture.create(); + } + else if (elements.size() >= targetQueueSize) { + notFullSignal.set(null); + notFullSignal = SettableFuture.create(); + } + } + } + + public synchronized ListenableFuture offer(T element) + { + requireNonNull(element); + + if (finishing && borrowerCount == 0) { + return immediateFuture(null); + } + elements.add(element); + int newSize = elements.size(); + if (newSize == 1) { + notEmptySignal.set(null); + notEmptySignal = SettableFuture.create(); + } + if (newSize >= targetQueueSize) { + return notFullSignal; + } + return immediateFuture(null); + } + + public synchronized int size() + { + return elements.size(); + } + + private synchronized List getBatch(int maxSize) + { + int oldSize = elements.size(); + int reduceBy = Math.min(maxSize, oldSize); + if (reduceBy == 0) { + return ImmutableList.of(); + } + List result = new ArrayList<>(reduceBy); + for (int i = 0; i < reduceBy; i++) { + result.add(elements.remove()); + } + // This checks that the queue size changed from above threshold to below. Therefore, writers shall be notified. + if (oldSize >= targetQueueSize && oldSize - reduceBy < targetQueueSize) { + notFullSignal.set(null); + notFullSignal = SettableFuture.create(); + } + return result; + } + + public synchronized ListenableFuture> getBatchAsync(int maxSize) + { + return borrowBatchAsync(maxSize, elements -> new BorrowResult<>(ImmutableList.of(), elements)); + } + + protected synchronized SettableFuture getNotEmptySignal() + { + return notEmptySignal; + } + + /** + * Invoke {@code function} with up to {@code maxSize} elements removed from the head of the queue, + * and insert elements in the return value to the tail of the queue. + *

+ * If no element is currently available, invocation of {@code function} will be deferred until some + * element is available, or no more elements will be. Spurious invocation of {@code function} is + * possible. + *
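+ * For illustration only (a sketch, not part of this contract), a caller holding an assumed {@code AsyncQueue<String> queue} could drain up to 10 elements at a time, re-queue nothing, and receive the number of elements it handled:
+ * <pre>{@code
+ * ListenableFuture<Integer> handled = queue.borrowBatchAsync(10, batch -> {
+ *     // process the borrowed elements here, then return nothing to re-insert
+ *     return new BorrowResult<>(ImmutableList.of(), batch.size());
+ * });
+ * }</pre>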

+ * Insertion through return value of {@code function} will be effective even if {@link #finish()} has been invoked. + * When borrow (of a non-empty list) is ongoing, {@link #isFinished()} will return false. + * If an empty list is supplied to {@code function}, it must not return a result indicating intention + * to insert elements into the queue. + */ + public ListenableFuture borrowBatchAsync(int maxSize, Function, BorrowResult> function) + { + checkArgument(maxSize >= 0, "maxSize must be at least 0"); + + ListenableFuture> borrowedListFuture; + synchronized (this) { + List list = getBatch(maxSize); + if (!list.isEmpty()) { + borrowedListFuture = immediateFuture(list); + borrowerCount++; + } + else if (finishing && borrowerCount == 0) { + borrowedListFuture = immediateFuture(ImmutableList.of()); + } + else { + borrowedListFuture = Futures.transform( + notEmptySignal, + ignored -> { + synchronized (this) { + List batch = getBatch(maxSize); + if (!batch.isEmpty()) { + borrowerCount++; + } + return batch; + } + }, + executor); + } + } + + return Futures.transform( + borrowedListFuture, + elements -> { + // The borrowerCount field was only incremented for non-empty lists. + // Decrements should only happen for non-empty lists. + // When it should, it must always happen even if the caller-supplied function throws. + try { + BorrowResult borrowResult = function.apply(elements); + if (elements.isEmpty()) { + checkArgument(borrowResult.getElementsToInsert().isEmpty(), "Function must not insert anything when no element is borrowed"); + return borrowResult.getResult(); + } + for (T element : borrowResult.getElementsToInsert()) { + offer(element); + } + return borrowResult.getResult(); + } + finally { + if (!elements.isEmpty()) { + synchronized (this) { + borrowerCount--; + signalIfFinishing(); + } + } + } + }, directExecutor()); + } + + public static final class BorrowResult + { + private final List elementsToInsert; + private final R result; + + public BorrowResult(List elementsToInsert, R result) + { + this.elementsToInsert = ImmutableList.copyOf(elementsToInsert); + this.result = result; + } + + public List getElementsToInsert() + { + return elementsToInsert; + } + + public R getResult() + { + return result; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ConfigurationUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ConfigurationUtils.java new file mode 100644 index 00000000..36419d19 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ConfigurationUtils.java @@ -0,0 +1,64 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapred.JobConf; + +import java.util.Map; + +public final class ConfigurationUtils +{ + private static final Configuration INITIAL_CONFIGURATION; + + static { + Configuration.addDefaultResource("hdfs-default.xml"); + Configuration.addDefaultResource("hdfs-site.xml"); + + // must not be transitively reloaded during the future loading of various Hadoop modules + // all the required default resources must be declared above + INITIAL_CONFIGURATION = new Configuration(false); + Configuration defaultConfiguration = new Configuration(); + copy(defaultConfiguration, INITIAL_CONFIGURATION); + } + + private ConfigurationUtils() {} + + public static Configuration getInitialConfiguration() + { + return copy(INITIAL_CONFIGURATION); + } + + public static Configuration copy(Configuration configuration) + { + Configuration copy = new Configuration(false); + copy(configuration, copy); + return copy; + } + + public static void copy(Configuration from, Configuration to) + { + for (Map.Entry entry : from) { + to.set(entry.getKey(), entry.getValue()); + } + } + + public static JobConf toJobConf(Configuration conf) + { + if (conf instanceof JobConf) { + return (JobConf) conf; + } + return new JobConf(conf); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConversionUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConversionUtils.java new file mode 100644 index 00000000..31fad652 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConversionUtils.java @@ -0,0 +1,66 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.mapred.FileSplit; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.prestosql.plugin.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT; + +/** + * Utility class for both extracting customSplitInfo Map from a custom FileSplit and transforming the customSplitInfo back into a FileSplit. 
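+ * <p>
+ * A rough round-trip sketch (illustrative only; {@code split} is assumed to be an existing {@code FileSplit}, and the map type is assumed to be {@code Map<String, String>}):
+ * <pre>{@code
+ * Map<String, String> customSplitInfo = CustomSplitConversionUtils.extractCustomSplitInfo(split);
+ * FileSplit restored = CustomSplitConversionUtils.recreateSplitWithCustomInfo(split, customSplitInfo);
+ * }</pre>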
+ */ +public class CustomSplitConversionUtils +{ + private static final List converters = ImmutableList.of(new HudiRealtimeSplitConverter()); + + private CustomSplitConversionUtils() + { + } + + public static Map extractCustomSplitInfo(FileSplit split) + { + for (CustomSplitConverter converter : converters) { + Optional> customSplitData = converter.extractCustomSplitInfo(split); + if (customSplitData.isPresent()) { + return customSplitData.get(); + } + } + return ImmutableMap.of(); + } + + public static FileSplit recreateSplitWithCustomInfo(FileSplit split, Map customSplitInfo) + { + for (CustomSplitConverter converter : converters) { + Optional fileSplit; + try { + fileSplit = converter.recreateFileSplitWithCustomInfo(split, customSplitInfo); + } + catch (IOException e) { + throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, String.format("Split converter %s failed to create FileSplit.", converter.getClass()), e); + } + if (fileSplit.isPresent()) { + return fileSplit.get(); + } + } + return split; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConverter.java new file mode 100644 index 00000000..357f6b44 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/CustomSplitConverter.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import org.apache.hadoop.mapred.FileSplit; + +import java.io.IOException; +import java.util.Map; +import java.util.Optional; + +/** + * Interface for Split specific implementation of conversion from Split -> customSplitInfo Map and back. + */ +public interface CustomSplitConverter +{ + /** + * This method is expected to return optional.empty() if the FileSplit does not match the split converter. + */ + Optional> extractCustomSplitInfo(FileSplit split); + + /** + * This method is expected to merge the customSplitInfo with split to recreate the custom FileSplit. + * It is expected to return optional.empty() if the customSplitInfo does not match the split converter. + */ + Optional recreateFileSplitWithCustomInfo(FileSplit split, Map customSplitInfo) throws IOException; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/DecimalUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/DecimalUtils.java new file mode 100644 index 00000000..87e12134 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/DecimalUtils.java @@ -0,0 +1,58 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import io.airlift.slice.Slice; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; + +import java.math.BigInteger; + +import static io.prestosql.spi.type.Decimals.encodeUnscaledValue; +import static io.prestosql.spi.type.Decimals.rescale; + +public final class DecimalUtils +{ + private DecimalUtils() {} + + public static long getShortDecimalValue(HiveDecimalWritable writable, int columnScale) + { + byte[] bytes = writable.getInternalStorage(); + long value = getShortDecimalValue(bytes); + value = rescale(value, writable.getScale(), columnScale); + return value; + } + + public static long getShortDecimalValue(byte[] bytes) + { + long value = 0; + if ((bytes[0] & 0x80) != 0) { + for (int i = 0; i < 8 - bytes.length; ++i) { + value |= 0xFFL << (8 * (7 - i)); + } + } + + for (int i = 0; i < bytes.length; i++) { + value |= ((long) bytes[bytes.length - i - 1] & 0xFFL) << (8 * i); + } + + return value; + } + + public static Slice getLongDecimalValue(HiveDecimalWritable writable, int columnScale) + { + BigInteger value = new BigInteger(writable.getInternalStorage()); + value = rescale(value, writable.getScale(), columnScale); + return encodeUnscaledValue(value); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FieldSetterFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FieldSetterFactory.java new file mode 100644 index 00000000..821707a4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FieldSetterFactory.java @@ -0,0 +1,488 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.primitives.Shorts; +import com.google.common.primitives.SignedBytes; +import io.prestosql.plugin.hive.HiveWriteUtils; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.BooleanType; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DateType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.IntegerType; +import io.prestosql.spi.type.RealType; +import io.prestosql.spi.type.SmallintType; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.TinyintType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.VarbinaryType; +import io.prestosql.spi.type.VarcharType; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.joda.time.DateTimeZone; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static io.prestosql.plugin.hive.HiveUtil.isArrayType; +import static io.prestosql.plugin.hive.HiveUtil.isMapType; +import static io.prestosql.plugin.hive.HiveUtil.isRowType; +import static io.prestosql.plugin.hive.HiveWriteUtils.getHiveDecimal; +import static java.lang.Float.intBitsToFloat; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +public final class FieldSetterFactory +{ + private final DateTimeZone timeZone; + + public FieldSetterFactory(DateTimeZone timeZone) + { + this.timeZone = requireNonNull(timeZone, "timeZone is null"); + } + + public FieldSetter create(SettableStructObjectInspector rowInspector, Object row, StructField field, Type type) + { + if (type.equals(BooleanType.BOOLEAN)) { + return new BooleanFieldSetter(rowInspector, row, field); + } + + if (type.equals(BigintType.BIGINT)) { + return new BigintFieldBuilder(rowInspector, row, field); + } + + if (type.equals(IntegerType.INTEGER)) { + return new IntFieldSetter(rowInspector, row, field); + } + + if (type.equals(SmallintType.SMALLINT)) { + return new SmallintFieldSetter(rowInspector, row, field); + } + + if (type.equals(TinyintType.TINYINT)) { + return new TinyintFieldSetter(rowInspector, row, field); + } + + if (type.equals(RealType.REAL)) { + return new FloatFieldSetter(rowInspector, row, field); + } + + if (type.equals(DoubleType.DOUBLE)) { + return new DoubleFieldSetter(rowInspector, row, field); + } + + if (type instanceof VarcharType) { + return new VarcharFieldSetter(rowInspector, row, field, type); + } + + if (type instanceof CharType) { + return new CharFieldSetter(rowInspector, row, field, type); + } + + if (type.equals(VarbinaryType.VARBINARY)) { + return new 
BinaryFieldSetter(rowInspector, row, field); + } + + if (type.equals(DateType.DATE)) { + return new DateFieldSetter(rowInspector, row, field); + } + + if (type.equals(TimestampType.TIMESTAMP)) { + return new TimestampFieldSetter(rowInspector, row, field, timeZone); + } + + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + return new DecimalFieldSetter(rowInspector, row, field, decimalType); + } + + if (isArrayType(type)) { + return new ArrayFieldSetter(rowInspector, row, field, type.getTypeParameters().get(0)); + } + + if (isMapType(type)) { + return new MapFieldSetter(rowInspector, row, field, type.getTypeParameters().get(0), type.getTypeParameters().get(1)); + } + + if (isRowType(type)) { + return new RowFieldSetter(rowInspector, row, field, type.getTypeParameters()); + } + + throw new IllegalArgumentException("unsupported type: " + type); + } + + public abstract static class FieldSetter + { + protected final SettableStructObjectInspector rowInspector; + protected final Object row; + protected final StructField field; + + private FieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + this.rowInspector = requireNonNull(rowInspector, "rowInspector is null"); + this.row = requireNonNull(row, "row is null"); + this.field = requireNonNull(field, "field is null"); + } + + public abstract void setField(Block block, int position); + } + + private static class BooleanFieldSetter + extends FieldSetter + { + private final BooleanWritable value = new BooleanWritable(); + + public BooleanFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(BooleanType.BOOLEAN.getBoolean(block, position)); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class BigintFieldBuilder + extends FieldSetter + { + private final LongWritable value = new LongWritable(); + + public BigintFieldBuilder(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(BigintType.BIGINT.getLong(block, position)); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class IntFieldSetter + extends FieldSetter + { + private final IntWritable value = new IntWritable(); + + public IntFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(toIntExact(IntegerType.INTEGER.getLong(block, position))); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class SmallintFieldSetter + extends FieldSetter + { + private final ShortWritable value = new ShortWritable(); + + public SmallintFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(Shorts.checkedCast(SmallintType.SMALLINT.getLong(block, position))); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class TinyintFieldSetter + extends FieldSetter + { + private final ByteWritable value = new ByteWritable(); + + public TinyintFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + 
super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(SignedBytes.checkedCast(TinyintType.TINYINT.getLong(block, position))); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class DoubleFieldSetter + extends FieldSetter + { + private final DoubleWritable value = new DoubleWritable(); + + public DoubleFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(DoubleType.DOUBLE.getDouble(block, position)); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class FloatFieldSetter + extends FieldSetter + { + private final FloatWritable value = new FloatWritable(); + + public FloatFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(intBitsToFloat((int) RealType.REAL.getLong(block, position))); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class VarcharFieldSetter + extends FieldSetter + { + private final Text value = new Text(); + private final Type type; + + public VarcharFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type type) + { + super(rowInspector, row, field); + this.type = type; + } + + @Override + public void setField(Block block, int position) + { + value.set(type.getSlice(block, position).getBytes()); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class CharFieldSetter + extends FieldSetter + { + private final Text value = new Text(); + private final Type type; + + public CharFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type type) + { + super(rowInspector, row, field); + this.type = type; + } + + @Override + public void setField(Block block, int position) + { + value.set(type.getSlice(block, position).getBytes()); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class BinaryFieldSetter + extends FieldSetter + { + private final BytesWritable value = new BytesWritable(); + + public BinaryFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + byte[] bytes = VarbinaryType.VARBINARY.getSlice(block, position).getBytes(); + value.set(bytes, 0, bytes.length); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class DateFieldSetter + extends FieldSetter + { + private final DateWritableV2 value = new DateWritableV2(); + + public DateFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field) + { + super(rowInspector, row, field); + } + + @Override + public void setField(Block block, int position) + { + value.set(toIntExact(DateType.DATE.getLong(block, position))); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class TimestampFieldSetter + extends FieldSetter + { + private final DateTimeZone timeZone; + private final TimestampWritableV2 value = new TimestampWritableV2(); + + public TimestampFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, DateTimeZone timeZone) + { + super(rowInspector, row, field); + this.timeZone = 
requireNonNull(timeZone, "timeZone is null"); + } + + @Override + public void setField(Block block, int position) + { + long epochMilli = TimestampType.TIMESTAMP.getLong(block, position); + epochMilli = timeZone.convertLocalToUTC(epochMilli, false); + value.set(Timestamp.ofEpochMilli(epochMilli)); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class DecimalFieldSetter + extends FieldSetter + { + private final HiveDecimalWritable value = new HiveDecimalWritable(); + private final DecimalType decimalType; + + public DecimalFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, DecimalType decimalType) + { + super(rowInspector, row, field); + this.decimalType = decimalType; + } + + @Override + public void setField(Block block, int position) + { + value.set(getHiveDecimal(decimalType, block, position)); + rowInspector.setStructFieldData(row, field, value); + } + } + + private static class ArrayFieldSetter + extends FieldSetter + { + private final Type elementType; + + public ArrayFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type elementType) + { + super(rowInspector, row, field); + this.elementType = requireNonNull(elementType, "elementType is null"); + } + + @Override + public void setField(Block block, int position) + { + Block arrayBlock = (Block) block.getObject(position, Block.class); + + List list = new ArrayList<>(arrayBlock.getPositionCount()); + for (int i = 0; i < arrayBlock.getPositionCount(); i++) { + Object element = HiveWriteUtils.getField(elementType, arrayBlock, i); + list.add(element); + } + + rowInspector.setStructFieldData(row, field, list); + } + } + + private static class MapFieldSetter + extends FieldSetter + { + private final Type keyType; + private final Type valueType; + + public MapFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, Type keyType, Type valueType) + { + super(rowInspector, row, field); + this.keyType = requireNonNull(keyType, "keyType is null"); + this.valueType = requireNonNull(valueType, "valueType is null"); + } + + @Override + public void setField(Block block, int position) + { + Block mapBlock = (Block) block.getObject(position, Block.class); + Map map = new HashMap<>(mapBlock.getPositionCount() * 2); + for (int i = 0; i < mapBlock.getPositionCount(); i += 2) { + Object key = HiveWriteUtils.getField(keyType, mapBlock, i); + Object value = HiveWriteUtils.getField(valueType, mapBlock, i + 1); + map.put(key, value); + } + + rowInspector.setStructFieldData(row, field, map); + } + } + + private static class RowFieldSetter + extends FieldSetter + { + private final List fieldTypes; + + public RowFieldSetter(SettableStructObjectInspector rowInspector, Object row, StructField field, List fieldTypes) + { + super(rowInspector, row, field); + this.fieldTypes = ImmutableList.copyOf(fieldTypes); + } + + @Override + public void setField(Block block, int position) + { + Block rowBlock = (Block) block.getObject(position, Block.class); + + // TODO reuse row object and use FieldSetters, like we do at the top level + // Ideally, we'd use the same recursive structure starting from the top, but + // this requires modeling row types in the same way we model table rows + // (multiple blocks vs all fields packed in a single block) + List value = new ArrayList<>(fieldTypes.size()); + for (int i = 0; i < fieldTypes.size(); i++) { + Object element = HiveWriteUtils.getField(fieldTypes.get(i), rowBlock, i); + value.add(element); + } + + 
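+ // Hand the assembled field values to the ObjectInspector as this struct field's data.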
rowInspector.setStructFieldData(row, field, value); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FooterAwareRecordReader.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FooterAwareRecordReader.java new file mode 100644 index 00000000..66a9a4c1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/FooterAwareRecordReader.java @@ -0,0 +1,84 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import org.apache.hadoop.hive.ql.exec.FooterBuffer; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class FooterAwareRecordReader, V extends Writable> + implements RecordReader +{ + private final RecordReader delegate; + private final JobConf job; + private final FooterBuffer footerBuffer = new FooterBuffer(); + + public FooterAwareRecordReader(RecordReader delegate, int footerCount, JobConf job) + throws IOException + { + this.delegate = requireNonNull(delegate, "delegate is null"); + this.job = requireNonNull(job, "job is null"); + + checkArgument(footerCount > 0, "footerCount is expected to be positive"); + + footerBuffer.initializeBuffer(job, delegate, footerCount, delegate.createKey(), delegate.createValue()); + } + + @Override + public boolean next(K key, V value) + throws IOException + { + return footerBuffer.updateBuffer(job, delegate, key, value); + } + + @Override + public K createKey() + { + return delegate.createKey(); + } + + @Override + public V createValue() + { + return delegate.createValue(); + } + + @Override + public long getPos() + throws IOException + { + return delegate.getPos(); + } + + @Override + public void close() + throws IOException + { + delegate.close(); + } + + @Override + public float getProgress() + throws IOException + { + return delegate.getProgress(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV1.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV1.java new file mode 100644 index 00000000..5a94e833 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV1.java @@ -0,0 +1,204 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.primitives.Shorts; +import com.google.common.primitives.SignedBytes; +import io.airlift.slice.Slice; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.type.Chars; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.Double.doubleToLongBits; +import static java.lang.Float.floatToIntBits; +import static java.lang.Float.intBitsToFloat; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; + +public final class HiveBucketingV1 +{ + private HiveBucketingV1() {} + + public static int getBucketHashCode(List types, Page page, int position) + { + return getBucketHashCode(types, page, position, page.getChannelCount()); + } + + public static int getBucketHashCode(List types, Page page, int position, int channelCount) + { + checkArgument(types.size() == channelCount); + int result = 0; + for (int i = 0; i < channelCount; i++) { + int fieldHash = hash(types.get(i), page.getBlock(i), position); + result = result * 31 + fieldHash; + } + return result; + } + + public static int getBucketHashCode(List types, Object[] values) + { + checkArgument(types.size() == values.length); + int result = 0; + for (int i = 0; i < values.length; i++) { + int fieldHash = hash(types.get(i), values[i]); + result = result * 31 + fieldHash; + } + return result; + } + + static int hash(TypeInfo type, Block block, int position) + { + // This function mirrors the behavior of function hashCode in + // HIVE-12025 ba83fd7bff serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java + // https://github.com/apache/hive/blob/ba83fd7bff/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java + + if (block.isNull(position)) { + return 0; + } + + switch (type.getCategory()) { + case PRIMITIVE: + PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) type; + PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory(); + Type prestoType = requireNonNull(HiveType.getPrimitiveType(typeInfo)); + switch (primitiveCategory) { + case BOOLEAN: + return prestoType.getBoolean(block, position) ? 
1 : 0; + case BYTE: + return SignedBytes.checkedCast(prestoType.getLong(block, position)); + case SHORT: + return Shorts.checkedCast(prestoType.getLong(block, position)); + case INT: + return toIntExact(prestoType.getLong(block, position)); + case LONG: + long bigintValue = prestoType.getLong(block, position); + return (int) ((bigintValue >>> 32) ^ bigintValue); + case FLOAT: + // convert to canonical NaN if necessary + return floatToIntBits(intBitsToFloat(toIntExact(prestoType.getLong(block, position)))); + case DOUBLE: + long doubleValue = doubleToLongBits(prestoType.getDouble(block, position)); + return (int) ((doubleValue >>> 32) ^ doubleValue); + case STRING: + return hashBytes(0, prestoType.getSlice(block, position)); + case VARCHAR: + return hashBytes(1, prestoType.getSlice(block, position)); + case CHAR: + return hashBytes(1, Chars.truncateToLengthAndTrimSpaces(prestoType.getSlice(block, position), prestoType)); + case DATE: + // day offset from 1970-01-01 + return toIntExact(prestoType.getLong(block, position)); + default: + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive primitive category: " + primitiveCategory); + } + case LIST: + return hashOfList((ListTypeInfo) type, block.getObject(position, Block.class)); + case MAP: + return hashOfMap((MapTypeInfo) type, block.getObject(position, Block.class)); + default: + // TODO: support more types, e.g. ROW + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory()); + } + } + + private static int hash(TypeInfo type, Object value) + { + if (value == null) { + return 0; + } + + switch (type.getCategory()) { + case PRIMITIVE: + PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) type; + PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory(); + switch (primitiveCategory) { + case BOOLEAN: + return (boolean) value ? 1 : 0; + case BYTE: + return SignedBytes.checkedCast((long) value); + case SHORT: + return Shorts.checkedCast((long) value); + case INT: + return toIntExact((long) value); + case LONG: + long bigintValue = (long) value; + return (int) ((bigintValue >>> 32) ^ bigintValue); + case FLOAT: + // convert to canonical NaN if necessary + return floatToIntBits(intBitsToFloat(toIntExact((long) value))); + case DOUBLE: + long doubleValue = doubleToLongBits((double) value); + return (int) ((doubleValue >>> 32) ^ doubleValue); + case STRING: + return hashBytes(0, (Slice) value); + case VARCHAR: + return hashBytes(1, (Slice) value); + case CHAR: + return hashBytes(1, (Slice) value); + case DATE: + // day offset from 1970-01-01 + return toIntExact((long) value); + default: + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive primitive category: " + primitiveCategory); + } + case LIST: + return hashOfList((ListTypeInfo) type, (Block) value); + case MAP: + return hashOfMap((MapTypeInfo) type, (Block) value); + default: + // TODO: support more types, e.g. 
ROW + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory()); + } + } + + private static int hashOfMap(MapTypeInfo type, Block singleMapBlock) + { + TypeInfo keyTypeInfo = type.getMapKeyTypeInfo(); + TypeInfo valueTypeInfo = type.getMapValueTypeInfo(); + int result = 0; + for (int i = 0; i < singleMapBlock.getPositionCount(); i += 2) { + result += hash(keyTypeInfo, singleMapBlock, i) ^ hash(valueTypeInfo, singleMapBlock, i + 1); + } + return result; + } + + private static int hashOfList(ListTypeInfo type, Block singleListBlock) + { + TypeInfo elementTypeInfo = type.getListElementTypeInfo(); + int result = 0; + for (int i = 0; i < singleListBlock.getPositionCount(); i++) { + result = result * 31 + hash(elementTypeInfo, singleListBlock, i); + } + return result; + } + + private static int hashBytes(int initialValue, Slice bytes) + { + int result = initialValue; + for (int i = 0; i < bytes.length(); i++) { + result = result * 31 + bytes.getByte(i); + } + return result; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV2.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV2.java new file mode 100644 index 00000000..c8b8da82 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveBucketingV2.java @@ -0,0 +1,228 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.primitives.Shorts; +import com.google.common.primitives.SignedBytes; +import io.airlift.slice.Slice; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.type.Chars; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hive.common.util.Murmur3; + +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.Double.doubleToLongBits; +import static java.lang.Double.doubleToRawLongBits; +import static java.lang.Float.floatToIntBits; +import static java.lang.Float.floatToRawIntBits; +import static java.lang.Float.intBitsToFloat; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; + +public final class HiveBucketingV2 +{ + private HiveBucketingV2() {} + + public static int getBucketHashCode(List types, Page page, int position) + { + return getBucketHashCode(types, page, position, page.getChannelCount()); + } + + public static int getBucketHashCode(List types, Page page, int position, int channelCount) + { + checkArgument(types.size() == channelCount); + int result = 0; + for (int i = 0; i < channelCount; i++) { + int fieldHash = hash(types.get(i), page.getBlock(i), position); + result = result * 31 + fieldHash; + } + return result; + } + + public static int getBucketHashCode(List types, Object[] values) + { + checkArgument(types.size() == values.length); + int result = 0; + for (int i = 0; i < values.length; i++) { + int fieldHash = hash(types.get(i), values[i]); + result = result * 31 + fieldHash; + } + return result; + } + + private static int hash(TypeInfo type, Block block, int position) + { + // This function mirrors the behavior of function hashCodeMurmur in + // HIVE-18910 (and following) 7dc47faddb serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java + // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java + + if (block.isNull(position)) { + return 0; + } + + switch (type.getCategory()) { + case PRIMITIVE: + PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) type; + PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory(); + Type prestoType = requireNonNull(HiveType.getPrimitiveType(typeInfo)); + switch (primitiveCategory) { + case BOOLEAN: + return prestoType.getBoolean(block, position) ? 1 : 0; + case BYTE: + return SignedBytes.checkedCast(prestoType.getLong(block, position)); + case SHORT: + return Murmur3.hash32(bytes(Shorts.checkedCast(prestoType.getLong(block, position)))); + case INT: + return Murmur3.hash32(bytes(toIntExact(prestoType.getLong(block, position)))); + case LONG: + return Murmur3.hash32(bytes(prestoType.getLong(block, position))); + case FLOAT: + // convert to canonical NaN if necessary + // Sic! 
we're `floatToIntBits -> cast to float -> floatToRawIntBits` just as it is (implicitly) done in + // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L830 + return Murmur3.hash32(bytes(floatToRawIntBits(floatToIntBits(intBitsToFloat(toIntExact(prestoType.getLong(block, position))))))); + case DOUBLE: + // Sic! we're `doubleToLongBits -> cast to double -> doubleToRawLongBits` just as it is (implicitly) done in + // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L836 + return Murmur3.hash32(bytes(doubleToRawLongBits(doubleToLongBits(prestoType.getDouble(block, position))))); + case STRING: + return Murmur3.hash32(prestoType.getSlice(block, position).getBytes()); + case VARCHAR: + return Murmur3.hash32(prestoType.getSlice(block, position).getBytes()); + case CHAR: + return Murmur3.hash32(Chars.truncateToLengthAndTrimSpaces(prestoType.getSlice(block, position), prestoType).getBytes()); + case DATE: + // day offset from 1970-01-01 + return Murmur3.hash32(bytes(toIntExact(prestoType.getLong(block, position)))); + default: + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive primitive category: " + primitiveCategory); + } + case LIST: + return hashOfList((ListTypeInfo) type, block.getObject(position, Block.class)); + case MAP: + return hashOfMap((MapTypeInfo) type, block.getObject(position, Block.class)); + default: + // TODO: support more types, e.g. ROW + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory()); + } + } + + private static int hash(TypeInfo type, Object value) + { + if (value == null) { + return 0; + } + + switch (type.getCategory()) { + case PRIMITIVE: + PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) type; + PrimitiveCategory primitiveCategory = typeInfo.getPrimitiveCategory(); + switch (primitiveCategory) { + case BOOLEAN: + return (boolean) value ? 1 : 0; + case BYTE: + return SignedBytes.checkedCast((long) value); + case SHORT: + return Murmur3.hash32(bytes(Shorts.checkedCast((long) value))); + case INT: + return Murmur3.hash32(bytes(toIntExact((long) value))); + case LONG: + return Murmur3.hash32(bytes((long) value)); + case FLOAT: + // convert to canonical NaN if necessary + // Sic! we're `floatToIntBits -> cast to float -> floatToRawIntBits` just as it is (implicitly) done in + // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L830 + return Murmur3.hash32(bytes(floatToRawIntBits(floatToIntBits(intBitsToFloat(toIntExact((long) value)))))); + case DOUBLE: + // convert to canonical NaN if necessary + // Sic! 
we're `doubleToLongBits -> cast to double -> doubleToRawLongBits` just as it is (implicitly) done in + // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L836 + return Murmur3.hash32(bytes(doubleToRawLongBits(doubleToLongBits((double) value)))); + case STRING: + return Murmur3.hash32(((Slice) value).getBytes()); + case VARCHAR: + return Murmur3.hash32(((Slice) value).getBytes()); + case CHAR: + return Murmur3.hash32(((Slice) value).getBytes()); + case DATE: + // day offset from 1970-01-01 + return Murmur3.hash32(bytes(toIntExact((long) value))); + default: + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive primitive category: " + primitiveCategory); + } + case LIST: + return hashOfList((ListTypeInfo) type, (Block) value); + case MAP: + return hashOfMap((MapTypeInfo) type, (Block) value); + default: + // TODO: support more types, e.g. ROW + throw new UnsupportedOperationException("Computation of Hive bucket hashCode is not supported for Hive category: " + type.getCategory()); + } + } + + private static int hashOfMap(MapTypeInfo type, Block singleMapBlock) + { + TypeInfo keyTypeInfo = type.getMapKeyTypeInfo(); + TypeInfo valueTypeInfo = type.getMapValueTypeInfo(); + int result = 0; + for (int i = 0; i < singleMapBlock.getPositionCount(); i += 2) { + // Sic! we're hashing map keys with v2 but map values with v1 just as in + // https://github.com/apache/hive/blob/7dc47faddba9f079bbe2698aaa4d8712e7654f87/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java#L903-L904 + result += hash(keyTypeInfo, singleMapBlock, i) ^ HiveBucketingV1.hash(valueTypeInfo, singleMapBlock, i + 1); + } + return result; + } + + private static int hashOfList(ListTypeInfo type, Block singleListBlock) + { + TypeInfo elementTypeInfo = type.getListElementTypeInfo(); + int result = 0; + for (int i = 0; i < singleListBlock.getPositionCount(); i++) { + result = result * 31 + hash(elementTypeInfo, singleListBlock, i); + } + return result; + } + + // big-endian + @SuppressWarnings("NumericCastThatLosesPrecision") + private static byte[] bytes(short value) + { + return new byte[] {(byte) ((value >> 8) & 0xff), (byte) (value & 0xff)}; + } + + // big-endian + @SuppressWarnings("NumericCastThatLosesPrecision") + private static byte[] bytes(int value) + { + return new byte[] {(byte) ((value >> 24) & 0xff), (byte) ((value >> 16) & 0xff), (byte) ((value >> 8) & 0xff), (byte) (value & 0xff)}; + } + + // big-endian + @SuppressWarnings("NumericCastThatLosesPrecision") + private static byte[] bytes(long value) + { + return new byte[] { + (byte) ((value >> 56) & 0xff), (byte) ((value >> 48) & 0xff), (byte) ((value >> 40) & 0xff), (byte) ((value >> 32) & 0xff), + (byte) ((value >> 24) & 0xff), (byte) ((value >> 16) & 0xff), (byte) ((value >> 8) & 0xff), (byte) (value & 0xff)}; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveFileIterator.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveFileIterator.java new file mode 100644 index 00000000..ecaf661e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HiveFileIterator.java @@ -0,0 +1,185 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.Iterators; +import io.airlift.stats.TimeStat; +import io.prestosql.plugin.hive.DirectoryLister; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.NamenodeStats; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.fs.RemoteIterator; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Collections; +import java.util.Deque; +import java.util.Iterator; + +import static java.util.Objects.requireNonNull; + +public class HiveFileIterator + extends AbstractIterator +{ + public enum NestedDirectoryPolicy + { + IGNORED, + RECURSE, + FAIL + } + + private final Deque paths = new ArrayDeque<>(); + private final Table table; + private final FileSystem fileSystem; + private final DirectoryLister directoryLister; + private final NamenodeStats namenodeStats; + private final NestedDirectoryPolicy nestedDirectoryPolicy; + private final PathFilter pathFilter; + + private Iterator remoteIterator = Collections.emptyIterator(); + + public HiveFileIterator( + Table table, + Path path, + FileSystem fileSystem, + DirectoryLister directoryLister, + NamenodeStats namenodeStats, + NestedDirectoryPolicy nestedDirectoryPolicy, + PathFilter pathFilter) + { + paths.addLast(requireNonNull(path, "path is null")); + this.table = requireNonNull(table, "table is null"); + this.fileSystem = requireNonNull(fileSystem, "fileSystem is null"); + this.directoryLister = requireNonNull(directoryLister, "directoryLister is null"); + this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null"); + this.nestedDirectoryPolicy = requireNonNull(nestedDirectoryPolicy, "nestedDirectoryPolicy is null"); + this.pathFilter = requireNonNull(pathFilter, "pathFilter is null"); + } + + @Override + protected LocatedFileStatus computeNext() + { + while (true) { + while (remoteIterator.hasNext()) { + LocatedFileStatus status = getLocatedFileStatus(remoteIterator); + + // Ignore hidden files and directories. Hive ignores files starting with _ and . as well. 
+ String fileName = status.getPath().getName(); + if (fileName.startsWith("_") || fileName.startsWith(".")) { + continue; + } + + if (status.isDirectory()) { + switch (nestedDirectoryPolicy) { + case IGNORED: + continue; + case RECURSE: + paths.add(status.getPath()); + continue; + case FAIL: + throw new NestedDirectoryNotAllowedException(); + } + } + + return status; + } + + if (paths.isEmpty()) { + return endOfData(); + } + remoteIterator = getLocatedFileStatusRemoteIterator(paths.removeFirst(), pathFilter); + } + } + + private Iterator getLocatedFileStatusRemoteIterator(Path path, PathFilter pathFilter) + { + try (TimeStat.BlockTimer ignored = namenodeStats.getListLocatedStatus().time()) { + return Iterators.filter(new FileStatusIterator(table, path, fileSystem, directoryLister, namenodeStats), input -> pathFilter.accept(input.getPath())); + } + } + + private LocatedFileStatus getLocatedFileStatus(Iterator iterator) + { + try (TimeStat.BlockTimer ignored = namenodeStats.getRemoteIteratorNext().time()) { + return iterator.next(); + } + } + + private static class FileStatusIterator + implements Iterator + { + private final Path path; + private final NamenodeStats namenodeStats; + private final RemoteIterator fileStatusIterator; + + private FileStatusIterator(Table table, Path path, FileSystem fileSystem, DirectoryLister directoryLister, NamenodeStats namenodeStats) + { + this.path = path; + this.namenodeStats = namenodeStats; + try { + this.fileStatusIterator = directoryLister.list(fileSystem, table, path); + } + catch (IOException e) { + throw processException(e); + } + } + + @Override + public boolean hasNext() + { + try { + return fileStatusIterator.hasNext(); + } + catch (IOException e) { + throw processException(e); + } + } + + @Override + public LocatedFileStatus next() + { + try { + return fileStatusIterator.next(); + } + catch (IOException e) { + throw processException(e); + } + } + + private PrestoException processException(IOException exception) + { + namenodeStats.getRemoteIteratorNext().recordException(exception); + if (exception instanceof FileNotFoundException) { + return new PrestoException(HiveErrorCode.HIVE_FILE_NOT_FOUND, "Partition location does not exist: " + path); + } + return new PrestoException(HiveErrorCode.HIVE_FILESYSTEM_ERROR, "Failed to list directory: " + path, exception); + } + } + + public static class NestedDirectoryNotAllowedException + extends RuntimeException + { + public NestedDirectoryNotAllowedException() + { + super("Nested sub-directories are not allowed"); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HudiRealtimeSplitConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HudiRealtimeSplitConverter.java new file mode 100644 index 00000000..39c0e68e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/HudiRealtimeSplitConverter.java @@ -0,0 +1,71 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +/** + * HoodieRealtimeFileSplit specific implementation of CustomSplitConverter. + * Extracts customSplitInfo from HoodieRealtimeFileSplit and reconstructs HoodieRealtimeFileSplit from Map. + */ +public class HudiRealtimeSplitConverter + implements CustomSplitConverter +{ + public static final String CUSTOM_SPLIT_CLASS_KEY = "custom_split_class"; + private static final String HUDI_DELTA_FILEPATHS_KEY = "hudi_delta_filepaths"; + private static final String HUDI_BASEPATH_KEY = "hudi_basepath"; + private static final String HUDI_MAX_COMMIT_TIME_KEY = "hudi_max_commit_time"; + + @Override + public Optional> extractCustomSplitInfo(FileSplit split) + { + if (split instanceof HoodieRealtimeFileSplit) { + HoodieRealtimeFileSplit hudiSplit = (HoodieRealtimeFileSplit) split; + Map customSplitInfo = ImmutableMap.builder() + .put(CUSTOM_SPLIT_CLASS_KEY, HoodieRealtimeFileSplit.class.getName()) + .put(HUDI_DELTA_FILEPATHS_KEY, String.join(",", hudiSplit.getDeltaLogPaths())) + .put(HUDI_BASEPATH_KEY, hudiSplit.getBasePath()) + .put(HUDI_MAX_COMMIT_TIME_KEY, hudiSplit.getMaxCommitTime()) + .build(); + return Optional.of(customSplitInfo); + } + return Optional.empty(); + } + + @Override + public Optional recreateFileSplitWithCustomInfo(FileSplit split, Map customSplitInfo) throws IOException + { + String customSplitClass = customSplitInfo.get(CUSTOM_SPLIT_CLASS_KEY); + if (HoodieRealtimeFileSplit.class.getName().equals(customSplitClass)) { + requireNonNull(customSplitInfo.get(HUDI_DELTA_FILEPATHS_KEY), "HUDI_DELTA_FILEPATHS_KEY is missing"); + List deltaLogPaths = Arrays.asList(customSplitInfo.get(HUDI_DELTA_FILEPATHS_KEY).split(",")); + return Optional.of(new HoodieRealtimeFileSplit( + split, + requireNonNull(customSplitInfo.get(HUDI_BASEPATH_KEY), "HUDI_BASEPATH_KEY is missing"), + deltaLogPaths, + requireNonNull(customSplitInfo.get(HUDI_MAX_COMMIT_TIME_KEY), "HUDI_MAX_COMMIT_TIME_KEY is missing"))); + } + return Optional.empty(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCache.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCache.java new file mode 100644 index 00000000..ef1cc6d2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCache.java @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import com.google.inject.Inject; +import io.airlift.log.Logger; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveSplit; +import io.prestosql.spi.HetuConstant; +import io.prestosql.spi.heuristicindex.IndexCacheKey; +import io.prestosql.spi.heuristicindex.IndexClient; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.heuristicindex.IndexNotCreatedException; +import io.prestosql.spi.heuristicindex.IndexRecord; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.service.PropertyService; +import org.apache.hadoop.fs.Path; +import org.eclipse.jetty.util.URIUtil; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; + +import static io.prestosql.spi.HetuConstant.KILOBYTE; + +public class IndexCache +{ + private static final Logger LOG = Logger.get(IndexCache.class); + private static final ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("Hive-IndexCache-pool-%d").setDaemon(true).build(); + protected static final List INDEX_TYPES = ImmutableList.of("MINMAX", "BLOOM", "BITMAP"); + + private static ScheduledExecutorService executor; + + private Long loadDelay; // in millisecond + private LoadingCache> cache; + private List indexRecords; + + @Inject + public IndexCache(CacheLoader loader, IndexClient indexClient) + { + // If the static variables have not been initialized + if (PropertyService.getBooleanProperty(HetuConstant.FILTER_ENABLED)) { + loadDelay = PropertyService.getDurationProperty(HetuConstant.FILTER_CACHE_LOADING_DELAY).toMillis(); + // in millisecond + long refreshRate = Math.max(loadDelay / 2, 5000L); + int numThreads = Math.min(Runtime.getRuntime().availableProcessors(), PropertyService.getLongProperty(HetuConstant.FILTER_CACHE_LOADING_THREADS).intValue()); + executor = Executors.newScheduledThreadPool(numThreads, threadFactory); + CacheBuilder> cacheBuilder = CacheBuilder.newBuilder() + .removalListener(e -> ((List) e.getValue()).forEach(i -> { + try { + i.getIndex().close(); + } + catch (IOException ioException) { + LOG.debug(ioException, "Failed to close index " + i); + } + })) + .expireAfterWrite(PropertyService.getDurationProperty(HetuConstant.FILTER_CACHE_TTL).toMillis(), TimeUnit.MILLISECONDS) + .maximumWeight(PropertyService.getLongProperty(HetuConstant.FILTER_CACHE_MAX_MEMORY)) + .weigher((indexCacheKey, indices) -> { + int memorySize = 0; + for (IndexMetadata indexMetadata : indices) { + // HetuConstant.FILTER_CACHE_MAX_MEMORY is set in KBs + // convert index size to KB + memorySize += (indexMetadata.getIndex().getMemoryUsage() / KILOBYTE); + } + return memorySize; + }); + if (PropertyService.getBooleanProperty(HetuConstant.FILTER_CACHE_SOFT_REFERENCE)) { + cacheBuilder.softValues(); + } + // Refresh cache according to index records in the background. 
Evict index from cache if it's dropped. + executor.scheduleAtFixedRate(() -> { + try { + if (cache.size() > 0) { + // only refresh cache is it's not empty + List newRecords = indexClient.getAllIndexRecords(); + + if (indexRecords != null) { + for (IndexRecord old : indexRecords) { + boolean found = false; + for (IndexRecord now : newRecords) { + if (now.name.equals(old.name)) { + found = true; + if (now.lastModifiedTime != old.lastModifiedTime) { + // index record has been updated. evict + evictFromCache(old); + LOG.debug("Index for {%s} has been evicted from cache because the index has been updated.", old); + } + } + } + // old record is gone. evict from cache + if (!found) { + evictFromCache(old); + LOG.debug("Index for {%s} has been evicted from cache because the index has been dropped.", old); + } + } + } + + indexRecords = newRecords; + } + } + catch (Exception e) { + LOG.debug(e, "Error using index records to refresh cache"); + } + }, loadDelay, refreshRate, TimeUnit.MILLISECONDS); + cache = cacheBuilder.build(loader); + } + } + + // Override the loadDelay, for testing + public IndexCache(CacheLoader> loader, Long loadDelay, IndexClient indexClient) + { + this(loader, indexClient); + this.loadDelay = loadDelay; + } + + public List getIndices(String catalog, String table, HiveSplit hiveSplit, TupleDomain effectivePredicate, List partitions) + { + if (cache == null || catalog == null || table == null || hiveSplit == null || effectivePredicate == null) { + return Collections.emptyList(); + } + + long lastModifiedTime = hiveSplit.getLastModifiedTime(); + Path path = new Path(hiveSplit.getPath()); + + URI pathUri = URI.create(URIUtil.encodePath(path.toString())); + String tableFqn = catalog + "." + table; + + // for each split, load indexes for each predicate (if the predicate contains an indexed column) + List splitIndexes = new LinkedList<>(); + effectivePredicate.getDomains().get().keySet().stream() + // if the domain column is a partition column, skip it + .filter(key -> partitions == null || !partitions.contains(key)) + .map(HiveColumnHandle::getName) + .map(String::toLowerCase) + .forEach(column -> { + // security check required before using values in a Path + // e.g. catalog.schema.table or dc.catalog.schema.table + if (!tableFqn.matches("([\\p{Alnum}_]+\\.){2,3}[\\p{Alnum}_]+")) { + LOG.warn("Invalid table name " + tableFqn); + return; + } + + if (!column.matches("[\\p{Alnum}_]+")) { + LOG.warn("Invalid column name " + column); + return; + } + + for (String indexType : INDEX_TYPES) { + String indexCacheKeyPath = Paths.get(tableFqn, column, indexType, pathUri.getRawPath()).toString(); + IndexCacheKey indexCacheKey = new IndexCacheKey(indexCacheKeyPath, lastModifiedTime); + // check if cache contains the key + List predicateIndexes = cache.getIfPresent(indexCacheKey); + + // if cache didn't contain the key, it has not been loaded, load it asynchronously + if (predicateIndexes == null) { + executor.schedule(() -> { + try { + cache.get(indexCacheKey); + LOG.debug("Loaded index for %s.", indexCacheKeyPath); + } + catch (ExecutionException e) { + if (e.getCause() instanceof IndexNotCreatedException) { + // Do nothing. Index not registered. + } + else if (LOG.isDebugEnabled()) { + LOG.debug(e, "Unable to load index for %s. 
", indexCacheKeyPath); + } + } + }, loadDelay, TimeUnit.MILLISECONDS); + } + else { + // if key was present in cache, we still need to check if the index is validate based on the lastModifiedTime + // the index is only valid if the lastModifiedTime of the split matches the index's lastModifiedTime + for (IndexMetadata index : predicateIndexes) { + if (index.getLastModifiedTime() != lastModifiedTime) { + cache.invalidate(indexCacheKey); + predicateIndexes = Collections.emptyList(); + break; + } + } + + // cache contained the key + splitIndexes.addAll(predicateIndexes); + } + } + }); + + return splitIndexes; + } + + @VisibleForTesting + protected long getCacheSize() + { + return cache.size(); + } + + private void evictFromCache(IndexRecord record) + { + String recordInCacheKey = String.format("%s/%s/%s", record.qualifiedTable, String.join(",", record.columns), record.indexType); + for (IndexCacheKey key : cache.asMap().keySet()) { + if (key.getPath().startsWith(recordInCacheKey)) { + cache.invalidate(key); + } + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCacheLoader.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCacheLoader.java new file mode 100644 index 00000000..b42b2f0e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/IndexCacheLoader.java @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.cache.CacheLoader; +import com.google.inject.Inject; +import io.prestosql.spi.heuristicindex.IndexCacheKey; +import io.prestosql.spi.heuristicindex.IndexClient; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.heuristicindex.IndexNotCreatedException; + +import java.util.List; +import java.util.stream.Collectors; + +import static java.util.Comparator.comparingLong; +import static java.util.Objects.requireNonNull; + +public class IndexCacheLoader + extends CacheLoader> +{ + private final IndexClient indexClient; + + @Inject + public IndexCacheLoader(IndexClient indexClient) + { + this.indexClient = indexClient; + } + + @Override + public List load(IndexCacheKey key) + throws Exception + { + requireNonNull(key); + requireNonNull(indexClient); + + // only load index files if index lastModified matches key lastModified + long lastModified; + + try { + lastModified = indexClient.getLastModifiedTime(key.getPath()); + } + catch (Exception e) { + // no lastModified file found, i.e. 
index doesn't exist + throw new IndexNotCreatedException(); + } + + if (lastModified != key.getLastModifiedTime()) { + throw new Exception("Index files are expired for key " + key); + } + + List indices; + try { + indices = indexClient.readSplitIndex(key.getPath()); + } + catch (Exception e) { + throw new Exception("No valid index files found for key " + key, e); + } + + // null indicates that the index is not registered in index records + if (indices == null) { + throw new IndexNotCreatedException(); + } + + // lastModified file was valid, but no index files for the given types + if (indices.isEmpty()) { + throw new Exception("No index files found for key " + key); + } + + // Sort the indices based on split starting position + return indices.stream() + .sorted(comparingLong(IndexMetadata::getSplitStart)) + .collect(Collectors.toList()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/InternalHiveSplitFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/InternalHiveSplitFactory.java new file mode 100644 index 00000000..8a5b88c0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/InternalHiveSplitFactory.java @@ -0,0 +1,267 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HivePartitionKey; +import io.prestosql.plugin.hive.HiveSplit.BucketConversion; +import io.prestosql.plugin.hive.HiveTypeName; +import io.prestosql.plugin.hive.InternalHiveSplit; +import io.prestosql.plugin.hive.S3SelectPushdown; +import io.prestosql.spi.HostAddress; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputFormat; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.slice.Slices.utf8Slice; +import static io.prestosql.plugin.hive.HiveColumnHandle.isPathColumnHandle; +import static io.prestosql.plugin.hive.HiveUtil.isSplittable; +import static io.prestosql.plugin.hive.util.CustomSplitConversionUtils.extractCustomSplitInfo; +import static java.util.Objects.requireNonNull; + +public class InternalHiveSplitFactory +{ + private final FileSystem fileSystem; + private final String partitionName; + private final InputFormat inputFormat; + private final Properties schema; + private final List partitionKeys; + private final Optional pathDomain; + private final Map columnCoercions; + private final Optional bucketConversion; + private final boolean forceLocalScheduling; + private final boolean s3SelectPushdownEnabled; + + public InternalHiveSplitFactory( + FileSystem fileSystem, + String partitionName, + InputFormat inputFormat, + Properties schema, + List partitionKeys, + TupleDomain effectivePredicate, + Map columnCoercions, + Optional bucketConversion, + boolean forceLocalScheduling, + boolean s3SelectPushdownEnabled) + { + this.fileSystem = requireNonNull(fileSystem, "fileSystem is null"); + this.partitionName = requireNonNull(partitionName, "partitionName is null"); + this.inputFormat = requireNonNull(inputFormat, "inputFormat is null"); + this.schema = requireNonNull(schema, "schema is null"); + this.partitionKeys = requireNonNull(partitionKeys, "partitionKeys is null"); + pathDomain = getPathDomain(requireNonNull(effectivePredicate, "effectivePredicate is null")); + this.columnCoercions = requireNonNull(columnCoercions, "columnCoercions is null"); + this.bucketConversion = requireNonNull(bucketConversion, "bucketConversion is null"); + this.forceLocalScheduling = forceLocalScheduling; + this.s3SelectPushdownEnabled = s3SelectPushdownEnabled; + } + + public String getPartitionName() + { + return partitionName; + } + + public Optional createInternalHiveSplit(LocatedFileStatus status, boolean splittable, Optional deleteDeltaLocations, Optional startRowOffsetOfFile) + { + return createInternalHiveSplit(status, OptionalInt.empty(), splittable, deleteDeltaLocations, startRowOffsetOfFile); + } + + public Optional 
createInternalHiveSplit(LocatedFileStatus status, int bucketNumber, Optional deleteDeltaLocations) + { + return createInternalHiveSplit(status, OptionalInt.of(bucketNumber), false, deleteDeltaLocations, Optional.empty()); + } + + private Optional createInternalHiveSplit(LocatedFileStatus status, OptionalInt bucketNumber, boolean splittable, Optional deleteDeltaLocations, Optional startRowOffsetOfFile) + { + splittable = splittable && isSplittable(inputFormat, fileSystem, status.getPath()); + return createInternalHiveSplit( + status.getPath(), + status.getBlockLocations(), + 0, + status.getLen(), + status.getLen(), + status.getModificationTime(), + bucketNumber, + splittable, + deleteDeltaLocations, + startRowOffsetOfFile, + ImmutableMap.of()); + } + + public Optional createInternalHiveSplit(FileSplit split) + throws IOException + { + FileStatus file = fileSystem.getFileStatus(split.getPath()); + Map customSplitInfo = extractCustomSplitInfo(split); + return createInternalHiveSplit( + split.getPath(), + fileSystem.getFileBlockLocations(file, split.getStart(), split.getLength()), + split.getStart(), + split.getLength(), + file.getLen(), + file.getModificationTime(), + OptionalInt.empty(), + false, + Optional.empty(), + Optional.empty(), + customSplitInfo); + } + + private Optional createInternalHiveSplit( + Path path, + BlockLocation[] blockLocations, + long start, + long length, + long fileSize, + long lastModifiedTime, + OptionalInt bucketNumber, + boolean splittable, + Optional deleteDeltaLocations, + Optional startRowOffsetOfFile, + Map customSplitInfo) + { + String pathString = path.toString(); + if (!pathMatchesPredicate(pathDomain, pathString)) { + return Optional.empty(); + } + + boolean forceLocalScheduling = this.forceLocalScheduling; + + // For empty files, some filesystem (e.g. LocalFileSystem) produce one empty block + // while others (e.g. hdfs.DistributedFileSystem) produces no block. + // Synthesize an empty block if one does not already exist. + if (fileSize == 0 && blockLocations.length == 0) { + blockLocations = new BlockLocation[] {new BlockLocation()}; + // Turn off force local scheduling because hosts list doesn't exist. + forceLocalScheduling = false; + } + + ImmutableList.Builder blockBuilder = ImmutableList.builder(); + for (BlockLocation blockLocation : blockLocations) { + // clamp the block range + long blockStart = Math.max(start, blockLocation.getOffset()); + long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength()); + if (blockStart > blockEnd) { + // block is outside split range + continue; + } + if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) { + // skip zero-width block, except in the special circumstance: slice is empty, and the block covers the empty slice interval. 
+ continue; + } + blockBuilder.add(new InternalHiveSplit.InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation))); + } + List blocks = blockBuilder.build(); + checkBlocks(blocks, start, length); + + if (!splittable) { + // not splittable, use the hosts from the first block if it exists + blocks = ImmutableList.of(new InternalHiveSplit.InternalHiveBlock(start, start + length, blocks.get(0).getAddresses())); + } + + return Optional.of(new InternalHiveSplit( + partitionName, + pathString, + start, + start + length, + fileSize, + lastModifiedTime, + schema, + partitionKeys, + blocks, + bucketNumber, + splittable, + forceLocalScheduling && allBlocksHaveAddress(blocks), + columnCoercions, + bucketConversion, + s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path), + deleteDeltaLocations, + startRowOffsetOfFile, + customSplitInfo)); + } + + private static void checkBlocks(List blocks, long start, long length) + { + checkArgument(length >= 0); + checkArgument(!blocks.isEmpty()); + checkArgument(start == blocks.get(0).getStart()); + checkArgument(start + length == blocks.get(blocks.size() - 1).getEnd()); + for (int i = 1; i < blocks.size(); i++) { + checkArgument(blocks.get(i - 1).getEnd() == blocks.get(i).getStart()); + } + } + + private static boolean allBlocksHaveAddress(Collection blocks) + { + return blocks.stream() + .map(InternalHiveSplit.InternalHiveBlock::getAddresses) + .noneMatch(List::isEmpty); + } + + private static List getHostAddresses(BlockLocation blockLocation) + { + // Hadoop FileSystem returns "localhost" as a default + return Arrays.stream(getBlockHosts(blockLocation)) + .map(HostAddress::fromString) + .filter(address -> !address.getHostText().equals("localhost")) + .collect(toImmutableList()); + } + + private static String[] getBlockHosts(BlockLocation blockLocation) + { + try { + return blockLocation.getHosts(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static Optional getPathDomain(TupleDomain effectivePredicate) + { + return effectivePredicate.getDomains() + .flatMap(domains -> domains.entrySet().stream() + .filter(entry -> isPathColumnHandle(entry.getKey())) + .map(Map.Entry::getValue) + .findFirst()); + } + + private static boolean pathMatchesPredicate(Optional pathDomain, String path) + { + return pathDomain + .map(domain -> domain.includesNullableValue(utf8Slice(path))) + .orElse(true); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/LoggingInvocationHandler.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/LoggingInvocationHandler.java new file mode 100644 index 00000000..fb7242ab --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/LoggingInvocationHandler.java @@ -0,0 +1,161 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.reflect.AbstractInvocationHandler; +import io.airlift.log.Logger; +import io.airlift.parameternames.ParameterNames; +import io.airlift.units.Duration; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.Parameter; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Consumer; +import java.util.stream.IntStream; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.collectingAndThen; +import static java.util.stream.Collectors.joining; + +public class LoggingInvocationHandler + extends AbstractInvocationHandler +{ + private final Object delegate; + private final ParameterNamesProvider parameterNames; + private final Consumer logger; + + public LoggingInvocationHandler(Object delegate, ParameterNamesProvider parameterNames, Consumer logger) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + this.parameterNames = requireNonNull(parameterNames, "parameterNames is null"); + this.logger = requireNonNull(logger, "logger is null"); + } + + @Override + protected Object handleInvocation(Object proxy, Method method, Object[] args) + throws Throwable + { + Object result; + long startNanos = System.nanoTime(); + try { + result = method.invoke(delegate, args); + } + catch (InvocationTargetException e) { + Duration elapsed = Duration.nanosSince(startNanos); + Throwable t = e.getCause(); + logger.accept(format("%s took %s and failed with %s", invocationDescription(method, args), elapsed, t)); + throw t; + } + Duration elapsed = Duration.nanosSince(startNanos); + logger.accept(format("%s succeeded in %s", invocationDescription(method, args), elapsed)); + return result; + } + + private String invocationDescription(Method method, Object[] args) + { + Optional> parameterNames = this.parameterNames.getParameterNames(method); + return "Invocation of " + method.getName() + + IntStream.range(0, args.length) + .mapToObj(i -> { + if (parameterNames.isPresent()) { + return format("%s=%s", parameterNames.get().get(i), formatArgument(args[i])); + } + return formatArgument(args[i]); + }) + .collect(joining(", ", "(", ")")); + } + + private static String formatArgument(Object arg) + { + if (arg instanceof String) { + return "'" + ((String) arg).replace("'", "''") + "'"; + } + return String.valueOf(arg); + } + + public interface ParameterNamesProvider + { + Optional> getParameterNames(Method method); + } + + public static class ReflectiveParameterNamesProvider + implements ParameterNamesProvider + { + @Override + public Optional> getParameterNames(Method method) + { + Parameter[] parameters = method.getParameters(); + if (Arrays.stream(parameters).noneMatch(Parameter::isNamePresent)) { + return Optional.empty(); + } + return Arrays.stream(parameters) + .map(Parameter::getName) + .collect(collectingAndThen(toImmutableList(), Optional::of)); + } + } + + public static class AirliftParameterNamesProvider + implements ParameterNamesProvider + { + private static final Logger log = Logger.get(AirliftParameterNamesProvider.class); + + private final Map> parameterNames; + + public AirliftParameterNamesProvider(Class interfaceClass, Class implementationClass) + 
{ + requireNonNull(interfaceClass, "interfaceClass is null"); + requireNonNull(implementationClass, "implementationClass is null"); + + ImmutableMap.Builder> parameterNames = ImmutableMap.builder(); + for (Method interfaceMethod : interfaceClass.getMethods()) { + tryGetParameterNamesForMethod(interfaceMethod, implementationClass) + .map(ImmutableList::copyOf) + .ifPresent(names -> parameterNames.put(interfaceMethod, names)); + } + this.parameterNames = parameterNames.build(); + } + + private static Optional> tryGetParameterNamesForMethod(Method interfaceMethod, Class implementationClass) + { + Optional> names = ParameterNames.tryGetParameterNames(interfaceMethod); + if (names.isPresent()) { + return names; + } + + Method implementationMethod; + try { + implementationMethod = implementationClass.getMethod(interfaceMethod.getName(), interfaceMethod.getParameterTypes()); + } + catch (NoSuchMethodException e) { + log.debug(e, "Could not find implementation for %s", interfaceMethod); + return Optional.empty(); + } + return ParameterNames.tryGetParameterNames(implementationMethod); + } + + @Override + public Optional> getParameterNames(Method method) + { + return Optional.ofNullable(parameterNames.get(method)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/MergingPageIterator.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/MergingPageIterator.java new file mode 100644 index 00000000..6ed4abe9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/MergingPageIterator.java @@ -0,0 +1,142 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.type.Type; + +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.Iterators.concat; +import static com.google.common.collect.Iterators.mergeSorted; +import static com.google.common.collect.Iterators.transform; +import static io.prestosql.plugin.hive.util.SortBuffer.appendPositionTo; +import static java.util.Comparator.naturalOrder; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class MergingPageIterator + extends AbstractIterator +{ + private final List types; + private final List sortFields; + private final List sortOrders; + private final PageBuilder pageBuilder; + private final Iterator pagePositions; + + public MergingPageIterator( + Collection> iterators, + List types, + List sortFields, + List sortOrders) + { + requireNonNull(sortFields, "sortFields is null"); + requireNonNull(sortOrders, "sortOrders is null"); + checkArgument(sortFields.size() == sortOrders.size(), "sortFields and sortOrders size must match"); + + this.types = ImmutableList.copyOf(requireNonNull(types, "types is null")); + this.sortFields = ImmutableList.copyOf(sortFields); + this.sortOrders = ImmutableList.copyOf(sortOrders); + this.pageBuilder = new PageBuilder(types); + this.pagePositions = mergeSorted( + iterators.stream() + .map(pages -> concat(transform(pages, PagePositionIterator::new))) + .collect(toList()), + naturalOrder()); + } + + @Override + protected Page computeNext() + { + while (!pageBuilder.isFull() && pagePositions.hasNext()) { + pagePositions.next().appendTo(pageBuilder); + } + + if (pageBuilder.isEmpty()) { + return endOfData(); + } + + Page page = pageBuilder.build(); + pageBuilder.reset(); + return page; + } + + private class PagePositionIterator + extends AbstractIterator + { + private final Page page; + private int position = -1; + + private PagePositionIterator(Page page) + { + this.page = requireNonNull(page, "page is null"); + } + + @Override + protected PagePosition computeNext() + { + position++; + if (position == page.getPositionCount()) { + return endOfData(); + } + return new PagePosition(page, position); + } + } + + @SuppressWarnings("ComparableImplementedButEqualsNotOverridden") + private class PagePosition + implements Comparable + { + private final Page page; + private final int position; + + public PagePosition(Page page, int position) + { + this.page = requireNonNull(page, "page is null"); + this.position = position; + } + + public void appendTo(PageBuilder pageBuilder) + { + appendPositionTo(page, position, pageBuilder); + } + + @Override + public int compareTo(PagePosition other) + { + for (int i = 0; i < sortFields.size(); i++) { + int channel = sortFields.get(i); + SortOrder order = sortOrders.get(i); + Type type = types.get(channel); + + Block block = page.getBlock(channel); + Block otherBlock = other.page.getBlock(channel); + + int result = order.compareBlockValue(type, block, position, otherBlock, other.position); + if (result != 0) { + return result; + } + } + return 0; + } + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/PageSourceUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/PageSourceUtil.java new file mode 100644 index 00000000..8a6d3a3e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/PageSourceUtil.java @@ -0,0 +1,250 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.huawei.boostkit.omnidata.model.AggregationInfo; +import com.huawei.boostkit.omnidata.model.Column; +import com.huawei.boostkit.omnidata.model.Predicate; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HivePartitionKey; +import io.prestosql.plugin.hive.HiveUtil; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.InputReferenceExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.VariableReferenceExpression; +import io.prestosql.spi.relation.VariableToChannelTranslator; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Set; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.Maps.uniqueIndex; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_CLIENT_CERT_PATH; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_CLIENT_PRIVATE_KEY_PATH; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_CRL_PATH; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_SSL_ENABLED; +import static com.huawei.boostkit.omnidata.OmniDataProperty.GRPC_TRUST_CA_PATH; +import static com.huawei.boostkit.omnidata.OmniDataProperty.PKI_DIR; +import static io.prestosql.expressions.LogicalRowExpressions.TRUE_CONSTANT; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.DUMMY_OFFLOADED; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.DUMMY_OFFLOADED_COLUMN_INDEX; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toMap; + +public class 
PageSourceUtil +{ + private PageSourceUtil() {} + + public static List buildColumnsProjections(HiveOffloadExpression expression, + List columns, + Map layoutMap, + Map projectionsLayout) + { + checkArgument(projectionsLayout.isEmpty(), "buildColumnsProjections: input reference error."); + int channel = 0; + ImmutableList.Builder builder = ImmutableList.builder(); + if (!expression.getProjections().isEmpty()) { + for (Map.Entry entry : expression.getProjections().entrySet()) { + RowExpression rowExpression = VariableToChannelTranslator.translate(entry.getValue(), layoutMap); + builder.add(rowExpression); + projectionsLayout.put(new VariableReferenceExpression(entry.getKey().getName(), rowExpression.getType()), channel++); + } + return builder.build(); + } + + Map nameMap = layoutMap.entrySet().stream().collect(toMap(key -> key.getKey().getName(), val -> val.getValue())); + Map typeMap = layoutMap.entrySet().stream().collect( + toMap(key -> key.getKey().getName(), val -> val.getKey().getType())); + Set columnSet = new HashSet<>(); + for (HiveColumnHandle columnHandle : columns) { + if (columnHandle.getHiveColumnIndex() == DUMMY_OFFLOADED_COLUMN_INDEX) { + continue; + } + if (columnSet.add(columnHandle.getName())) { + Type type = typeMap.get(columnHandle.getName()); + InputReferenceExpression inputReferenceExpression = + new InputReferenceExpression(nameMap.get(columnHandle.getName()), type); + projectionsLayout.put(new VariableReferenceExpression(columnHandle.getName(), type), channel++); + builder.add(inputReferenceExpression); + } + } + return builder.build(); + } + + public static List combineDatasourceColumns(List columns, HiveOffloadExpression expression) + { + // May contain duplicate columns + Set nameSet = new HashSet<>(); + ImmutableList.Builder builder = new ImmutableList.Builder<>(); + for (HiveColumnHandle columnHandle : columns) { + if (columnHandle.getColumnType() == DUMMY_OFFLOADED) { + continue; + } + if (nameSet.add(columnHandle.getName())) { + builder.add(columnHandle); + } + } + + for (HiveColumnHandle offload : expression.getOffloadColumns()) { + if (offload.getColumnType() == DUMMY_OFFLOADED) { + continue; + } + if (nameSet.add(offload.getName())) { + builder.add(offload); + } + } + return builder.build(); + } + + public static Map getColumnsLayout(List columnHandles, TypeManager typeManager) + { + ImmutableMap.Builder builder = new ImmutableMap.Builder<>(); + for (int channel = 0; channel < columnHandles.size(); channel++) { + HiveColumnHandle columnHandle = columnHandles.get(channel); + String name = columnHandle.getName(); + Type type = typeManager.getType(columnHandle.getTypeSignature()); + builder.put(new VariableReferenceExpression(name, type), channel); + } + return builder.build(); + } + + public static Optional translateAggregationInfo(Optional aggregationInfo, Map layOut) + { + if (!aggregationInfo.isPresent()) { + return aggregationInfo; + } + + ImmutableMap.Builder functionBuilder = new ImmutableMap.Builder<>(); + for (Map.Entry entry : aggregationInfo.get().getAggregations().entrySet()) { + RowExpression callExpression = VariableToChannelTranslator.translate(entry.getValue().getCall(), layOut); + checkArgument(callExpression instanceof CallExpression); + AggregationInfo.AggregateFunction aggregateFunction = new AggregationInfo.AggregateFunction((CallExpression) callExpression, entry.getValue().isDistinct()); + functionBuilder.put(entry.getKey(), aggregateFunction); + } + ImmutableList.Builder referenceBuilder = new ImmutableList.Builder<>(); + for 
(RowExpression variable : aggregationInfo.get().getGroupingKeys()) { + RowExpression inputReference = VariableToChannelTranslator.translate(variable, layOut); + referenceBuilder.add(inputReference); + } + return Optional.of(new AggregationInfo(functionBuilder.build(), referenceBuilder.build())); + } + + private static Column buildColumn(HiveColumnHandle columnHandle, + TypeManager typeManager, + List partitionKeys, + OptionalInt bucketNumber, + Path path) + { + Type columnType = typeManager.getType(columnHandle.getTypeSignature()); + if (!columnHandle.getColumnType().equals(PARTITION_KEY)) { + return new Column(columnHandle.getHiveColumnIndex(), columnHandle.getColumnName(), columnType); + } + + Map partitionKeysMap = uniqueIndex(partitionKeys, HivePartitionKey::getName); + String prefilledValue = HiveUtil.getPrefilledColumnValue(columnHandle, partitionKeysMap.get(columnHandle.getName()), path, bucketNumber); + Object columnValue = HiveUtil.typedPartitionKey(prefilledValue, columnType, prefilledValue); + return new Column(columnHandle.getHiveColumnIndex(), columnHandle.getColumnName(), columnType, true, columnValue); + } + + public static Predicate buildPushdownContext(List columns, + HiveOffloadExpression expression, + TypeManager typeManager, + TupleDomain effectivePredicate, + List partitionKeys, + OptionalInt bucketNumber, + Path path) + { + // Translate variable reference to input reference because PageFunctionCompiler can only support input reference. + List datasourceColumns = combineDatasourceColumns(columns, expression); + Map datasoureLayout = getColumnsLayout(datasourceColumns, typeManager); + Optional filter = TRUE_CONSTANT.equals(expression.getFilterExpression()) + ? Optional.empty() : Optional.of(VariableToChannelTranslator.translate(expression.getFilterExpression(), datasoureLayout)); + Map projectionsLayout = new HashMap<>(); + List filterProjections = buildColumnsProjections(expression, columns, datasoureLayout, projectionsLayout); + List types = filterProjections.stream().map(RowExpression::getType).collect(Collectors.toList()); + Optional aggregationInfo = translateAggregationInfo(expression.getAggregations(), projectionsLayout); + List pushDownColumns = new ArrayList<>(); + datasourceColumns.forEach( + column -> { + pushDownColumns.add(buildColumn( + column, + typeManager, + partitionKeys, + bucketNumber, + path)); + }); + + Map domains = effectivePredicate.getDomains().get().entrySet() + .stream().collect(toMap(e -> e.getKey().getName(), Map.Entry::getValue)); + + return new Predicate( + types, + pushDownColumns, + filter, + filterProjections, + domains, + ImmutableMap.of(), + aggregationInfo, + expression.getLimit()); + } + + public static void closeWithSuppression(ConnectorPageSource pageSource, Throwable throwable) + { + requireNonNull(throwable, "throwable is null"); + try { + pageSource.close(); + } + catch (RuntimeException | IOException e) { + // Self-suppression not permitted + if (throwable != e) { + throwable.addSuppressed(e); + } + } + } + + public static ImmutableMap getSslConfiguredProperties(HiveConfig hiveConfig) + { + ImmutableMap.Builder properties = new ImmutableMap.Builder<>(); + properties.put(GRPC_SSL_ENABLED, String.valueOf(hiveConfig.isOmniDataSslEnabled())); + if (hiveConfig.isOmniDataSslEnabled()) { + hiveConfig.getOmniDataSslPkiDir().ifPresent(entry -> properties.put(PKI_DIR, entry)); + hiveConfig.getOmniDataSslClientCertFilePath().ifPresent(entry -> properties.put(GRPC_CLIENT_CERT_PATH, entry)); + 
hiveConfig.getOmniDataSslPrivateKeyFilePath().ifPresent(entry -> properties.put(GRPC_CLIENT_PRIVATE_KEY_PATH, entry)); + hiveConfig.getOmniDataSslTrustCertFilePath().ifPresent(entry -> properties.put(GRPC_TRUST_CA_PATH, entry)); + hiveConfig.getOmniDataSslCrlFilePath().ifPresent(entry -> properties.put(GRPC_CRL_PATH, entry)); + } + return properties.build(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTask.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTask.java new file mode 100644 index 00000000..ac5bffb0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTask.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; + +public interface ResumableTask +{ + /** + * Process the task either fully, or in part. + * + * @return a finished status if the task is complete, otherwise includes a continuation future to indicate + * when it should be continued to be processed. + */ + TaskStatus process(); + + class TaskStatus + { + private final boolean finished; + private final ListenableFuture continuationFuture; + + private TaskStatus(boolean finished, ListenableFuture continuationFuture) + { + this.finished = finished; + this.continuationFuture = continuationFuture; + } + + public static TaskStatus finished() + { + return new TaskStatus(true, Futures.immediateFuture(null)); + } + + public static TaskStatus continueOn(ListenableFuture continuationFuture) + { + return new TaskStatus(false, continuationFuture); + } + + public boolean isFinished() + { + return finished; + } + + public ListenableFuture getContinuationFuture() + { + return continuationFuture; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTasks.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTasks.java new file mode 100644 index 00000000..0ec72b6b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ResumableTasks.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import io.airlift.log.Logger; + +import java.util.concurrent.Executor; + +public final class ResumableTasks +{ + private static final Logger log = Logger.get(ResumableTasks.class); + + private ResumableTasks() + { + } + + public static void submit(Executor executor, ResumableTask task) + { + executor.execute(new Runnable() + { + @Override + public void run() + { + ResumableTask.TaskStatus status = safeProcessTask(task); + if (!status.isFinished()) { + // if task is not complete, schedule it to run again when the future finishes + status.getContinuationFuture().addListener(this, executor); + } + } + }); + } + + private static ResumableTask.TaskStatus safeProcessTask(ResumableTask task) + { + try { + return task.process(); + } + catch (Throwable t) { + log.warn(t, "ResumableTask completed exceptionally"); + return ResumableTask.TaskStatus.finished(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/RetryDriver.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/RetryDriver.java new file mode 100644 index 00000000..78cfb1e9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/RetryDriver.java @@ -0,0 +1,170 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import io.airlift.log.Logger; +import io.airlift.units.Duration; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; + +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.SECONDS; + +public class RetryDriver +{ + private static final Logger log = Logger.get(RetryDriver.class); + public static final int DEFAULT_MAX_ATTEMPTS = 10; + public static final Duration DEFAULT_SLEEP_TIME = new Duration(1, SECONDS); + public static final Duration DEFAULT_MAX_RETRY_TIME = new Duration(30, SECONDS); + public static final double DEFAULT_SCALE_FACTOR = 2.0; + + private final int maxAttempts; + private final Duration minSleepTime; + private final Duration maxSleepTime; + private final double scaleFactor; + private final Duration maxRetryTime; + private final List> exceptionWhiteList; + private final Optional retryRunnable; + + private RetryDriver( + int maxAttempts, + Duration minSleepTime, + Duration maxSleepTime, + double scaleFactor, + Duration maxRetryTime, + List> exceptionWhiteList, + Optional retryRunnable) + { + this.maxAttempts = maxAttempts; + this.minSleepTime = minSleepTime; + this.maxSleepTime = maxSleepTime; + this.scaleFactor = scaleFactor; + this.maxRetryTime = maxRetryTime; + this.exceptionWhiteList = exceptionWhiteList; + this.retryRunnable = retryRunnable; + } + + private RetryDriver() + { + this(DEFAULT_MAX_ATTEMPTS, + DEFAULT_SLEEP_TIME, + DEFAULT_SLEEP_TIME, + DEFAULT_SCALE_FACTOR, + DEFAULT_MAX_RETRY_TIME, + ImmutableList.of(), + Optional.empty()); + } + + public static RetryDriver retry() + { + return new RetryDriver(); + } + + public final RetryDriver maxAttempts(int maxAttempts) + { + return new RetryDriver(maxAttempts, minSleepTime, maxSleepTime, scaleFactor, maxRetryTime, exceptionWhiteList, retryRunnable); + } + + public final RetryDriver exponentialBackoff(Duration minSleepTime, Duration maxSleepTime, Duration maxRetryTime, double scaleFactor) + { + return new RetryDriver(maxAttempts, minSleepTime, maxSleepTime, scaleFactor, maxRetryTime, exceptionWhiteList, retryRunnable); + } + + public final RetryDriver onRetry(Runnable retryRunnable) + { + return new RetryDriver(maxAttempts, minSleepTime, maxSleepTime, scaleFactor, maxRetryTime, exceptionWhiteList, Optional.ofNullable(retryRunnable)); + } + + @SafeVarargs + public final RetryDriver stopOn(Class... 
classes) + { + requireNonNull(classes, "classes is null"); + List> exceptions = ImmutableList.>builder() + .addAll(exceptionWhiteList) + .addAll(Arrays.asList(classes)) + .build(); + + return new RetryDriver(maxAttempts, minSleepTime, maxSleepTime, scaleFactor, maxRetryTime, exceptions, retryRunnable); + } + + public RetryDriver stopOnIllegalExceptions() + { + return stopOn(NullPointerException.class, IllegalStateException.class, IllegalArgumentException.class); + } + + public V run(String callableName, Callable callable) + throws Exception + { + requireNonNull(callableName, "callableName is null"); + requireNonNull(callable, "callable is null"); + + List suppressedExceptions = new ArrayList<>(); + long startTime = System.nanoTime(); + int attempt = 0; + while (true) { + attempt++; + + if (attempt > 1) { + retryRunnable.ifPresent(Runnable::run); + } + + try { + return callable.call(); + } + catch (Exception e) { + for (Class clazz : exceptionWhiteList) { + if (clazz.isInstance(e)) { + addSuppressed(e, suppressedExceptions); + throw e; + } + } + if (attempt >= maxAttempts || Duration.nanosSince(startTime).compareTo(maxRetryTime) >= 0) { + addSuppressed(e, suppressedExceptions); + throw e; + } + log.debug("Failed on executing %s with attempt %d, will retry. Exception: %s", callableName, attempt, e.getMessage()); + + suppressedExceptions.add(e); + + int delayInMs = (int) Math.min(minSleepTime.toMillis() * Math.pow(scaleFactor, attempt - 1), maxSleepTime.toMillis()); + int jitter = ThreadLocalRandom.current().nextInt(Math.max(1, (int) (delayInMs * 0.1))); + try { + TimeUnit.MILLISECONDS.sleep(delayInMs + jitter); + } + catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + Exception exception = new RuntimeException(ie); + addSuppressed(exception, suppressedExceptions); + throw exception; + } + } + } + } + + private static void addSuppressed(Exception exception, List suppressedExceptions) + { + for (Throwable suppressedException : suppressedExceptions) { + if (exception != suppressedException) { + exception.addSuppressed(suppressedException); + } + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SerDeUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SerDeUtils.java new file mode 100644 index 00000000..6346b54e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SerDeUtils.java @@ -0,0 +1,293 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.annotations.VisibleForTesting; +import io.airlift.slice.Slices; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.BooleanType; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DateType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.IntegerType; +import io.prestosql.spi.type.RealType; +import io.prestosql.spi.type.SmallintType; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.TinyintType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.lazy.LazyDate; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; + +import java.util.List; +import java.util.Map; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.prestosql.spi.type.Chars.truncateToLengthAndTrimSpaces; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static java.lang.Float.floatToRawIntBits; +import static java.util.Objects.requireNonNull; + +public final class SerDeUtils +{ + private SerDeUtils() {} + + public static Block getBlockObject(Type type, Object object, ObjectInspector objectInspector) + { + Block block = serializeObject(type, null, object, objectInspector); + return requireNonNull(block, "serialized result is null"); + } + + public static Block serializeObject(Type type, BlockBuilder builder, Object object, ObjectInspector inspector) + { + return serializeObject(type, builder, object, inspector, true); + } + + // This version supports optionally 
disabling the filtering of null map key, which should only be used for building test data sets + // that contain null map keys. For production, null map keys are not allowed. + @VisibleForTesting + public static Block serializeObject(Type type, BlockBuilder builder, Object object, ObjectInspector inspector, boolean filterNullMapKeys) + { + switch (inspector.getCategory()) { + case PRIMITIVE: + serializePrimitive(type, builder, object, (PrimitiveObjectInspector) inspector); + return null; + case LIST: + return serializeList(type, builder, object, (ListObjectInspector) inspector); + case MAP: + return serializeMap(type, builder, object, (MapObjectInspector) inspector, filterNullMapKeys); + case STRUCT: + return serializeStruct(type, builder, object, (StructObjectInspector) inspector); + } + throw new RuntimeException("Unknown object inspector category: " + inspector.getCategory()); + } + + private static void serializePrimitive(Type type, BlockBuilder builder, Object object, PrimitiveObjectInspector inspector) + { + requireNonNull(builder, "parent builder is null"); + + if (object == null) { + builder.appendNull(); + return; + } + + switch (inspector.getPrimitiveCategory()) { + case BOOLEAN: + BooleanType.BOOLEAN.writeBoolean(builder, ((BooleanObjectInspector) inspector).get(object)); + return; + case BYTE: + TinyintType.TINYINT.writeLong(builder, ((ByteObjectInspector) inspector).get(object)); + return; + case SHORT: + SmallintType.SMALLINT.writeLong(builder, ((ShortObjectInspector) inspector).get(object)); + return; + case INT: + IntegerType.INTEGER.writeLong(builder, ((IntObjectInspector) inspector).get(object)); + return; + case LONG: + BigintType.BIGINT.writeLong(builder, ((LongObjectInspector) inspector).get(object)); + return; + case FLOAT: + RealType.REAL.writeLong(builder, floatToRawIntBits(((FloatObjectInspector) inspector).get(object))); + return; + case DOUBLE: + DoubleType.DOUBLE.writeDouble(builder, ((DoubleObjectInspector) inspector).get(object)); + return; + case STRING: + type.writeSlice(builder, Slices.utf8Slice(((StringObjectInspector) inspector).getPrimitiveJavaObject(object))); + return; + case VARCHAR: + type.writeSlice(builder, Slices.utf8Slice(((HiveVarcharObjectInspector) inspector).getPrimitiveJavaObject(object).getValue())); + return; + case CHAR: + CharType charType = (CharType) type; + HiveChar hiveChar = ((HiveCharObjectInspector) inspector).getPrimitiveJavaObject(object); + type.writeSlice(builder, truncateToLengthAndTrimSpaces(Slices.utf8Slice(hiveChar.getValue()), charType.getLength())); + return; + case DATE: + DateType.DATE.writeLong(builder, formatDateAsLong(object, (DateObjectInspector) inspector)); + return; + case TIMESTAMP: + TimestampType.TIMESTAMP.writeLong(builder, formatTimestampAsLong(object, (TimestampObjectInspector) inspector)); + return; + case BINARY: + VARBINARY.writeSlice(builder, Slices.wrappedBuffer(((BinaryObjectInspector) inspector).getPrimitiveJavaObject(object))); + return; + case DECIMAL: + DecimalType decimalType = (DecimalType) type; + HiveDecimalWritable hiveDecimal = ((HiveDecimalObjectInspector) inspector).getPrimitiveWritableObject(object); + if (decimalType.isShort()) { + decimalType.writeLong(builder, DecimalUtils.getShortDecimalValue(hiveDecimal, decimalType.getScale())); + } + else { + decimalType.writeSlice(builder, DecimalUtils.getLongDecimalValue(hiveDecimal, decimalType.getScale())); + } + return; + } + throw new RuntimeException("Unknown primitive type: " + inspector.getPrimitiveCategory()); + } + + private static 
Block serializeList(Type type, BlockBuilder builder, Object object, ListObjectInspector inspector) + { + List list = inspector.getList(object); + if (list == null) { + requireNonNull(builder, "parent builder is null").appendNull(); + return null; + } + + List typeParameters = type.getTypeParameters(); + checkArgument(typeParameters.size() == 1, "list must have exactly 1 type parameter"); + Type elementType = typeParameters.get(0); + ObjectInspector elementInspector = inspector.getListElementObjectInspector(); + BlockBuilder currentBuilder; + if (builder != null) { + currentBuilder = builder.beginBlockEntry(); + } + else { + currentBuilder = elementType.createBlockBuilder(null, list.size()); + } + + for (Object element : list) { + serializeObject(elementType, currentBuilder, element, elementInspector); + } + + if (builder != null) { + builder.closeEntry(); + return null; + } + else { + Block resultBlock = currentBuilder.build(); + return resultBlock; + } + } + + private static Block serializeMap(Type type, BlockBuilder builder, Object object, MapObjectInspector inspector, boolean filterNullMapKeys) + { + Map map = inspector.getMap(object); + if (map == null) { + requireNonNull(builder, "parent builder is null").appendNull(); + return null; + } + + List typeParameters = type.getTypeParameters(); + checkArgument(typeParameters.size() == 2, "map must have exactly 2 type parameter"); + Type keyType = typeParameters.get(0); + Type valueType = typeParameters.get(1); + ObjectInspector keyInspector = inspector.getMapKeyObjectInspector(); + ObjectInspector valueInspector = inspector.getMapValueObjectInspector(); + BlockBuilder currentBuilder; + + boolean builderSynthesized = false; + if (builder == null) { + builderSynthesized = true; + builder = type.createBlockBuilder(null, 1); + } + currentBuilder = builder.beginBlockEntry(); + + for (Map.Entry entry : map.entrySet()) { + // Hive skips map entries with null keys + if (!filterNullMapKeys || entry.getKey() != null) { + serializeObject(keyType, currentBuilder, entry.getKey(), keyInspector); + serializeObject(valueType, currentBuilder, entry.getValue(), valueInspector); + } + } + + builder.closeEntry(); + if (builderSynthesized) { + return (Block) type.getObject(builder, 0); + } + else { + return null; + } + } + + private static Block serializeStruct(Type type, BlockBuilder builder, Object object, StructObjectInspector inspector) + { + if (object == null) { + requireNonNull(builder, "parent builder is null").appendNull(); + return null; + } + + List typeParameters = type.getTypeParameters(); + List allStructFieldRefs = inspector.getAllStructFieldRefs(); + checkArgument(typeParameters.size() == allStructFieldRefs.size()); + BlockBuilder currentBuilder; + + boolean builderSynthesized = false; + if (builder == null) { + builderSynthesized = true; + builder = type.createBlockBuilder(null, 1); + } + currentBuilder = builder.beginBlockEntry(); + + for (int i = 0; i < typeParameters.size(); i++) { + StructField field = allStructFieldRefs.get(i); + serializeObject(typeParameters.get(i), currentBuilder, inspector.getStructFieldData(object, field), field.getFieldObjectInspector()); + } + + builder.closeEntry(); + if (builderSynthesized) { + return (Block) type.getObject(builder, 0); + } + else { + return null; + } + } + + @SuppressWarnings("deprecation") + private static long formatDateAsLong(Object object, DateObjectInspector inspector) + { + if (object instanceof LazyDate) { + return ((LazyDate) object).getWritableObject().getDays(); + } + if (object 
instanceof DateWritable) { + return ((DateWritable) object).getDays(); + } + + return inspector.getPrimitiveJavaObject(object).toEpochDay(); + } + + private static long formatTimestampAsLong(Object object, TimestampObjectInspector inspector) + { + if (object instanceof TimestampWritable) { + return ((TimestampWritable) object).getTimestamp().getTime(); + } + return inspector.getPrimitiveJavaObject(object).toEpochMilli(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SortBuffer.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SortBuffer.java new file mode 100644 index 00000000..0440363c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/SortBuffer.java @@ -0,0 +1,138 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.PageSorter; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.type.Type; +import org.openjdk.jol.info.ClassLayout; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Consumer; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static java.lang.Math.addExact; +import static java.util.Objects.requireNonNull; + +public class SortBuffer +{ + private static final int INSTANCE_SIZE = ClassLayout.parseClass(SortBuffer.class).instanceSize(); + + private final long maxMemoryBytes; + private final List types; + private final List sortFields; + private final List sortOrders; + private final PageSorter pageSorter; + private final List pages = new ArrayList<>(); + private final PageBuilder pageBuilder; + + private long usedMemoryBytes; + private int rowCount; + + public SortBuffer( + DataSize maxMemory, + List types, + List sortFields, + List sortOrders, + PageSorter pageSorter) + { + checkArgument(maxMemory.toBytes() > 0, "maxMemory is zero"); + this.maxMemoryBytes = maxMemory.toBytes(); + this.types = requireNonNull(types, "types is null"); + this.sortFields = ImmutableList.copyOf(requireNonNull(sortFields, "sortFields is null")); + this.sortOrders = ImmutableList.copyOf(requireNonNull(sortOrders, "sortOrders is null")); + this.pageSorter = requireNonNull(pageSorter, "pageSorter is null"); + this.pageBuilder = new PageBuilder(types); + } + + public long getRetainedBytes() + { + return INSTANCE_SIZE + usedMemoryBytes; + } + + public boolean isEmpty() + { + return pages.isEmpty(); + } + + public boolean canAdd(Page page) + { + return (usedMemoryBytes < maxMemoryBytes) && + ((((long) rowCount) + 
page.getPositionCount()) <= Integer.MAX_VALUE); + } + + public void add(Page page) + { + checkState(canAdd(page), "page buffer is full"); + pages.add(page); + usedMemoryBytes += page.getRetainedSizeInBytes(); + rowCount = addExact(rowCount, page.getPositionCount()); + } + + public static void appendPositionTo(Page page, int position, PageBuilder pageBuilder) + { + pageBuilder.declarePosition(); + for (int i = 0; i < page.getChannelCount(); i++) { + Type type = pageBuilder.getType(i); + Block block = page.getBlock(i); + BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(i); + type.appendTo(block, position, blockBuilder); + } + } + + public void flushTo(Consumer consumer) + { + checkState(!pages.isEmpty(), "page buffer is empty"); + + long[] addresses = pageSorter.sort(types, pages, sortFields, sortOrders, rowCount); + + int[] pageIndex = new int[addresses.length]; + int[] positionIndex = new int[addresses.length]; + for (int i = 0; i < addresses.length; i++) { + pageIndex[i] = pageSorter.decodePageIndex(addresses[i]); + positionIndex[i] = pageSorter.decodePositionIndex(addresses[i]); + } + + verify(pageBuilder.isEmpty()); + + for (int i = 0; i < pageIndex.length; i++) { + Page page = pages.get(pageIndex[i]); + int position = positionIndex[i]; + appendPositionTo(page, position, pageBuilder); + + if (pageBuilder.isFull()) { + consumer.accept(pageBuilder.build()); + pageBuilder.reset(); + } + } + + if (!pageBuilder.isEmpty()) { + consumer.accept(pageBuilder.build()); + pageBuilder.reset(); + } + + pages.clear(); + rowCount = 0; + usedMemoryBytes = 0; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/Statistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/Statistics.java new file mode 100644 index 00000000..f89a7f12 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/Statistics.java @@ -0,0 +1,456 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.metastore.BooleanStatistics; +import io.prestosql.plugin.hive.metastore.DateStatistics; +import io.prestosql.plugin.hive.metastore.DecimalStatistics; +import io.prestosql.plugin.hive.metastore.DoubleStatistics; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.IntegerStatistics; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.statistics.ColumnStatisticMetadata; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.statistics.ComputedStatistics; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.SqlDate; +import io.prestosql.spi.type.SqlDecimal; +import io.prestosql.spi.type.Type; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Sets.intersection; +import static io.prestosql.plugin.hive.HiveBasicStatistics.createZeroStatistics; +import static io.prestosql.plugin.hive.HiveWriteUtils.createPartitionValues; +import static io.prestosql.plugin.hive.util.Statistics.ReduceOperator.ADD; +import static io.prestosql.plugin.hive.util.Statistics.ReduceOperator.MAX; +import static io.prestosql.plugin.hive.util.Statistics.ReduceOperator.MIN; +import static io.prestosql.spi.statistics.ColumnStatisticType.MAX_VALUE; +import static io.prestosql.spi.statistics.ColumnStatisticType.MAX_VALUE_SIZE_IN_BYTES; +import static io.prestosql.spi.statistics.ColumnStatisticType.MIN_VALUE; +import static io.prestosql.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; +import static io.prestosql.spi.statistics.ColumnStatisticType.NUMBER_OF_NON_NULL_VALUES; +import static io.prestosql.spi.statistics.ColumnStatisticType.NUMBER_OF_TRUE_VALUES; +import static io.prestosql.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +public final class Statistics +{ + private Statistics() {} + + public static PartitionStatistics merge(PartitionStatistics first, PartitionStatistics second) + { + return new PartitionStatistics( + reduce(first.getBasicStatistics(), second.getBasicStatistics(), ADD), + merge(first.getColumnStatistics(), 
second.getColumnStatistics())); + } + + public static HiveBasicStatistics reduce(HiveBasicStatistics first, HiveBasicStatistics second, ReduceOperator operator) + { + return new HiveBasicStatistics( + reduce(first.getFileCount(), second.getFileCount(), operator, false), + reduce(first.getRowCount(), second.getRowCount(), operator, false), + reduce(first.getInMemoryDataSizeInBytes(), second.getInMemoryDataSizeInBytes(), operator, false), + reduce(first.getOnDiskDataSizeInBytes(), second.getOnDiskDataSizeInBytes(), operator, false)); + } + + public static Map merge(Map first, Map second) + { + // only keep columns that have statistics for both sides + Set columns = intersection(first.keySet(), second.keySet()); + return columns.stream() + .collect(toImmutableMap( + column -> column, + column -> merge(first.get(column), second.get(column)))); + } + + public static HiveColumnStatistics merge(HiveColumnStatistics first, HiveColumnStatistics second) + { + return new HiveColumnStatistics( + mergeIntegerStatistics(first.getIntegerStatistics(), second.getIntegerStatistics()), + mergeDoubleStatistics(first.getDoubleStatistics(), second.getDoubleStatistics()), + mergeDecimalStatistics(first.getDecimalStatistics(), second.getDecimalStatistics()), + mergeDateStatistics(first.getDateStatistics(), second.getDateStatistics()), + mergeBooleanStatistics(first.getBooleanStatistics(), second.getBooleanStatistics()), + reduce(first.getMaxValueSizeInBytes(), second.getMaxValueSizeInBytes(), MAX, true), + reduce(first.getTotalSizeInBytes(), second.getTotalSizeInBytes(), ADD, true), + reduce(first.getNullsCount(), second.getNullsCount(), ADD, false), + reduce(first.getDistinctValuesCount(), second.getDistinctValuesCount(), MAX, false)); + } + + private static Optional mergeIntegerStatistics(Optional first, Optional second) + { + // normally, either both or none is present + if (first.isPresent() && second.isPresent()) { + return Optional.of(new IntegerStatistics( + reduce(first.get().getMin(), second.get().getMin(), MIN, true), + reduce(first.get().getMax(), second.get().getMax(), MAX, true))); + } + return Optional.empty(); + } + + private static Optional mergeDoubleStatistics(Optional first, Optional second) + { + // normally, either both or none is present + if (first.isPresent() && second.isPresent()) { + return Optional.of(new DoubleStatistics( + reduce(first.get().getMin(), second.get().getMin(), MIN, true), + reduce(first.get().getMax(), second.get().getMax(), MAX, true))); + } + return Optional.empty(); + } + + private static Optional mergeDecimalStatistics(Optional first, Optional second) + { + // normally, either both or none is present + if (first.isPresent() && second.isPresent()) { + return Optional.of(new DecimalStatistics( + reduce(first.get().getMin(), second.get().getMin(), MIN, true), + reduce(first.get().getMax(), second.get().getMax(), MAX, true))); + } + return Optional.empty(); + } + + private static Optional mergeDateStatistics(Optional first, Optional second) + { + // normally, either both or none is present + if (first.isPresent() && second.isPresent()) { + return Optional.of(new DateStatistics( + reduce(first.get().getMin(), second.get().getMin(), MIN, true), + reduce(first.get().getMax(), second.get().getMax(), MAX, true))); + } + return Optional.empty(); + } + + private static Optional mergeBooleanStatistics(Optional first, Optional second) + { + // normally, either both or none is present + if (first.isPresent() && second.isPresent()) { + return Optional.of(new BooleanStatistics( + 
reduce(first.get().getTrueCount(), second.get().getTrueCount(), ADD, false), + reduce(first.get().getFalseCount(), second.get().getFalseCount(), ADD, false))); + } + return Optional.empty(); + } + + private static OptionalLong reduce(OptionalLong first, OptionalLong second, ReduceOperator operator, boolean returnFirstNonEmpty) + { + if (first.isPresent() && second.isPresent()) { + switch (operator) { + case ADD: + return OptionalLong.of(first.getAsLong() + second.getAsLong()); + case SUBTRACT: + return OptionalLong.of(first.getAsLong() - second.getAsLong()); + case MAX: + return OptionalLong.of(max(first.getAsLong(), second.getAsLong())); + case MIN: + return OptionalLong.of(min(first.getAsLong(), second.getAsLong())); + default: + throw new IllegalArgumentException("Unexpected operator: " + operator); + } + } + if (returnFirstNonEmpty) { + return first.isPresent() ? first : second; + } + return OptionalLong.empty(); + } + + private static OptionalDouble reduce(OptionalDouble first, OptionalDouble second, ReduceOperator operator, boolean returnFirstNonEmpty) + { + if (first.isPresent() && second.isPresent()) { + switch (operator) { + case ADD: + return OptionalDouble.of(first.getAsDouble() + second.getAsDouble()); + case SUBTRACT: + return OptionalDouble.of(first.getAsDouble() - second.getAsDouble()); + case MAX: + return OptionalDouble.of(max(first.getAsDouble(), second.getAsDouble())); + case MIN: + return OptionalDouble.of(min(first.getAsDouble(), second.getAsDouble())); + default: + throw new IllegalArgumentException("Unexpected operator: " + operator); + } + } + if (returnFirstNonEmpty) { + return first.isPresent() ? first : second; + } + return OptionalDouble.empty(); + } + + @SuppressWarnings("unchecked") + private static > Optional reduce(Optional first, Optional second, ReduceOperator operator, boolean returnFirstNonEmpty) + { + if (first.isPresent() && second.isPresent()) { + switch (operator) { + case MAX: + return Optional.of(max(first.get(), second.get())); + case MIN: + return Optional.of(min(first.get(), second.get())); + default: + throw new IllegalArgumentException("Unexpected operator: " + operator); + } + } + if (returnFirstNonEmpty) { + return first.isPresent() ? first : second; + } + return Optional.empty(); + } + + private static > T max(T first, T second) + { + return first.compareTo(second) >= 0 ? first : second; + } + + private static > T min(T first, T second) + { + return first.compareTo(second) <= 0 ? 
first : second; + } + + public static PartitionStatistics createEmptyPartitionStatistics(Map columnTypes, Map> columnStatisticsMetadataTypes) + { + Map columnStatistics = columnStatisticsMetadataTypes.entrySet().stream() + .collect(toImmutableMap(Entry::getKey, entry -> createColumnStatisticsForEmptyPartition(columnTypes.get(entry.getKey()), entry.getValue()))); + return new PartitionStatistics(createZeroStatistics(), columnStatistics); + } + + private static HiveColumnStatistics createColumnStatisticsForEmptyPartition(Type columnType, Set columnStatisticTypes) + { + requireNonNull(columnType, "columnType is null"); + HiveColumnStatistics.Builder result = HiveColumnStatistics.builder(); + for (ColumnStatisticType columnStatisticType : columnStatisticTypes) { + switch (columnStatisticType) { + case MAX_VALUE_SIZE_IN_BYTES: + result.setMaxValueSizeInBytes(0); + break; + case TOTAL_SIZE_IN_BYTES: + result.setTotalSizeInBytes(0); + break; + case NUMBER_OF_DISTINCT_VALUES: + result.setDistinctValuesCount(0); + break; + case NUMBER_OF_NON_NULL_VALUES: + result.setNullsCount(0); + break; + case NUMBER_OF_TRUE_VALUES: + result.setBooleanStatistics(new BooleanStatistics(OptionalLong.of(0L), OptionalLong.of(0L))); + break; + case MIN_VALUE: + case MAX_VALUE: + setMinMaxForEmptyPartition(columnType, result); + break; + default: + throw new PrestoException(HiveErrorCode.HIVE_UNKNOWN_COLUMN_STATISTIC_TYPE, "Unknown column statistics type: " + columnStatisticType.name()); + } + } + return result.build(); + } + + private static void setMinMaxForEmptyPartition(Type type, HiveColumnStatistics.Builder result) + { + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + result.setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty())); + } + else if (type.equals(DOUBLE) || type.equals(REAL)) { + result.setDoubleStatistics(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty())); + } + else if (type.equals(DATE)) { + result.setDateStatistics(new DateStatistics(Optional.empty(), Optional.empty())); + } + else if (type.equals(TIMESTAMP)) { + result.setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty())); + } + else if (type instanceof DecimalType) { + result.setDecimalStatistics(new DecimalStatistics(Optional.empty(), Optional.empty())); + } + else { + throw new IllegalArgumentException("Unexpected type: " + type); + } + } + + public static Map, ComputedStatistics> createComputedStatisticsToPartitionMap( + Collection computedStatistics, + List partitionColumns, + Map columnTypes) + { + List partitionColumnTypes = partitionColumns.stream() + .map(columnTypes::get) + .collect(toImmutableList()); + + return computedStatistics.stream() + .collect(toImmutableMap(statistics -> getPartitionValues(statistics, partitionColumns, partitionColumnTypes), statistics -> statistics)); + } + + private static List getPartitionValues(ComputedStatistics statistics, List partitionColumns, List partitionColumnTypes) + { + checkArgument(statistics.getGroupingColumns().equals(partitionColumns), + "Unexpected grouping. Partition columns: %s. 
Grouping columns: %s", partitionColumns, statistics.getGroupingColumns()); + Page partitionColumnsPage = new Page(1, statistics.getGroupingValues().toArray(new Block[] {})); + return createPartitionValues(partitionColumnTypes, partitionColumnsPage, 0); + } + + public static Map fromComputedStatistics( + ConnectorSession session, + Map computedStatistics, + Map columnTypes, + long rowCount) + { + Map> result = new HashMap<>(); + computedStatistics.forEach((metadata, block) -> { + Map columnStatistics = result.computeIfAbsent(metadata.getColumnName(), key -> new HashMap<>()); + columnStatistics.put(metadata.getStatisticType(), block); + }); + return result.entrySet() + .stream() + .collect(toImmutableMap(Entry::getKey, + entry -> createHiveColumnStatistics(session, entry.getValue(), columnTypes.get(entry.getKey()), rowCount))); + } + + private static HiveColumnStatistics createHiveColumnStatistics( + ConnectorSession session, + Map computedStatistics, + Type columnType, + long rowCount) + { + HiveColumnStatistics.Builder result = HiveColumnStatistics.builder(); + + // MIN_VALUE, MAX_VALUE + // We ask the engine to compute either both or neither + verify(computedStatistics.containsKey(MIN_VALUE) == computedStatistics.containsKey(MAX_VALUE)); + if (computedStatistics.containsKey(MIN_VALUE)) { + setMinMax(session, columnType, computedStatistics.get(MIN_VALUE), computedStatistics.get(MAX_VALUE), result); + } + + // MAX_VALUE_SIZE_IN_BYTES + if (computedStatistics.containsKey(MAX_VALUE_SIZE_IN_BYTES)) { + result.setMaxValueSizeInBytes(getIntegerValue(session, BIGINT, computedStatistics.get(MAX_VALUE_SIZE_IN_BYTES))); + } + + // TOTAL_VALUES_SIZE_IN_BYTES + if (computedStatistics.containsKey(TOTAL_SIZE_IN_BYTES)) { + result.setTotalSizeInBytes(getIntegerValue(session, BIGINT, computedStatistics.get(TOTAL_SIZE_IN_BYTES))); + } + + // NUMBER OF NULLS + if (computedStatistics.containsKey(NUMBER_OF_NON_NULL_VALUES)) { + result.setNullsCount(rowCount - BIGINT.getLong(computedStatistics.get(NUMBER_OF_NON_NULL_VALUES), 0)); + } + + // NDV + if (computedStatistics.containsKey(NUMBER_OF_DISTINCT_VALUES) && computedStatistics.containsKey(NUMBER_OF_NON_NULL_VALUES)) { + // number of distinct value is estimated using HLL, and can be higher than the number of non null values + long numberOfNonNullValues = BIGINT.getLong(computedStatistics.get(NUMBER_OF_NON_NULL_VALUES), 0); + long numberOfDistinctValues = BIGINT.getLong(computedStatistics.get(NUMBER_OF_DISTINCT_VALUES), 0); + if (numberOfDistinctValues > numberOfNonNullValues) { + result.setDistinctValuesCount(numberOfNonNullValues); + } + else { + result.setDistinctValuesCount(numberOfDistinctValues); + } + } + + // NUMBER OF FALSE, NUMBER OF TRUE + if (computedStatistics.containsKey(NUMBER_OF_TRUE_VALUES) && computedStatistics.containsKey(NUMBER_OF_NON_NULL_VALUES)) { + long numberOfTrue = BIGINT.getLong(computedStatistics.get(NUMBER_OF_TRUE_VALUES), 0); + long numberOfNonNullValues = BIGINT.getLong(computedStatistics.get(NUMBER_OF_NON_NULL_VALUES), 0); + result.setBooleanStatistics(new BooleanStatistics(OptionalLong.of(numberOfTrue), OptionalLong.of(numberOfNonNullValues - numberOfTrue))); + } + return result.build(); + } + + private static void setMinMax(ConnectorSession session, Type type, Block min, Block max, HiveColumnStatistics.Builder result) + { + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) { + result.setIntegerStatistics(new IntegerStatistics(getIntegerValue(session, type, min), 
getIntegerValue(session, type, max))); + } + else if (type.equals(DOUBLE) || type.equals(REAL)) { + result.setDoubleStatistics(new DoubleStatistics(getDoubleValue(session, type, min), getDoubleValue(session, type, max))); + } + else if (type.equals(DATE)) { + result.setDateStatistics(new DateStatistics(getDateValue(session, type, min), getDateValue(session, type, max))); + } + else if (type.equals(TIMESTAMP)) { + result.setIntegerStatistics(new IntegerStatistics(getTimestampValue(min), getTimestampValue(max))); + } + else if (type instanceof DecimalType) { + result.setDecimalStatistics(new DecimalStatistics(getDecimalValue(session, type, min), getDecimalValue(session, type, max))); + } + else { + throw new IllegalArgumentException("Unexpected type: " + type); + } + } + + private static OptionalLong getIntegerValue(ConnectorSession session, Type type, Block block) + { + // works for BIGINT as well as for other integer types TINYINT/SMALLINT/INTEGER that store values as byte/short/int + return block.isNull(0) ? OptionalLong.empty() : OptionalLong.of(((Number) type.getObjectValue(session, block, 0)).longValue()); + } + + private static OptionalDouble getDoubleValue(ConnectorSession session, Type type, Block block) + { + return block.isNull(0) ? OptionalDouble.empty() : OptionalDouble.of(((Number) type.getObjectValue(session, block, 0)).doubleValue()); + } + + private static Optional getDateValue(ConnectorSession session, Type type, Block block) + { + return block.isNull(0) ? Optional.empty() : Optional.of(LocalDate.ofEpochDay(((SqlDate) type.getObjectValue(session, block, 0)).getDays())); + } + + private static OptionalLong getTimestampValue(Block block) + { + // TODO https://github.com/prestodb/presto/issues/7122 + return block.isNull(0) ? OptionalLong.empty() : OptionalLong.of(MILLISECONDS.toSeconds(block.getLong(0, 0))); + } + + private static Optional getDecimalValue(ConnectorSession session, Type type, Block block) + { + return block.isNull(0) ? Optional.empty() : Optional.of(((SqlDecimal) type.getObjectValue(session, block, 0)).toBigDecimal()); + } + + public enum ReduceOperator + { + ADD, + SUBTRACT, + MIN, + MAX, + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileReader.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileReader.java new file mode 100644 index 00000000..658afc78 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileReader.java @@ -0,0 +1,100 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.AbstractIterator; +import io.airlift.units.DataSize; +import io.prestosql.orc.OrcDataSource; +import io.prestosql.orc.OrcPredicate; +import io.prestosql.orc.OrcReader; +import io.prestosql.orc.OrcRecordReader; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.type.Type; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InterruptedIOException; +import java.util.List; + +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE; +import static java.util.Objects.requireNonNull; +import static org.joda.time.DateTimeZone.UTC; + +public class TempFileReader + extends AbstractIterator + implements Closeable +{ + private final OrcRecordReader reader; + + public TempFileReader(List types, OrcDataSource dataSource) + { + requireNonNull(types, "types is null"); + + try { + OrcReader orcReader = new OrcReader( + dataSource, + new DataSize(1, MEGABYTE), + new DataSize(8, MEGABYTE), + new DataSize(16, MEGABYTE)); + reader = orcReader.createRecordReader( + orcReader.getRootColumn().getNestedColumns(), + types, + OrcPredicate.TRUE, + UTC, + newSimpleAggregatedMemoryContext(), + INITIAL_BATCH_SIZE, + TempFileReader::handleException); + } + catch (IOException e) { + throw handleException(e); + } + } + + @Override + public void close() + throws IOException + { + reader.close(); + } + + @Override + protected Page computeNext() + { + try { + if (Thread.currentThread().isInterrupted()) { + throw new InterruptedIOException(); + } + + Page page = reader.nextPage(); + if (page == null) { + return endOfData(); + } + + // eagerly load the page + return page.getLoadedPage(); + } + catch (IOException e) { + throw handleException(e); + } + } + + private static PrestoException handleException(Exception e) + { + return new PrestoException(HiveErrorCode.HIVE_WRITER_DATA_ERROR, "Failed to read temporary data", e); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileWriter.java new file mode 100644 index 00000000..7776421c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/TempFileWriter.java @@ -0,0 +1,96 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive.util;
+
+import com.google.common.collect.ImmutableMap;
+import io.airlift.units.DataSize;
+import io.prestosql.orc.OrcDataSink;
+import io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode;
+import io.prestosql.orc.OrcWriter;
+import io.prestosql.orc.OrcWriterOptions;
+import io.prestosql.orc.OrcWriterStats;
+import io.prestosql.spi.Page;
+import io.prestosql.spi.type.Type;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.IntStream;
+
+import static com.google.common.collect.ImmutableList.toImmutableList;
+import static io.airlift.units.DataSize.Unit.BYTE;
+import static io.airlift.units.DataSize.Unit.MEGABYTE;
+import static io.prestosql.orc.metadata.CompressionKind.LZ4;
+
+public class TempFileWriter
+        implements Closeable
+{
+    private final OrcWriter orcWriter;
+
+    public TempFileWriter(List<Type> types, OrcDataSink sink)
+    {
+        this.orcWriter = createOrcFileWriter(sink, types);
+    }
+
+    public void writePage(Page page)
+    {
+        try {
+            orcWriter.write(page);
+        }
+        catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    @Override
+    public void close()
+            throws IOException
+    {
+        orcWriter.close();
+    }
+
+    public long getWrittenBytes()
+    {
+        return orcWriter.getWrittenBytes();
+    }
+
+    public long getRetainedBytes()
+    {
+        return orcWriter.getRetainedBytes();
+    }
+
+    private static OrcWriter createOrcFileWriter(OrcDataSink sink, List<Type> types)
+    {
+        List<String> columnNames = IntStream.range(0, types.size())
+                .mapToObj(String::valueOf)
+                .collect(toImmutableList());
+
+        return new OrcWriter(
+                sink,
+                columnNames,
+                types,
+                LZ4,
+                new OrcWriterOptions()
+                        .withMaxStringStatisticsLimit(new DataSize(0, BYTE))
+                        .withStripeMinSize(new DataSize(64, MEGABYTE))
+                        .withDictionaryMaxMemory(new DataSize(1, MEGABYTE)),
+                false,
+                ImmutableMap.of(),
+                false,
+                OrcWriteValidationMode.BOTH,
+                new OrcWriterStats(), Optional.empty(), Optional.empty());
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ThrottledAsyncQueue.java b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ThrottledAsyncQueue.java
new file mode 100644
index 00000000..f8edf164
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/java/io/prestosql/plugin/hive/util/ThrottledAsyncQueue.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.util;
+
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.RateLimiter;
+
+import javax.annotation.concurrent.ThreadSafe;
+
+import java.util.List;
+import java.util.concurrent.Executor;
+import java.util.function.Function;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.util.concurrent.Futures.immediateFuture;
+
+/**
+ * An asynchronous queue that limits the rate at which batches will be
+ * made available as well as the number of elements they will contain.
+ *
+ * @param <T> The type of elements accepted by the queue.
+ */
+@ThreadSafe
+public class ThrottledAsyncQueue<T>
+        extends AsyncQueue<T>
+{
+    private final int maxBatchSizePerSec;
+    private final Executor executor;
+    private final RateLimiter rateLimiter;
+
+    public ThrottledAsyncQueue(int maxBatchSizePerSec, int targetQueueSize, Executor executor)
+    {
+        super(targetQueueSize, executor);
+        this.executor = executor;
+        this.maxBatchSizePerSec = maxBatchSizePerSec;
+        this.rateLimiter = RateLimiter.create(maxBatchSizePerSec);
+    }
+
+    @Override
+    public synchronized <O> ListenableFuture<O> borrowBatchAsync(int maxSize, Function<List<T>, BorrowResult<O>> function)
+    {
+        checkArgument(maxSize >= 0, "maxSize must be at least 0");
+
+        ListenableFuture<?> throttleFuture = immediateFuture(null);
+        if (size() > 0) {
+            // the queue is not empty, try to return a batch immediately if we are not throttled
+            int size = maxBatchSize(maxSize);
+            if (rateLimiter.tryAcquire(size)) {
+                return super.borrowBatchAsync(size, function);
+            }
+        }
+        else if (!isFinished()) {
+            // the queue is empty but not finished, wait before we can query a batch
+            throttleFuture = getNotEmptySignal();
+        }
+
+        return Futures.transformAsync(
+                throttleFuture,
+                any -> {
+                    int size = maxBatchSize(maxSize);
+                    if (size > 0) {
+                        rateLimiter.acquire(size);
+                    }
+                    return super.borrowBatchAsync(size, function);
+                },
+                executor);
+    }
+
+    private int maxBatchSize(int maxSize)
+    {
+        return Math.min(maxSize, Math.min(size(), maxBatchSizePerSec));
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2010.txt b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2010.txt
new file mode 100644
index 00000000..8e65f88c
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2010.txt
@@ -0,0 +1,13 @@
+Copyright 2010 Proofpoint, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
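Side note on ThrottledAsyncQueue above: it hands out batches only as fast as a Guava RateLimiter replenishes permits, and it never requests more permits than min(maxSize, size(), maxBatchSizePerSec). Below is a minimal synchronous sketch of that policy using only Guava's RateLimiter; the class name BatchDrainSketch and the constant values are illustrative and are not part of this patch.

// BatchDrainSketch.java -- illustrative sketch only, not connector code.
import com.google.common.util.concurrent.RateLimiter;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;

public final class BatchDrainSketch
{
    private BatchDrainSketch() {}

    public static void main(String[] args)
    {
        int maxBatchSizePerSec = 100;   // assumed per-second permit budget
        int maxSize = 50;               // assumed batch size requested by the caller
        RateLimiter rateLimiter = RateLimiter.create(maxBatchSizePerSec);

        Queue<Integer> queue = new ArrayDeque<>();
        for (int i = 0; i < 1_000; i++) {
            queue.add(i);
        }

        while (!queue.isEmpty()) {
            // cap the batch at the request size, the queue size and the per-second budget,
            // mirroring maxBatchSize() in ThrottledAsyncQueue
            int size = Math.min(maxSize, Math.min(queue.size(), maxBatchSizePerSec));
            rateLimiter.acquire(size);  // blocks until 'size' permits are available
            List<Integer> batch = new ArrayList<>(size);
            for (int i = 0; i < size; i++) {
                batch.add(queue.poll());
            }
            System.out.println("drained batch of " + batch.size());
        }
    }
}

The asynchronous version in the patch performs the same arithmetic in maxBatchSize() but defers the blocking acquire() onto the queue's executor via Futures.transformAsync instead of blocking the caller.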
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2012.txt b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2012.txt new file mode 100644 index 00000000..3197b0a2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2012.txt @@ -0,0 +1,13 @@ +Copyright 2012 Proofpoint, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2020.txt b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2020.txt new file mode 100644 index 00000000..0de8ccbb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2020.txt @@ -0,0 +1,12 @@ +Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2021.txt b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2021.txt new file mode 100644 index 00000000..6a498fa4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-alternate-2021.txt @@ -0,0 +1,12 @@ +Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-third.txt b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-third.txt new file mode 100644 index 00000000..a4928795 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header-third.txt @@ -0,0 +1,12 @@ +Copyright (C) 2018-2020. 
Autohome Technologies Co., Ltd. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header.txt b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header.txt
new file mode 100644
index 00000000..6a498fa4
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/main/resource/license/license-header.txt
@@ -0,0 +1,12 @@
+Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/modernizer/violations.xml b/omnidata/omnidata-openlookeng-connector/connector/src/modernizer/violations.xml
new file mode 100644
index 00000000..67dee5e1
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/modernizer/violations.xml
@@ -0,0 +1,32 @@
+<modernizer>
+    <violation>
+        <name>java/lang/Class.newInstance:()Ljava/lang/Object;</name>
+        <version>1.1</version>
+        <comment>Prefer Class.getConstructor().newInstance()</comment>
+    </violation>
+    <violation>
+        <name>java/lang/String.toLowerCase:()Ljava/lang/String;</name>
+        <version>1.1</version>
+        <comment>Prefer String.toLowerCase(java.util.Locale)</comment>
+    </violation>
+    <violation>
+        <name>com/google/common/primitives/Ints.checkedCast:(J)I</name>
+        <version>1.8</version>
+        <comment>Prefer Math.toIntExact(long)</comment>
+    </violation>
+    <violation>
+        <name>org/testng/Assert.assertEquals:(Ljava/lang/Iterable;Ljava/lang/Iterable;)V</name>
+        <version>1.8</version>
+        <comment>Use io.prestosql.testing.assertions.Assert.assertEquals due to TestNG #543</comment>
+    </violation>
+    <violation>
+        <name>org/testng/Assert.assertEquals:(Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/String;)V</name>
+        <version>1.8</version>
+        <comment>Use io.prestosql.testing.assertions.Assert.assertEquals due to TestNG #543</comment>
+    </violation>
+</modernizer>
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHive.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHive.java
new file mode 100644
index 00000000..58c176a9
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHive.java
@@ -0,0 +1,5321 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; +import com.google.common.net.HostAndPort; +import io.airlift.json.JsonCodec; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.airlift.stats.CounterStat; +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import io.prestosql.GroupByHashPageIndexerFactory; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.CachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore; +import io.prestosql.plugin.hive.metastore.SortingColumn; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.thrift.BridgingHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.MetastoreLocator; +import io.prestosql.plugin.hive.metastore.thrift.TestingMetastoreLocator; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastoreConfig; +import io.prestosql.plugin.hive.orc.OrcConcatPageSource; +import io.prestosql.plugin.hive.orc.OrcPageSource; +import io.prestosql.plugin.hive.parquet.ParquetPageSource; +import io.prestosql.plugin.hive.rcfile.RcFilePageSource; +import io.prestosql.plugin.hive.security.SqlStandardAccessControlMetadata; +import io.prestosql.spi.Page; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorInsertTableHandle; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorNewTableLayout; +import io.prestosql.spi.connector.ConnectorOutputTableHandle; +import io.prestosql.spi.connector.ConnectorPageSink; +import io.prestosql.spi.connector.ConnectorPageSinkProvider; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorSplitManager; +import io.prestosql.spi.connector.ConnectorSplitSource; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.connector.ConnectorTableProperties; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.ConnectorViewDefinition; +import 
io.prestosql.spi.connector.ConnectorViewDefinition.ViewColumn; +import io.prestosql.spi.connector.Constraint; +import io.prestosql.spi.connector.ConstraintApplicationResult; +import io.prestosql.spi.connector.DiscretePredicates; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.connector.RecordPageSource; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.SchemaTablePrefix; +import io.prestosql.spi.connector.TableAlreadyExistsException; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.connector.ViewNotFoundException; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.predicate.Range; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.predicate.ValueSet; +import io.prestosql.spi.statistics.ColumnStatistics; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.MapType; +import io.prestosql.spi.type.NamedTypeSignature; +import io.prestosql.spi.type.RowFieldName; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.SqlDate; +import io.prestosql.spi.type.SqlTimestamp; +import io.prestosql.spi.type.SqlVarbinary; +import io.prestosql.spi.type.StandardTypes; +import io.prestosql.spi.type.Type; +import io.prestosql.sql.gen.JoinCompiler; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.testing.MaterializedRow; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.testing.TestingNodeManager; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.TableType; +import org.joda.time.DateTime; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.math.BigDecimal; +import java.time.LocalDateTime; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalInt; +import java.util.OptionalLong; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.Iterables.concat; +import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.collect.Lists.newArrayList; +import static com.google.common.collect.Lists.reverse; +import static com.google.common.collect.Maps.uniqueIndex; +import static com.google.common.collect.MoreCollectors.onlyElement; +import static com.google.common.collect.Sets.difference; +import static 
com.google.common.hash.Hashing.sha256; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.airlift.concurrent.MoreFutures.getFutureValue; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.slice.Slices.utf8Slice; +import static io.airlift.testing.Assertions.assertEqualsIgnoreOrder; +import static io.airlift.testing.Assertions.assertGreaterThan; +import static io.airlift.testing.Assertions.assertGreaterThanOrEqual; +import static io.airlift.testing.Assertions.assertInstanceOf; +import static io.airlift.testing.Assertions.assertLessThanOrEqual; +import static io.airlift.units.DataSize.Unit.KILOBYTE; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.AbstractTestHive.TransactionDeleteInsertTestTag.COMMIT; +import static io.prestosql.plugin.hive.AbstractTestHive.TransactionDeleteInsertTestTag.ROLLBACK_AFTER_APPEND_PAGE; +import static io.prestosql.plugin.hive.AbstractTestHive.TransactionDeleteInsertTestTag.ROLLBACK_AFTER_BEGIN_INSERT; +import static io.prestosql.plugin.hive.AbstractTestHive.TransactionDeleteInsertTestTag.ROLLBACK_AFTER_DELETE; +import static io.prestosql.plugin.hive.AbstractTestHive.TransactionDeleteInsertTestTag.ROLLBACK_AFTER_FINISH_INSERT; +import static io.prestosql.plugin.hive.AbstractTestHive.TransactionDeleteInsertTestTag.ROLLBACK_AFTER_SINK_FINISH; +import static io.prestosql.plugin.hive.AbstractTestHive.TransactionDeleteInsertTestTag.ROLLBACK_RIGHT_AWAY; +import static io.prestosql.plugin.hive.HiveBasicStatistics.createEmptyStatistics; +import static io.prestosql.plugin.hive.HiveBasicStatistics.createZeroStatistics; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V1; +import static io.prestosql.plugin.hive.HiveColumnHandle.BUCKET_COLUMN_NAME; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveColumnHandle.bucketColumnHandle; +import static io.prestosql.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME; +import static io.prestosql.plugin.hive.HiveMetadata.PRESTO_VERSION_NAME; +import static io.prestosql.plugin.hive.HiveMetadata.convertToPredicate; +import static io.prestosql.plugin.hive.HiveStorageFormat.AVRO; +import static io.prestosql.plugin.hive.HiveStorageFormat.CSV; +import static io.prestosql.plugin.hive.HiveStorageFormat.JSON; +import static io.prestosql.plugin.hive.HiveStorageFormat.ORC; +import static io.prestosql.plugin.hive.HiveStorageFormat.PARQUET; +import static io.prestosql.plugin.hive.HiveStorageFormat.RCBINARY; +import static io.prestosql.plugin.hive.HiveStorageFormat.RCTEXT; +import static io.prestosql.plugin.hive.HiveStorageFormat.SEQUENCEFILE; +import static io.prestosql.plugin.hive.HiveStorageFormat.TEXTFILE; +import static io.prestosql.plugin.hive.HiveTableProperties.BUCKETED_BY_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.BUCKET_COUNT_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.IS_EXTERNAL_TABLE; +import static io.prestosql.plugin.hive.HiveTableProperties.LOCATION_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.PARTITIONED_BY_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.SORTED_BY_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.STORAGE_FORMAT_PROPERTY; +import static 
io.prestosql.plugin.hive.HiveTableProperties.TRANSACTIONAL; +import static io.prestosql.plugin.hive.HiveTestUtils.PAGE_SORTER; +import static io.prestosql.plugin.hive.HiveTestUtils.SESSION; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.plugin.hive.HiveTestUtils.arrayType; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveDataStreamFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProvider; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveSelectiveFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultOrcFileWriterFactory; +import static io.prestosql.plugin.hive.HiveTestUtils.getNoOpIndexCache; +import static io.prestosql.plugin.hive.HiveTestUtils.getTypes; +import static io.prestosql.plugin.hive.HiveTestUtils.mapType; +import static io.prestosql.plugin.hive.HiveTestUtils.rowType; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.HiveType.toHiveType; +import static io.prestosql.plugin.hive.HiveUtil.columnExtraInfo; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static io.prestosql.plugin.hive.HiveWriteUtils.createDirectory; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.STAGE_AND_MOVE_TO_TARGET_DIRECTORY; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.StandardErrorCode.TRANSACTION_CONFLICT; +import static io.prestosql.spi.connector.ConnectorSplitManager.SplitSchedulingStrategy.UNGROUPED_SCHEDULING; +import static io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED; +import static io.prestosql.spi.security.PrincipalType.USER; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.CharType.createCharType; +import static io.prestosql.spi.type.Chars.isCharType; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DecimalType.createDecimalType; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.HyperLogLogType.HYPER_LOG_LOG; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.spi.type.VarcharType.createVarcharType; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static io.prestosql.testing.DateTimeTestingUtils.sqlTimestampOf; +import static io.prestosql.testing.MaterializedResult.materializeSourceDataStream; +import static java.lang.Float.floatToRawIntBits; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.Files.createTempDirectory; +import static 
java.util.Collections.emptyMap; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static java.util.concurrent.Executors.newFixedThreadPool; +import static java.util.concurrent.Executors.newScheduledThreadPool; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.common.FileUtils.makePartName; +import static org.assertj.core.api.Assertions.assertThat; +import static org.joda.time.DateTimeZone.UTC; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +public abstract class AbstractTestHive +{ + protected static final String TEMPORARY_TABLE_PREFIX = "tmp_presto_test_"; + + protected static final String INVALID_DATABASE = "totally_invalid_database_name"; + protected static final String INVALID_TABLE = "totally_invalid_table_name"; + protected static final String INVALID_COLUMN = "totally_invalid_column_name"; + + protected static final String TEST_SERVER_VERSION = "test_version"; + + private static final Type ARRAY_TYPE = arrayType(createUnboundedVarcharType()); + private static final Type MAP_TYPE = mapType(createUnboundedVarcharType(), BIGINT); + private static final Type ROW_TYPE = rowType(ImmutableList.of( + new NamedTypeSignature(Optional.of(new RowFieldName("f_string", false)), createUnboundedVarcharType().getTypeSignature()), + new NamedTypeSignature(Optional.of(new RowFieldName("f_bigint", false)), BIGINT.getTypeSignature()), + new NamedTypeSignature(Optional.of(new RowFieldName("f_boolean", false)), BOOLEAN.getTypeSignature()))); + + private static final List CREATE_TABLE_COLUMNS = ImmutableList.builder() + .add(new ColumnMetadata("id", BIGINT)) + .add(new ColumnMetadata("t_string", createUnboundedVarcharType())) + .add(new ColumnMetadata("t_tinyint", TINYINT)) + .add(new ColumnMetadata("t_smallint", SMALLINT)) + .add(new ColumnMetadata("t_integer", INTEGER)) + .add(new ColumnMetadata("t_bigint", BIGINT)) + .add(new ColumnMetadata("t_float", REAL)) + .add(new ColumnMetadata("t_double", DOUBLE)) + .add(new ColumnMetadata("t_boolean", BOOLEAN)) + .add(new ColumnMetadata("t_array", ARRAY_TYPE)) + .add(new ColumnMetadata("t_map", MAP_TYPE)) + .add(new ColumnMetadata("t_row", ROW_TYPE)) + .build(); + + private static final MaterializedResult CREATE_TABLE_DATA = + MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), TINYINT, SMALLINT, INTEGER, BIGINT, REAL, DOUBLE, BOOLEAN, ARRAY_TYPE, MAP_TYPE, ROW_TYPE) + .row(1L, "hello", (byte) 45, (short) 345, 234, 123L, -754.1985f, 43.5, true, ImmutableList.of("apple", "banana"), ImmutableMap.of("one", 1L, "two", 2L), ImmutableList.of("true", 1L, true)) + .row(2L, null, null, null, null, null, null, null, null, null, null, null) + .row(3L, "bye", (byte) 46, (short) 346, 345, 456L, 754.2008f, 98.1, false, ImmutableList.of("ape", "bear"), ImmutableMap.of("three", 3L, "four", 4L), ImmutableList.of("false", 0L, false)) + .build(); + + private static final List CREATE_TABLE_COLUMNS_PARTITIONED = ImmutableList.builder() + .addAll(CREATE_TABLE_COLUMNS) + .add(new ColumnMetadata("ds", createUnboundedVarcharType())) + .build(); + + private static final MaterializedResult CREATE_TABLE_PARTITIONED_DATA = new 
MaterializedResult( + CREATE_TABLE_DATA.getMaterializedRows().stream() + .map(row -> new MaterializedRow(row.getPrecision(), newArrayList(concat(row.getFields(), ImmutableList.of("2015-07-0" + row.getField(0)))))) + .collect(toList()), + ImmutableList.builder() + .addAll(CREATE_TABLE_DATA.getTypes()) + .add(createUnboundedVarcharType()) + .build()); + + private static final String CREATE_TABLE_PARTITIONED_DATA_2ND_PARTITION_VALUE = "2015-07-04"; + + private static final MaterializedResult CREATE_TABLE_PARTITIONED_DATA_2ND = + MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), TINYINT, SMALLINT, INTEGER, BIGINT, REAL, DOUBLE, BOOLEAN, ARRAY_TYPE, MAP_TYPE, ROW_TYPE, createUnboundedVarcharType()) + .row(4L, "hello", (byte) 45, (short) 345, 234, 123L, 754.1985f, 43.5, true, ImmutableList.of("apple", "banana"), ImmutableMap.of("one", 1L, "two", 2L), ImmutableList.of("true", 1L, true), CREATE_TABLE_PARTITIONED_DATA_2ND_PARTITION_VALUE) + .row(5L, null, null, null, null, null, null, null, null, null, null, null, CREATE_TABLE_PARTITIONED_DATA_2ND_PARTITION_VALUE) + .row(6L, "bye", (byte) 46, (short) 346, 345, 456L, -754.2008f, 98.1, false, ImmutableList.of("ape", "bear"), ImmutableMap.of("three", 3L, "four", 4L), ImmutableList.of("false", 0L, false), CREATE_TABLE_PARTITIONED_DATA_2ND_PARTITION_VALUE) + .build(); + + private static final List MISMATCH_SCHEMA_PRIMITIVE_COLUMN_BEFORE = ImmutableList.builder() + .add(new ColumnMetadata("tinyint_to_smallint", TINYINT)) + .add(new ColumnMetadata("tinyint_to_integer", TINYINT)) + .add(new ColumnMetadata("tinyint_to_bigint", TINYINT)) + .add(new ColumnMetadata("smallint_to_integer", SMALLINT)) + .add(new ColumnMetadata("smallint_to_bigint", SMALLINT)) + .add(new ColumnMetadata("integer_to_bigint", INTEGER)) + .add(new ColumnMetadata("integer_to_varchar", INTEGER)) + .add(new ColumnMetadata("varchar_to_integer", createUnboundedVarcharType())) + .add(new ColumnMetadata("float_to_double", REAL)) + .add(new ColumnMetadata("varchar_to_drop_in_row", createUnboundedVarcharType())) + .build(); + + private static final List MISMATCH_SCHEMA_TABLE_BEFORE = ImmutableList.builder() + .addAll(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_BEFORE) + .add(new ColumnMetadata("struct_to_struct", toRowType(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_BEFORE))) + .add(new ColumnMetadata("list_to_list", arrayType(toRowType(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_BEFORE)))) + .add(new ColumnMetadata("map_to_map", mapType(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_BEFORE.get(1).getType(), toRowType(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_BEFORE)))) + .add(new ColumnMetadata("ds", createUnboundedVarcharType())) + .build(); + + private static RowType toRowType(List columns) + { + return rowType(columns.stream() + .map(col -> new NamedTypeSignature(Optional.of(new RowFieldName(format("f_%s", col.getName()), false)), col.getType().getTypeSignature())) + .collect(toList())); + } + + private static final MaterializedResult MISMATCH_SCHEMA_PRIMITIVE_FIELDS_DATA_BEFORE = + MaterializedResult.resultBuilder(SESSION, TINYINT, TINYINT, TINYINT, SMALLINT, SMALLINT, INTEGER, INTEGER, createUnboundedVarcharType(), REAL, createUnboundedVarcharType()) + .row((byte) -11, (byte) 12, (byte) -13, (short) 14, (short) 15, -16, 17, "2147483647", 18.0f, "2016-08-01") + .row((byte) 21, (byte) -22, (byte) 23, (short) -24, (short) 25, 26, -27, "asdf", -28.0f, "2016-08-02") + .row((byte) -31, (byte) -32, (byte) 33, (short) 34, (short) -35, 36, 37, "-923", 39.5f, "2016-08-03") + .row(null, (byte) 42, (byte) 43, (short) 44, 
(short) -45, 46, 47, "2147483648", 49.5f, "2016-08-03") + .build(); + + private static final MaterializedResult MISMATCH_SCHEMA_TABLE_DATA_BEFORE = + MaterializedResult.resultBuilder(SESSION, MISMATCH_SCHEMA_TABLE_BEFORE.stream().map(ColumnMetadata::getType).collect(toList())) + .rows(MISMATCH_SCHEMA_PRIMITIVE_FIELDS_DATA_BEFORE.getMaterializedRows() + .stream() + .map(materializedRow -> { + List result = materializedRow.getFields(); + List rowResult = materializedRow.getFields(); + result.add(rowResult); + result.add(Arrays.asList(rowResult, null, rowResult)); + result.add(ImmutableMap.of(rowResult.get(1), rowResult)); + result.add(rowResult.get(9)); + return new MaterializedRow(materializedRow.getPrecision(), result); + }).collect(toList())) + .build(); + + private static final List MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER = ImmutableList.builder() + .add(new ColumnMetadata("tinyint_to_smallint", SMALLINT)) + .add(new ColumnMetadata("tinyint_to_integer", INTEGER)) + .add(new ColumnMetadata("tinyint_to_bigint", BIGINT)) + .add(new ColumnMetadata("smallint_to_integer", INTEGER)) + .add(new ColumnMetadata("smallint_to_bigint", BIGINT)) + .add(new ColumnMetadata("integer_to_bigint", BIGINT)) + .add(new ColumnMetadata("integer_to_varchar", createUnboundedVarcharType())) + .add(new ColumnMetadata("varchar_to_integer", INTEGER)) + .add(new ColumnMetadata("float_to_double", DOUBLE)) + .add(new ColumnMetadata("varchar_to_drop_in_row", createUnboundedVarcharType())) + .build(); + + private static final Type MISMATCH_SCHEMA_ROW_TYPE_APPEND = toRowType(ImmutableList.builder() + .addAll(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER) + .add(new ColumnMetadata(format("%s_append", MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER.get(0).getName()), MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER.get(0).getType())) + .build()); + private static final Type MISMATCH_SCHEMA_ROW_TYPE_DROP = toRowType(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER.subList(0, MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER.size() - 1)); + + private static final List MISMATCH_SCHEMA_TABLE_AFTER = ImmutableList.builder() + .addAll(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER) + .add(new ColumnMetadata("struct_to_struct", MISMATCH_SCHEMA_ROW_TYPE_APPEND)) + .add(new ColumnMetadata("list_to_list", arrayType(MISMATCH_SCHEMA_ROW_TYPE_APPEND))) + .add(new ColumnMetadata("map_to_map", mapType(MISMATCH_SCHEMA_PRIMITIVE_COLUMN_AFTER.get(1).getType(), MISMATCH_SCHEMA_ROW_TYPE_DROP))) + .add(new ColumnMetadata("ds", createUnboundedVarcharType())) + .build(); + + private static final MaterializedResult MISMATCH_SCHEMA_PRIMITIVE_FIELDS_DATA_AFTER = + MaterializedResult.resultBuilder(SESSION, SMALLINT, INTEGER, BIGINT, INTEGER, BIGINT, BIGINT, createUnboundedVarcharType(), INTEGER, DOUBLE, createUnboundedVarcharType()) + .row((short) -11, 12, -13L, 14, 15L, -16L, "17", 2147483647, 18.0, "2016-08-01") + .row((short) 21, -22, 23L, -24, 25L, 26L, "-27", null, -28.0, "2016-08-02") + .row((short) -31, -32, 33L, 34, -35L, 36L, "37", -923, 39.5, "2016-08-03") + .row(null, 42, 43L, 44, -45L, 46L, "47", null, 49.5, "2016-08-03") + .build(); + + private static final MaterializedResult MISMATCH_SCHEMA_TABLE_DATA_AFTER = + MaterializedResult.resultBuilder(SESSION, MISMATCH_SCHEMA_TABLE_AFTER.stream().map(ColumnMetadata::getType).collect(toList())) + .rows(MISMATCH_SCHEMA_PRIMITIVE_FIELDS_DATA_AFTER.getMaterializedRows() + .stream() + .map(materializedRow -> { + List result = materializedRow.getFields(); + List appendFieldRowResult = materializedRow.getFields(); + appendFieldRowResult.add(null); + List 
dropFieldRowResult = materializedRow.getFields().subList(0, materializedRow.getFields().size() - 1); + result.add(appendFieldRowResult); + result.add(Arrays.asList(appendFieldRowResult, null, appendFieldRowResult)); + result.add(ImmutableMap.of(result.get(1), dropFieldRowResult)); + result.add(result.get(9)); + return new MaterializedRow(materializedRow.getPrecision(), result); + }).collect(toList())) + .build(); + + protected Set createTableFormats = difference( + ImmutableSet.copyOf(HiveStorageFormat.values()), + // exclude formats that change table schema with serde + ImmutableSet.of(AVRO, CSV)); + + private static final JoinCompiler JOIN_COMPILER = new JoinCompiler(createTestMetadataManager()); + + private static final List STATISTICS_TABLE_COLUMNS = ImmutableList.builder() + .add(new ColumnMetadata("t_boolean", BOOLEAN)) + .add(new ColumnMetadata("t_bigint", BIGINT)) + .add(new ColumnMetadata("t_integer", INTEGER)) + .add(new ColumnMetadata("t_smallint", SMALLINT)) + .add(new ColumnMetadata("t_tinyint", TINYINT)) + .add(new ColumnMetadata("t_double", DOUBLE)) + .add(new ColumnMetadata("t_float", REAL)) + .add(new ColumnMetadata("t_string", createUnboundedVarcharType())) + .add(new ColumnMetadata("t_varchar", createVarcharType(100))) + .add(new ColumnMetadata("t_char", createCharType(5))) + .add(new ColumnMetadata("t_varbinary", VARBINARY)) + .add(new ColumnMetadata("t_date", DATE)) + .add(new ColumnMetadata("t_timestamp", TIMESTAMP)) + .add(new ColumnMetadata("t_short_decimal", createDecimalType(5, 2))) + .add(new ColumnMetadata("t_long_decimal", createDecimalType(20, 3))) + .build(); + + protected static final List STATISTICS_PARTITIONED_TABLE_COLUMNS = ImmutableList.builder() + .addAll(STATISTICS_TABLE_COLUMNS) + .add(new ColumnMetadata("ds", VARCHAR)) + .build(); + + protected static final PartitionStatistics EMPTY_TABLE_STATISTICS = new PartitionStatistics(createZeroStatistics(), ImmutableMap.of()); + protected static final PartitionStatistics BASIC_STATISTICS_1 = new PartitionStatistics(new HiveBasicStatistics(0, 20, 3, 0), ImmutableMap.of()); + protected static final PartitionStatistics BASIC_STATISTICS_2 = new PartitionStatistics(new HiveBasicStatistics(0, 30, 2, 0), ImmutableMap.of()); + + private static final PartitionStatistics STATISTICS_1 = + new PartitionStatistics( + BASIC_STATISTICS_1.getBasicStatistics(), + ImmutableMap.builder() + .put("t_boolean", HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(5), OptionalLong.of(6), OptionalLong.of(3))) + .put("t_bigint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1234L), OptionalLong.of(5678L), OptionalLong.of(2), OptionalLong.of(5))) + .put("t_integer", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(123L), OptionalLong.of(567L), OptionalLong.of(3), OptionalLong.of(4))) + .put("t_smallint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(12L), OptionalLong.of(56L), OptionalLong.of(2), OptionalLong.of(6))) + .put("t_tinyint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1L), OptionalLong.of(2L), OptionalLong.of(1), OptionalLong.of(3))) + .put("t_double", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(1234.25), OptionalDouble.of(5678.58), OptionalLong.of(7), OptionalLong.of(8))) + .put("t_float", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(123.25), OptionalDouble.of(567.58), OptionalLong.of(9), OptionalLong.of(10))) + .put("t_string", 
HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(10), OptionalLong.of(50), OptionalLong.of(3), OptionalLong.of(7))) + .put("t_varchar", HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(100), OptionalLong.of(230), OptionalLong.of(5), OptionalLong.of(3))) + .put("t_char", HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(5), OptionalLong.of(500), OptionalLong.of(1), OptionalLong.of(4))) + .put("t_varbinary", HiveColumnStatistics.createBinaryColumnStatistics(OptionalLong.of(4), OptionalLong.of(300), OptionalLong.of(1))) + .put("t_date", HiveColumnStatistics.createDateColumnStatistics(Optional.of(java.time.LocalDate.ofEpochDay(1)), Optional.of(java.time.LocalDate.ofEpochDay(2)), OptionalLong.of(7), OptionalLong.of(6))) + .put("t_timestamp", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1234567L), OptionalLong.of(71234567L), OptionalLong.of(7), OptionalLong.of(5))) + .put("t_short_decimal", HiveColumnStatistics.createDecimalColumnStatistics(Optional.of(new BigDecimal(10)), Optional.of(new BigDecimal(12)), OptionalLong.of(3), OptionalLong.of(5))) + .put("t_long_decimal", HiveColumnStatistics.createDecimalColumnStatistics(Optional.of(new BigDecimal("12345678901234567.123")), Optional.of(new BigDecimal("81234567890123456.123")), OptionalLong.of(2), OptionalLong.of(1))) + .build()); + + private static final PartitionStatistics STATISTICS_1_1 = + new PartitionStatistics( + new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(15), OptionalLong.empty(), OptionalLong.of(0)), + STATISTICS_1.getColumnStatistics().entrySet() + .stream() + .filter(entry -> entry.getKey().hashCode() % 2 == 0) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue))); + + private static final PartitionStatistics STATISTICS_1_2 = + new PartitionStatistics( + new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(15), OptionalLong.of(3), OptionalLong.of(0)), + STATISTICS_1.getColumnStatistics().entrySet() + .stream() + .filter(entry -> entry.getKey().hashCode() % 2 == 1) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue))); + + private static final PartitionStatistics STATISTICS_2 = + new PartitionStatistics( + BASIC_STATISTICS_2.getBasicStatistics(), + ImmutableMap.builder() + .put("t_boolean", HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(4), OptionalLong.of(3), OptionalLong.of(2))) + .put("t_bigint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(2345L), OptionalLong.of(6789L), OptionalLong.of(4), OptionalLong.of(7))) + .put("t_integer", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(234L), OptionalLong.of(678L), OptionalLong.of(5), OptionalLong.of(6))) + .put("t_smallint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(23L), OptionalLong.of(65L), OptionalLong.of(7), OptionalLong.of(5))) + .put("t_tinyint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(12), OptionalLong.of(3L), OptionalLong.of(2), OptionalLong.of(3))) + .put("t_double", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(2345.25), OptionalDouble.of(6785.58), OptionalLong.of(6), OptionalLong.of(3))) + .put("t_float", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(235.25), OptionalDouble.of(676.58), OptionalLong.of(7), OptionalLong.of(11))) + .put("t_string", HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(11), OptionalLong.of(600), OptionalLong.of(2), OptionalLong.of(6))) + .put("t_varchar", 
HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(99), OptionalLong.of(223), OptionalLong.of(7), OptionalLong.of(1))) + .put("t_char", HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(6), OptionalLong.of(60), OptionalLong.of(0), OptionalLong.of(3))) + .put("t_varbinary", HiveColumnStatistics.createBinaryColumnStatistics(OptionalLong.of(2), OptionalLong.of(10), OptionalLong.of(2))) + .put("t_date", HiveColumnStatistics.createDateColumnStatistics(Optional.of(java.time.LocalDate.ofEpochDay(2)), Optional.of(java.time.LocalDate.ofEpochDay(3)), OptionalLong.of(8), OptionalLong.of(7))) + .put("t_timestamp", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(2345671L), OptionalLong.of(12345677L), OptionalLong.of(9), OptionalLong.of(1))) + .put("t_short_decimal", HiveColumnStatistics.createDecimalColumnStatistics(Optional.of(new BigDecimal(11)), Optional.of(new BigDecimal(14)), OptionalLong.of(5), OptionalLong.of(7))) + .put("t_long_decimal", HiveColumnStatistics.createDecimalColumnStatistics(Optional.of(new BigDecimal("71234567890123456.123")), Optional.of(new BigDecimal("78123456789012345.123")), OptionalLong.of(2), OptionalLong.of(1))) + .build()); + + private static final PartitionStatistics STATISTICS_EMPTY_OPTIONAL_FIELDS = + new PartitionStatistics( + new HiveBasicStatistics(OptionalLong.of(0), OptionalLong.of(20), OptionalLong.empty(), OptionalLong.of(0)), + ImmutableMap.builder() + .put("t_boolean", HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(4), OptionalLong.of(3), OptionalLong.of(2))) + .put("t_bigint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(4), OptionalLong.of(7))) + .put("t_integer", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(5), OptionalLong.of(6))) + .put("t_smallint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(7), OptionalLong.of(5))) + .put("t_tinyint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(2), OptionalLong.of(3))) + .put("t_double", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.empty(), OptionalDouble.empty(), OptionalLong.of(6), OptionalLong.of(3))) + .put("t_float", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.empty(), OptionalDouble.empty(), OptionalLong.of(7), OptionalLong.of(11))) + .put("t_string", HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(2), OptionalLong.of(6))) + .put("t_varchar", HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(7), OptionalLong.of(1))) + .put("t_char", HiveColumnStatistics.createStringColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(3))) + .put("t_varbinary", HiveColumnStatistics.createBinaryColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.of(2))) + // https://issues.apache.org/jira/browse/HIVE-20098 + // .put("t_date", createDateColumnStatistics(Optional.empty(), Optional.empty(), OptionalLong.of(8), OptionalLong.of(7))) + .put("t_timestamp", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.of(9), OptionalLong.of(1))) + .put("t_short_decimal", HiveColumnStatistics.createDecimalColumnStatistics(Optional.empty(), Optional.empty(), 
OptionalLong.of(5), OptionalLong.of(7))) + .put("t_long_decimal", HiveColumnStatistics.createDecimalColumnStatistics(Optional.empty(), Optional.empty(), OptionalLong.of(2), OptionalLong.of(1))) + .build()); + + protected String database; + protected SchemaTableName tablePartitionFormat; + protected SchemaTableName tableUnpartitioned; + protected SchemaTableName tableOffline; + protected SchemaTableName tableOfflinePartition; + protected SchemaTableName tableNotReadable; + protected SchemaTableName view; + protected SchemaTableName invalidTable; + protected SchemaTableName tableBucketedStringInt; + protected SchemaTableName tableBucketedBigintBoolean; + protected SchemaTableName tableBucketedDoubleFloat; + protected SchemaTableName tablePartitionSchemaChange; + protected SchemaTableName tablePartitionSchemaChangeNonCanonical; + protected SchemaTableName tableBucketEvolution; + + protected ConnectorTableHandle invalidTableHandle; + + protected ColumnHandle dsColumn; + protected ColumnHandle fileFormatColumn; + protected ColumnHandle dummyColumn; + protected ColumnHandle intColumn; + protected ColumnHandle invalidColumnHandle; + + protected ConnectorTableProperties tablePartitionFormatProperties; + protected ConnectorTableProperties tableUnpartitionedProperties; + protected List tablePartitionFormatPartitions; + protected List tableUnpartitionedPartitions; + + protected HdfsEnvironment hdfsEnvironment; + protected LocationService locationService; + + protected HiveMetadataFactory metadataFactory; + protected HiveTransactionManager transactionManager; + protected HiveMetastore metastoreClient; + protected ConnectorSplitManager splitManager; + protected ConnectorPageSourceProvider pageSourceProvider; + protected ConnectorPageSinkProvider pageSinkProvider; + protected ExecutorService executor; + protected ExecutorService executorRefresh; + + private ScheduledExecutorService heartbeatService; + private ScheduledExecutorService vacuumExecutorService; + private ScheduledExecutorService hiveMetastoreClientService; + + @BeforeClass + public void setupClass() + { + executor = newCachedThreadPool(daemonThreadsNamed("hive-%s")); + executorRefresh = newCachedThreadPool(daemonThreadsNamed("hive-refresh-%s")); + heartbeatService = newScheduledThreadPool(1); + vacuumExecutorService = newScheduledThreadPool(1); + hiveMetastoreClientService = newScheduledThreadPool(1); + } + + @AfterClass(alwaysRun = true) + public void tearDown() + { + if (executor != null) { + executor.shutdownNow(); + executor = null; + } + if (heartbeatService != null) { + heartbeatService.shutdownNow(); + heartbeatService = null; + } + if (vacuumExecutorService != null) { + vacuumExecutorService.shutdownNow(); + vacuumExecutorService = null; + } + } + + protected void setupHive(String databaseName) + { + database = databaseName; + tablePartitionFormat = new SchemaTableName(database, "presto_test_partition_format"); + tableUnpartitioned = new SchemaTableName(database, "presto_test_unpartitioned"); + tableOffline = new SchemaTableName(database, "presto_test_offline"); + tableOfflinePartition = new SchemaTableName(database, "presto_test_offline_partition"); + tableNotReadable = new SchemaTableName(database, "presto_test_not_readable"); + view = new SchemaTableName(database, "presto_test_view"); + invalidTable = new SchemaTableName(database, INVALID_TABLE); + tableBucketedStringInt = new SchemaTableName(database, "presto_test_bucketed_by_string_int"); + tableBucketedBigintBoolean = new SchemaTableName(database, 
"presto_test_bucketed_by_bigint_boolean"); + tableBucketedDoubleFloat = new SchemaTableName(database, "presto_test_bucketed_by_double_float"); + tablePartitionSchemaChange = new SchemaTableName(database, "presto_test_partition_schema_change"); + tablePartitionSchemaChangeNonCanonical = new SchemaTableName(database, "presto_test_partition_schema_change_non_canonical"); + tableBucketEvolution = new SchemaTableName(database, "presto_test_bucket_evolution"); + + invalidTableHandle = new HiveTableHandle(database, INVALID_TABLE, ImmutableMap.of(), ImmutableList.of(), Optional.empty()); + + dsColumn = new HiveColumnHandle("ds", HIVE_STRING, parseTypeSignature(StandardTypes.VARCHAR), -1, PARTITION_KEY, Optional.empty()); + fileFormatColumn = new HiveColumnHandle("file_format", HIVE_STRING, parseTypeSignature(StandardTypes.VARCHAR), -1, PARTITION_KEY, Optional.empty()); + dummyColumn = new HiveColumnHandle("dummy", HIVE_INT, parseTypeSignature(StandardTypes.INTEGER), -1, PARTITION_KEY, Optional.empty()); + intColumn = new HiveColumnHandle("t_int", HIVE_INT, parseTypeSignature(StandardTypes.INTEGER), -1, PARTITION_KEY, Optional.empty()); + invalidColumnHandle = new HiveColumnHandle(INVALID_COLUMN, HIVE_STRING, parseTypeSignature(StandardTypes.VARCHAR), 0, REGULAR, Optional.empty()); + + List partitionColumns = ImmutableList.of(dsColumn, fileFormatColumn, dummyColumn); + tablePartitionFormatPartitions = ImmutableList.builder() + .add(new HivePartition(tablePartitionFormat, + "ds=2012-12-29/file_format=textfile/dummy=1", + ImmutableMap.builder() + .put(dsColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("2012-12-29"))) + .put(fileFormatColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("textfile"))) + .put(dummyColumn, NullableValue.of(INTEGER, 1L)) + .build())) + .add(new HivePartition(tablePartitionFormat, + "ds=2012-12-29/file_format=sequencefile/dummy=2", + ImmutableMap.builder() + .put(dsColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("2012-12-29"))) + .put(fileFormatColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("sequencefile"))) + .put(dummyColumn, NullableValue.of(INTEGER, 2L)) + .build())) + .add(new HivePartition(tablePartitionFormat, + "ds=2012-12-29/file_format=rctext/dummy=3", + ImmutableMap.builder() + .put(dsColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("2012-12-29"))) + .put(fileFormatColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("rctext"))) + .put(dummyColumn, NullableValue.of(INTEGER, 3L)) + .build())) + .add(new HivePartition(tablePartitionFormat, + "ds=2012-12-29/file_format=rcbinary/dummy=4", + ImmutableMap.builder() + .put(dsColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("2012-12-29"))) + .put(fileFormatColumn, NullableValue.of(createUnboundedVarcharType(), utf8Slice("rcbinary"))) + .put(dummyColumn, NullableValue.of(INTEGER, 4L)) + .build())) + .build(); + tableUnpartitionedPartitions = ImmutableList.of(new HivePartition(tableUnpartitioned)); + tablePartitionFormatProperties = new ConnectorTableProperties( + TupleDomain.withColumnDomains(ImmutableMap.of( + dsColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("2012-12-29"))), false), + fileFormatColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("textfile")), Range.equal(createUnboundedVarcharType(), utf8Slice("sequencefile")), Range.equal(createUnboundedVarcharType(), utf8Slice("rctext")), Range.equal(createUnboundedVarcharType(), 
utf8Slice("rcbinary"))), false), + dummyColumn, Domain.create(ValueSet.ofRanges(Range.equal(INTEGER, 1L), Range.equal(INTEGER, 2L), Range.equal(INTEGER, 3L), Range.equal(INTEGER, 4L)), false))), + Optional.empty(), + Optional.empty(), + Optional.of(new DiscretePredicates(partitionColumns, ImmutableList.of( + TupleDomain.withColumnDomains(ImmutableMap.of( + dsColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("2012-12-29"))), false), + fileFormatColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("textfile"))), false), + dummyColumn, Domain.create(ValueSet.ofRanges(Range.equal(INTEGER, 1L)), false))), + TupleDomain.withColumnDomains(ImmutableMap.of( + dsColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("2012-12-29"))), false), + fileFormatColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("sequencefile"))), false), + dummyColumn, Domain.create(ValueSet.ofRanges(Range.equal(INTEGER, 2L)), false))), + TupleDomain.withColumnDomains(ImmutableMap.of( + dsColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("2012-12-29"))), false), + fileFormatColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("rctext"))), false), + dummyColumn, Domain.create(ValueSet.ofRanges(Range.equal(INTEGER, 3L)), false))), + TupleDomain.withColumnDomains(ImmutableMap.of( + dsColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("2012-12-29"))), false), + fileFormatColumn, Domain.create(ValueSet.ofRanges(Range.equal(createUnboundedVarcharType(), utf8Slice("rcbinary"))), false), + dummyColumn, Domain.create(ValueSet.ofRanges(Range.equal(INTEGER, 4L)), false)))))), + ImmutableList.of()); + tableUnpartitionedProperties = new ConnectorTableProperties(); + } + + protected final void setup(String host, int port, String databaseName, String timeZone) + { + HiveConfig hiveConfig = getHiveConfig() + .setParquetTimeZone(timeZone) + .setRcfileTimeZone(timeZone); + String proxy = System.getProperty("hive.metastore.thrift.client.socks-proxy"); + if (proxy != null) { + hiveConfig.setMetastoreSocksProxy(HostAndPort.fromString(proxy)); + } + + MetastoreLocator metastoreLocator = new TestingMetastoreLocator(hiveConfig, host, port); + HiveMetastore metastore = new CachingHiveMetastore( + new BridgingHiveMetastore(new ThriftHiveMetastore(metastoreLocator, new ThriftHiveMetastoreConfig())), + executor, + executorRefresh, Duration.valueOf("1m"), + Duration.valueOf("15s"), + Duration.valueOf("1m"), + Duration.valueOf("15s"), + 10000, false); + + setup(databaseName, hiveConfig, metastore); + } + + protected final void setup(String databaseName, HiveConfig hiveConfig, HiveMetastore hiveMetastore) + { + setupHive(databaseName); + + metastoreClient = hiveMetastore; + HivePartitionManager partitionManager = new HivePartitionManager(TYPE_MANAGER, hiveConfig); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hiveConfig), ImmutableSet.of()); + hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hiveConfig, new NoHdfsAuthentication()); + locationService = new HiveLocationService(hdfsEnvironment); + JsonCodec partitionUpdateCodec = JsonCodec.jsonCodec(PartitionUpdate.class); + metadataFactory = new HiveMetadataFactory( + metastoreClient, + hdfsEnvironment, + partitionManager, + 10, + false, + false, + false, + true, + true, 
+ 1000, + Optional.empty(), + Duration.valueOf("5m"), + TYPE_MANAGER, + locationService, + partitionUpdateCodec, + newFixedThreadPool(2), + vacuumExecutorService, + heartbeatService, + hiveMetastoreClientService, + new HiveTypeTranslator(), + TEST_SERVER_VERSION, + SqlStandardAccessControlMetadata::new, + 10, 0.1, false, + Optional.of(Duration.valueOf("5m")), hiveConfig.getMetastoreWriteBatchSize()); + transactionManager = new HiveTransactionManager(); + splitManager = new HiveSplitManager( + transactionHandle -> ((HiveMetadata) transactionManager.get(transactionHandle)).getMetastore(), + partitionManager, + new NamenodeStats(), + hdfsEnvironment, + new CachingDirectoryLister(hiveConfig), + directExecutor(), + new HiveCoercionPolicy(TYPE_MANAGER), + new CounterStat(), + 100, + hiveConfig.getMaxOutstandingSplitsSize(), + hiveConfig.getMinPartitionBatchSize(), + hiveConfig.getMaxPartitionBatchSize(), + hiveConfig.getMaxInitialSplits(), + hiveConfig.getSplitLoaderConcurrency(), + hiveConfig.getMaxSplitsPerSecond(), + false, null, hiveConfig); + pageSinkProvider = new HivePageSinkProvider( + getDefaultHiveFileWriterFactories(hiveConfig), + hdfsEnvironment, + PAGE_SORTER, + metastoreClient, + new GroupByHashPageIndexerFactory(JOIN_COMPILER), + TYPE_MANAGER, + getHiveConfig(), + locationService, + partitionUpdateCodec, + new TestingNodeManager("fake-environment"), + new HiveEventClient(), + new HiveSessionProperties(hiveConfig, new OrcFileWriterConfig(), new ParquetFileWriterConfig()), + new HiveWriterStats(), + getDefaultOrcFileWriterFactory(hiveConfig)); + pageSourceProvider = new HivePageSourceProvider(hiveConfig, hdfsEnvironment, getDefaultHiveRecordCursorProvider(hiveConfig), getDefaultHiveDataStreamFactories(hiveConfig), TYPE_MANAGER, getNoOpIndexCache(), getDefaultHiveSelectiveFactories(hiveConfig)); + } + + /** + * Allow subclass to change default configuration. 
+ */ + protected HiveConfig getHiveConfig() + { + return new HiveConfig() + .setMaxOpenSortFiles(10) + .setWriterSortBufferSize(new DataSize(100, KILOBYTE)); + } + + protected ConnectorSession newSession() + { + return newSession(ImmutableMap.of()); + } + + protected ConnectorSession newSession(Map propertyValues) + { + HiveSessionProperties properties = new HiveSessionProperties(getHiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()); + return new TestingConnectorSession(properties.getSessionProperties(), propertyValues); + } + + protected Transaction newTransaction() + { + return new HiveTransaction(transactionManager, metadataFactory.get()); + } + + interface Transaction + extends AutoCloseable + { + ConnectorMetadata getMetadata(); + + SemiTransactionalHiveMetastore getMetastore(String schema); + + ConnectorTransactionHandle getTransactionHandle(); + + void commit(); + + void rollback(); + + @Override + void close(); + } + + static class HiveTransaction + implements Transaction + { + private final HiveTransactionManager transactionManager; + private final ConnectorTransactionHandle transactionHandle; + private boolean closed; + + public HiveTransaction(HiveTransactionManager transactionManager, HiveMetadata hiveMetadata) + { + this.transactionManager = requireNonNull(transactionManager, "transactionManager is null"); + this.transactionHandle = new HiveTransactionHandle(); + transactionManager.put(transactionHandle, hiveMetadata); + getMetastore().testOnlyThrowOnCleanupFailures(); + } + + @Override + public ConnectorMetadata getMetadata() + { + return transactionManager.get(transactionHandle); + } + + @Override + public SemiTransactionalHiveMetastore getMetastore(String schema) + { + return getMetastore(); + } + + private SemiTransactionalHiveMetastore getMetastore() + { + return ((HiveMetadata) transactionManager.get(transactionHandle)).getMetastore(); + } + + @Override + public ConnectorTransactionHandle getTransactionHandle() + { + return transactionHandle; + } + + @Override + public void commit() + { + checkState(!closed); + closed = true; + HiveMetadata metadata = (HiveMetadata) transactionManager.remove(transactionHandle); + checkArgument(metadata != null, "no such transaction: %s", transactionHandle); + metadata.commit(); + } + + @Override + public void rollback() + { + checkState(!closed); + closed = true; + HiveMetadata metadata = (HiveMetadata) transactionManager.remove(transactionHandle); + checkArgument(metadata != null, "no such transaction: %s", transactionHandle); + metadata.rollback(); + } + + @Override + public void close() + { + if (!closed) { + try { + getMetastore().testOnlyCheckIsReadOnly(); // transactions in this test with writes in it must explicitly commit or rollback + } + finally { + rollback(); + } + } + } + } + + @Test + public void testGetDatabaseNames() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + List databases = metadata.listSchemaNames(newSession()); + assertTrue(databases.contains(database)); + } + } + + @Test + public void testGetTableNames() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + List tables = metadata.listTables(newSession(), Optional.of(database)); + assertTrue(tables.contains(tablePartitionFormat)); + assertTrue(tables.contains(tableUnpartitioned)); + } + } + + @Test + public void testGetAllTableNames() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata 
metadata = transaction.getMetadata(); + List tables = metadata.listTables(newSession(), Optional.empty()); + assertTrue(tables.contains(tablePartitionFormat)); + assertTrue(tables.contains(tableUnpartitioned)); + } + } + + @Test + public void testGetAllTableColumns() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + Map> allColumns = metadata.listTableColumns(newSession(), new SchemaTablePrefix()); + assertTrue(allColumns.containsKey(tablePartitionFormat)); + assertTrue(allColumns.containsKey(tableUnpartitioned)); + } + } + + @Test + public void testGetAllTableColumnsInSchema() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + Map> allColumns = metadata.listTableColumns(newSession(), new SchemaTablePrefix(database)); + assertTrue(allColumns.containsKey(tablePartitionFormat)); + assertTrue(allColumns.containsKey(tableUnpartitioned)); + } + } + + @Test + public void testListUnknownSchema() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + assertNull(metadata.getTableHandle(session, new SchemaTableName(INVALID_DATABASE, INVALID_TABLE))); + assertEquals(metadata.listTables(session, Optional.of(INVALID_DATABASE)), ImmutableList.of()); + assertEquals(metadata.listTableColumns(session, new SchemaTablePrefix(INVALID_DATABASE, INVALID_TABLE)), ImmutableMap.of()); + assertEquals(metadata.listViews(session, Optional.of(INVALID_DATABASE)), ImmutableList.of()); + assertEquals(metadata.getViews(session, Optional.of(INVALID_DATABASE)), ImmutableMap.of()); + assertEquals(metadata.getView(session, new SchemaTableName(INVALID_DATABASE, INVALID_TABLE)), Optional.empty()); + } + } + + @Test + public void testGetPartitions() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tablePartitionFormat); + tableHandle = applyFilter(metadata, tableHandle, Constraint.alwaysTrue()); + ConnectorTableProperties properties = metadata.getTableProperties(newSession(), tableHandle); + assertExpectedTableProperties(properties, tablePartitionFormatProperties); + assertExpectedPartitions(tableHandle, tablePartitionFormatPartitions); + } + } + + @Test + public void testGetPartitionsWithBindings() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tablePartitionFormat); + Constraint constraint = new Constraint(TupleDomain.withColumnDomains(ImmutableMap.of(intColumn, Domain.singleValue(BIGINT, 5L)))); + tableHandle = applyFilter(metadata, tableHandle, constraint); + ConnectorTableProperties properties = metadata.getTableProperties(newSession(), tableHandle); + assertExpectedTableProperties(properties, tablePartitionFormatProperties); + assertExpectedPartitions(tableHandle, tablePartitionFormatPartitions); + } + } + + @Test + public void testMismatchSchemaTable() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + // TODO: fix coercion for JSON + if (storageFormat == JSON) { + continue; + } + SchemaTableName temporaryMismatchSchemaTable = temporaryTable("mismatch_schema"); + try { + doTestMismatchSchemaTable( + temporaryMismatchSchemaTable, + storageFormat, + MISMATCH_SCHEMA_TABLE_BEFORE, + 
MISMATCH_SCHEMA_TABLE_DATA_BEFORE, + MISMATCH_SCHEMA_TABLE_AFTER, + MISMATCH_SCHEMA_TABLE_DATA_AFTER); + } + finally { + dropTable(temporaryMismatchSchemaTable); + } + } + } + + protected void doTestMismatchSchemaTable( + SchemaTableName schemaTableName, + HiveStorageFormat storageFormat, + List tableBefore, + MaterializedResult dataBefore, + List tableAfter, + MaterializedResult dataAfter) + throws Exception + { + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + + doCreateEmptyTable(schemaTableName, storageFormat, tableBefore); + + // insert the data + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, schemaTableName); + + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + sink.appendPage(dataBefore.toPage()); + Collection fragments = getFutureValue(sink.finish()); + + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + + transaction.commit(); + } + + // load the table and verify the data + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, schemaTableName); + + List columnHandles = metadata.getColumnHandles(session, tableHandle).values().stream() + .filter(columnHandle -> !((HiveColumnHandle) columnHandle).isHidden()) + .collect(toList()); + + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), dataBefore.getMaterializedRows()); + transaction.commit(); + } + + // alter the table schema + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + PrincipalPrivileges principalPrivileges = testingPrincipalPrivilege(session); + Table oldTable = transaction.getMetastore(schemaName).getTable(new HiveIdentity(session), schemaName, tableName).get(); + HiveTypeTranslator hiveTypeTranslator = new HiveTypeTranslator(); + List dataColumns = tableAfter.stream() + .filter(columnMetadata -> !columnMetadata.getName().equals("ds")) + .map(columnMetadata -> new Column(columnMetadata.getName(), toHiveType(hiveTypeTranslator, columnMetadata.getType()), Optional.empty())) + .collect(toList()); + Table.Builder newTable = Table.builder(oldTable) + .setDataColumns(dataColumns); + + transaction.getMetastore(schemaName).replaceView(new HiveIdentity(session), schemaName, tableName, newTable.build(), principalPrivileges); + + transaction.commit(); + } + + // load the altered table and verify the data + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, schemaTableName); + List columnHandles = metadata.getColumnHandles(session, tableHandle).values().stream() + .filter(columnHandle -> !((HiveColumnHandle) columnHandle).isHidden()) + .collect(toList()); + + MaterializedResult result = readTable(transaction, 
tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), dataAfter.getMaterializedRows()); + + transaction.commit(); + } + + // insertions to the partitions with type mismatches should fail + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, schemaTableName); + + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + sink.appendPage(dataAfter.toPage()); + Collection fragments = getFutureValue(sink.finish()); + + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + + transaction.commit(); + + fail("expected exception"); + } + catch (PrestoException e) { + // expected + assertEquals(e.getErrorCode(), HiveErrorCode.HIVE_PARTITION_SCHEMA_MISMATCH.toErrorCode()); + } + } + + protected void assertExpectedTableProperties(ConnectorTableProperties actualProperties, ConnectorTableProperties expectedProperties) + { + assertEquals(actualProperties.getPredicate(), expectedProperties.getPredicate()); + assertEquals(actualProperties.getDiscretePredicates().isPresent(), expectedProperties.getDiscretePredicates().isPresent()); + actualProperties.getDiscretePredicates().ifPresent(actual -> { + DiscretePredicates expected = expectedProperties.getDiscretePredicates().get(); + assertEquals(actual.getColumns(), expected.getColumns()); + assertEqualsIgnoreOrder(actual.getPredicates(), expected.getPredicates()); + }); + assertEquals(actualProperties.getStreamPartitioningColumns(), expectedProperties.getStreamPartitioningColumns()); + assertEquals(actualProperties.getLocalProperties(), expectedProperties.getLocalProperties()); + } + + protected void assertExpectedPartitions(ConnectorTableHandle table, Iterable expectedPartitions) + { + Iterable actualPartitions = ((HiveTableHandle) table).getPartitions().orElseThrow(AssertionError::new); + Map actualById = uniqueIndex(actualPartitions, HivePartition::getPartitionId); + for (Object expected : expectedPartitions) { + assertInstanceOf(expected, HivePartition.class); + HivePartition expectedPartition = (HivePartition) expected; + + Object actual = actualById.get(expectedPartition.getPartitionId()); + assertEquals(actual, expected); + assertInstanceOf(actual, HivePartition.class); + HivePartition actualPartition = (HivePartition) actual; + + assertNotNull(actualPartition, "partition " + expectedPartition.getPartitionId()); + assertEquals(actualPartition.getPartitionId(), expectedPartition.getPartitionId()); + assertEquals(actualPartition.getKeys(), expectedPartition.getKeys()); + assertEquals(actualPartition.getTableName(), expectedPartition.getTableName()); + } + } + + @Test + public void testGetPartitionNamesUnpartitioned() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableUnpartitioned); + tableHandle = applyFilter(metadata, tableHandle, Constraint.alwaysTrue()); + ConnectorTableProperties properties = metadata.getTableProperties(newSession(), tableHandle); + assertExpectedTableProperties(properties, new ConnectorTableProperties()); + 
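// an unpartitioned table is expected to expose default table properties and only its single implicit partition
+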
assertExpectedPartitions(tableHandle, tableUnpartitionedPartitions); + } + } + + @Test + public void testGetTableSchemaPartitionFormat() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(newSession(), getTableHandle(metadata, tablePartitionFormat)); + Map map = uniqueIndex(tableMetadata.getColumns(), ColumnMetadata::getName); + + assertPrimitiveField(map, "t_string", createUnboundedVarcharType(), false); + assertPrimitiveField(map, "t_tinyint", TINYINT, false); + assertPrimitiveField(map, "t_smallint", SMALLINT, false); + assertPrimitiveField(map, "t_int", INTEGER, false); + assertPrimitiveField(map, "t_bigint", BIGINT, false); + assertPrimitiveField(map, "t_float", REAL, false); + assertPrimitiveField(map, "t_double", DOUBLE, false); + assertPrimitiveField(map, "t_boolean", BOOLEAN, false); + assertPrimitiveField(map, "ds", createUnboundedVarcharType(), true); + assertPrimitiveField(map, "file_format", createUnboundedVarcharType(), true); + assertPrimitiveField(map, "dummy", INTEGER, true); + } + } + + @Test + public void testGetTableSchemaUnpartitioned() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableUnpartitioned); + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(newSession(), tableHandle); + Map map = uniqueIndex(tableMetadata.getColumns(), ColumnMetadata::getName); + + assertPrimitiveField(map, "t_string", createUnboundedVarcharType(), false); + assertPrimitiveField(map, "t_tinyint", TINYINT, false); + } + } + + @Test + public void testGetTableSchemaOffline() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + Map> columns = metadata.listTableColumns(newSession(), tableOffline.toSchemaTablePrefix()); + assertEquals(columns.size(), 1); + Map map = uniqueIndex(getOnlyElement(columns.values()), ColumnMetadata::getName); + + assertPrimitiveField(map, "t_string", createUnboundedVarcharType(), false); + } + } + + @Test + public void testGetTableSchemaOfflinePartition() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableOfflinePartition); + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(newSession(), tableHandle); + Map map = uniqueIndex(tableMetadata.getColumns(), ColumnMetadata::getName); + + assertPrimitiveField(map, "t_string", createUnboundedVarcharType(), false); + } + } + + @Test + public void testGetTableSchemaNotReadablePartition() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableNotReadable); + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(newSession(), tableHandle); + Map map = uniqueIndex(tableMetadata.getColumns(), ColumnMetadata::getName); + + assertPrimitiveField(map, "t_string", createUnboundedVarcharType(), false); + } + } + + @Test + public void testGetTableSchemaException() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + assertNull(metadata.getTableHandle(newSession(), invalidTable)); + } + } + + @Test + public void testGetTableStatsBucketedStringInt() + { + 
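// row count plus per-column statistics (nulls fraction, distinct-value count, and data size for varchar columns)
+ // are expected for every listed column of the bucketed test table, including the "ds" partition key
+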
assertTableStatsComputed( + tableBucketedStringInt, + ImmutableSet.of( + "t_bigint", + "t_boolean", + "t_double", + "t_float", + "t_int", + "t_smallint", + "t_string", + "t_tinyint", + "ds")); + } + + @Test + public void testGetTableStatsUnpartitioned() + { + assertTableStatsComputed( + tableUnpartitioned, + ImmutableSet.of("t_string", "t_tinyint")); + } + + private void assertTableStatsComputed( + SchemaTableName tableName, + Set expectedColumnStatsColumns) + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, Constraint.alwaysTrue(), true); + + assertFalse(tableStatistics.getRowCount().isUnknown(), "row count is unknown"); + + Map columnsStatistics = tableStatistics + .getColumnStatistics() + .entrySet() + .stream() + .collect( + toImmutableMap( + entry -> ((HiveColumnHandle) entry.getKey()).getName(), + Map.Entry::getValue)); + + assertEquals(columnsStatistics.keySet(), expectedColumnStatsColumns, "columns with statistics"); + + Map columnHandles = metadata.getColumnHandles(session, tableHandle); + columnsStatistics.forEach((columnName, columnStatistics) -> { + ColumnHandle columnHandle = columnHandles.get(columnName); + Type columnType = metadata.getColumnMetadata(session, tableHandle, columnHandle).getType(); + + assertFalse( + columnStatistics.getNullsFraction().isUnknown(), + "unknown nulls fraction for " + columnName); + + assertFalse( + columnStatistics.getDistinctValuesCount().isUnknown(), + "unknown distinct values count for " + columnName); + + if (isVarcharType(columnType)) { + assertFalse( + columnStatistics.getDataSize().isUnknown(), + "unknown data size for " + columnName); + } + else { + assertTrue( + columnStatistics.getDataSize().isUnknown(), + "unknown data size for" + columnName); + } + }); + } + } + + @Test + public void testGetPartitionSplitsBatch() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tablePartitionFormat); + ConnectorSplitSource splitSource = splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING); + + assertEquals(getSplitCount(splitSource), tablePartitionFormatPartitions.size()); + } + } + + @Test + public void testGetPartitionSplitsBatchUnpartitioned() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableUnpartitioned); + ConnectorSplitSource splitSource = splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING); + + assertEquals(getSplitCount(splitSource), 1); + } + } + + @Test(expectedExceptions = TableNotFoundException.class) + public void testGetPartitionSplitsBatchInvalidTable() + { + try (Transaction transaction = newTransaction()) { + splitManager.getSplits(transaction.getTransactionHandle(), newSession(), invalidTableHandle, UNGROUPED_SCHEDULING); + } + } + + @Test + public void testGetPartitionTableOffline() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata 
metadata = transaction.getMetadata(); + try { + getTableHandle(metadata, tableOffline); + fail("expected TableOfflineException"); + } + catch (TableOfflineException e) { + assertEquals(e.getTableName(), tableOffline); + } + } + } + + @Test + public void testGetPartitionSplitsTableOfflinePartition() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableOfflinePartition); + assertNotNull(tableHandle); + + ColumnHandle dsColumn = metadata.getColumnHandles(session, tableHandle).get("ds"); + assertNotNull(dsColumn); + + Domain domain = Domain.singleValue(createUnboundedVarcharType(), utf8Slice("2012-12-30")); + TupleDomain tupleDomain = TupleDomain.withColumnDomains(ImmutableMap.of(dsColumn, domain)); + tableHandle = applyFilter(metadata, tableHandle, new Constraint(tupleDomain)); + + try { + getSplitCount(splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING)); + fail("Expected PartitionOfflineException"); + } + catch (PartitionOfflineException e) { + assertEquals(e.getTableName(), tableOfflinePartition); + assertEquals(e.getPartition(), "ds=2012-12-30"); + } + } + } + + @Test + public void testGetPartitionSplitsTableNotReadablePartition() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableNotReadable); + assertNotNull(tableHandle); + + try { + getSplitCount(splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING)); + fail("Expected HiveNotReadableException"); + } + catch (HiveNotReadableException e) { + assertThat(e).hasMessageMatching("Table '.*\\.presto_test_not_readable' is not readable: reason for not readable"); + assertEquals(e.getTableName(), tableNotReadable); + assertEquals(e.getPartition(), Optional.empty()); + } + } + } + + @Test + public void testBucketedTableStringInt() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableBucketedStringInt); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + Map columnIndex = indexColumns(columnHandles); + + assertTableIsBucketed(tableHandle, transaction, session); + + String testString = "test"; + Integer testInt = 13; + Short testSmallint = 12; + + // Reverse the order of bindings as compared to bucketing order + ImmutableMap bindings = ImmutableMap.builder() + .put(columnHandles.get(columnIndex.get("t_int")), NullableValue.of(INTEGER, (long) testInt)) + .put(columnHandles.get(columnIndex.get("t_string")), NullableValue.of(createUnboundedVarcharType(), utf8Slice(testString))) + .put(columnHandles.get(columnIndex.get("t_smallint")), NullableValue.of(SMALLINT, (long) testSmallint)) + .build(); + + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.fromFixedValues(bindings), OptionalInt.of(1), Optional.empty()); + + boolean rowFound = false; + for (MaterializedRow row : result) { + if (testString.equals(row.getField(columnIndex.get("t_string"))) && + 
testInt.equals(row.getField(columnIndex.get("t_int"))) && + testSmallint.equals(row.getField(columnIndex.get("t_smallint")))) { + rowFound = true; + } + } + assertTrue(rowFound); + } + } + + @SuppressWarnings("ConstantConditions") + @Test + public void testBucketedTableBigintBoolean() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableBucketedBigintBoolean); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + Map columnIndex = indexColumns(columnHandles); + + assertTableIsBucketed(tableHandle, transaction, session); + + String testString = "test"; + Long testBigint = 89L; + Boolean testBoolean = true; + + ImmutableMap bindings = ImmutableMap.builder() + .put(columnHandles.get(columnIndex.get("t_string")), NullableValue.of(createUnboundedVarcharType(), utf8Slice(testString))) + .put(columnHandles.get(columnIndex.get("t_bigint")), NullableValue.of(BIGINT, testBigint)) + .put(columnHandles.get(columnIndex.get("t_boolean")), NullableValue.of(BOOLEAN, testBoolean)) + .build(); + + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.fromFixedValues(bindings), OptionalInt.of(1), Optional.empty()); + + boolean rowFound = false; + for (MaterializedRow row : result) { + if (testString.equals(row.getField(columnIndex.get("t_string"))) && + testBigint.equals(row.getField(columnIndex.get("t_bigint"))) && + testBoolean.equals(row.getField(columnIndex.get("t_boolean")))) { + rowFound = true; + break; + } + } + assertTrue(rowFound); + } + } + + @Test + public void testBucketedTableDoubleFloat() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableBucketedDoubleFloat); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + Map columnIndex = indexColumns(columnHandles); + + assertTableIsBucketed(tableHandle, transaction, session); + + ImmutableMap bindings = ImmutableMap.builder() + .put(columnHandles.get(columnIndex.get("t_float")), NullableValue.of(REAL, (long) floatToRawIntBits(87.1f))) + .put(columnHandles.get(columnIndex.get("t_double")), NullableValue.of(DOUBLE, 88.2)) + .build(); + + // floats and doubles are not supported, so we should see all splits + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.fromFixedValues(bindings), OptionalInt.of(32), Optional.empty()); + assertEquals(result.getRowCount(), 100); + } + } + + @Test + public void testBucketedTableEvolution() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryBucketEvolutionTable = temporaryTable("bucket_evolution"); + try { + doTestBucketedTableEvolution(storageFormat, temporaryBucketEvolutionTable); + } + finally { + dropTable(temporaryBucketEvolutionTable); + } + } + } + + private void doTestBucketedTableEvolution(HiveStorageFormat storageFormat, SchemaTableName tableName) + throws Exception + { + int rowCount = 100; + + // + // Produce a table with 8 buckets. + // The table has 3 partitions of 3 different bucket count (4, 8, 16). 
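+ // Partitions are written with physical bucket counts 4, 16 and 8 while the table-level
+ // bucket count ends up as 8, so reading a single logical bucket has to handle partitions
+ // whose layout differs from the table's current bucket count.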
+ createEmptyTable( + tableName, + storageFormat, + ImmutableList.of( + new Column("id", HIVE_LONG, Optional.empty()), + new Column("name", HIVE_STRING, Optional.empty())), + ImmutableList.of(new Column("pk", HIVE_STRING, Optional.empty())), + Optional.of(new HiveBucketProperty(ImmutableList.of("id"), BUCKETING_V1, 4, ImmutableList.of()))); + // write a 4-bucket partition + MaterializedResult.Builder bucket4Builder = MaterializedResult.resultBuilder(SESSION, BIGINT, VARCHAR, VARCHAR); + IntStream.range(0, rowCount).forEach(i -> bucket4Builder.row((long) i, String.valueOf(i), "four")); + insertData(tableName, bucket4Builder.build()); + // write a 16-bucket partition + alterBucketProperty(tableName, Optional.of(new HiveBucketProperty(ImmutableList.of("id"), BUCKETING_V1, 16, ImmutableList.of()))); + MaterializedResult.Builder bucket16Builder = MaterializedResult.resultBuilder(SESSION, BIGINT, VARCHAR, VARCHAR); + IntStream.range(0, rowCount).forEach(i -> bucket16Builder.row((long) i, String.valueOf(i), "sixteen")); + insertData(tableName, bucket16Builder.build()); + // write an 8-bucket partition + alterBucketProperty(tableName, Optional.of(new HiveBucketProperty(ImmutableList.of("id"), BUCKETING_V1, 8, ImmutableList.of()))); + MaterializedResult.Builder bucket8Builder = MaterializedResult.resultBuilder(SESSION, BIGINT, VARCHAR, VARCHAR); + IntStream.range(0, rowCount).forEach(i -> bucket8Builder.row((long) i, String.valueOf(i), "eight")); + insertData(tableName, bucket8Builder.build()); + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + // read entire table + List columnHandles = ImmutableList.builder() + .addAll(metadata.getColumnHandles(session, tableHandle).values()) + .build(); + MaterializedResult result = readTable( + transaction, + tableHandle, + columnHandles, + session, + TupleDomain.all(), + OptionalInt.empty(), + Optional.empty()); + assertBucketTableEvolutionResult(result, columnHandles, ImmutableSet.of(0, 1, 2, 3, 4, 5, 6, 7), rowCount); + + // read single bucket (table/logical bucket) + result = readTable( + transaction, + tableHandle, + columnHandles, + session, + TupleDomain.fromFixedValues(ImmutableMap.of(bucketColumnHandle(), NullableValue.of(INTEGER, 6L))), + OptionalInt.empty(), + Optional.empty()); + assertBucketTableEvolutionResult(result, columnHandles, ImmutableSet.of(6), rowCount); + + // read single bucket, without selecting the bucketing column (i.e. id column) + columnHandles = ImmutableList.builder() + .addAll(metadata.getColumnHandles(session, tableHandle).values().stream() + .filter(columnHandle -> !"id".equals(((HiveColumnHandle) columnHandle).getName())) + .collect(toImmutableList())) + .build(); + result = readTable( + transaction, + tableHandle, + columnHandles, + session, + TupleDomain.fromFixedValues(ImmutableMap.of(bucketColumnHandle(), NullableValue.of(INTEGER, 6L))), + OptionalInt.empty(), + Optional.empty()); + assertBucketTableEvolutionResult(result, columnHandles, ImmutableSet.of(6), rowCount); + } + } + + private static void assertBucketTableEvolutionResult(MaterializedResult result, List columnHandles, Set bucketIds, int rowCount) + { + // Assert that only elements in the specified bucket shows up, and each element shows up 3 times. 
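+ // For example, with bucketIds = {6} the expected ids are 6, 14, 22, ... (id % 8 == 6),
+ // and each of them must appear exactly three times, once per partition ("four", "eight", "sixteen").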
+ int bucketCount = 8; + Set expectedIds = LongStream.range(0, rowCount) + .filter(x -> bucketIds.contains(toIntExact(x % bucketCount))) + .boxed() + .collect(toImmutableSet()); + + // assert that content from all three buckets are the same + Map columnIndex = indexColumns(columnHandles); + OptionalInt idColumnIndex = columnIndex.containsKey("id") ? OptionalInt.of(columnIndex.get("id")) : OptionalInt.empty(); + int nameColumnIndex = columnIndex.get("name"); + int bucketColumnIndex = columnIndex.get(BUCKET_COLUMN_NAME); + Map idCount = new HashMap<>(); + for (MaterializedRow row : result.getMaterializedRows()) { + String name = (String) row.getField(nameColumnIndex); + int bucket = (int) row.getField(bucketColumnIndex); + idCount.compute(Long.parseLong(name), (key, oldValue) -> oldValue == null ? 1 : oldValue + 1); + assertEquals(bucket, Integer.parseInt(name) % bucketCount); + if (idColumnIndex.isPresent()) { + long id = (long) row.getField(idColumnIndex.getAsInt()); + assertEquals(Integer.parseInt(name), id); + } + } + assertEquals( + (int) idCount.values().stream() + .distinct() + .collect(onlyElement()), + 3); + assertEquals(idCount.keySet(), expectedIds); + } + + private void assertTableIsBucketed(ConnectorTableHandle tableHandle, Transaction transaction, ConnectorSession session) + { + // the bucketed test tables should have exactly 32 splits + List splits = getAllSplits(tableHandle, transaction, session); + assertEquals(splits.size(), 32); + + // verify all paths are unique + Set paths = new HashSet<>(); + for (ConnectorSplit split : splits) { + assertTrue(paths.add(HiveSplitWrapper.getOnlyHiveSplit(split).getPath())); + } + } + + @Test + public void testGetRecords() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tablePartitionFormat); + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, tableHandle); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + Map columnIndex = indexColumns(columnHandles); + + List splits = getAllSplits(tableHandle, transaction, session); + assertEquals(splits.size(), tablePartitionFormatPartitions.size()); + + for (ConnectorSplit split : splits) { + HiveSplit hiveSplit = getOnlyElement(((HiveSplitWrapper) split).getSplits()); + + List partitionKeys = hiveSplit.getPartitionKeys(); + String ds = partitionKeys.get(0).getValue(); + String fileFormat = partitionKeys.get(1).getValue(); + HiveStorageFormat fileType = HiveStorageFormat.valueOf(fileFormat.toUpperCase()); + int dummyPartition = Integer.parseInt(partitionKeys.get(2).getValue()); + + long rowNumber = 0; + long completedBytes = 0; + try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) { + MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles)); + + assertPageSourceType(pageSource, fileType); + + for (MaterializedRow row : result) { + try { + assertValueTypes(row, tableMetadata.getColumns()); + } + catch (RuntimeException e) { + throw new RuntimeException("row " + rowNumber, e); + } + + rowNumber++; + Object value; + + value = row.getField(columnIndex.get("t_string")); + if (rowNumber % 19 == 0) { + assertNull(value); + } + else if (rowNumber % 19 == 1) { + 
assertEquals(value, ""); + } + else { + assertEquals(value, "test"); + } + + assertEquals(row.getField(columnIndex.get("t_tinyint")), (byte) (1 + rowNumber)); + assertEquals(row.getField(columnIndex.get("t_smallint")), (short) (2 + rowNumber)); + assertEquals(row.getField(columnIndex.get("t_int")), 3 + (int) rowNumber); + + if (rowNumber % 13 == 0) { + assertNull(row.getField(columnIndex.get("t_bigint"))); + } + else { + assertEquals(row.getField(columnIndex.get("t_bigint")), 4 + rowNumber); + } + + assertEquals((Float) row.getField(columnIndex.get("t_float")), 5.1f + rowNumber, 0.001); + assertEquals(row.getField(columnIndex.get("t_double")), 6.2 + rowNumber); + + if (rowNumber % 3 == 2) { + assertNull(row.getField(columnIndex.get("t_boolean"))); + } + else { + assertEquals(row.getField(columnIndex.get("t_boolean")), rowNumber % 3 != 0); + } + + assertEquals(row.getField(columnIndex.get("ds")), ds); + assertEquals(row.getField(columnIndex.get("file_format")), fileFormat); + assertEquals(row.getField(columnIndex.get("dummy")), dummyPartition); + + long newCompletedBytes = pageSource.getCompletedBytes(); + assertTrue(newCompletedBytes >= completedBytes); + assertTrue(newCompletedBytes <= hiveSplit.getLength()); + completedBytes = newCompletedBytes; + } + + assertTrue(completedBytes <= hiveSplit.getLength()); + assertEquals(rowNumber, 100); + } + } + } + } + + @Test + public void testGetPartialRecords() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tablePartitionFormat); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + Map columnIndex = indexColumns(columnHandles); + + List splits = getAllSplits(tableHandle, transaction, session); + assertEquals(splits.size(), tablePartitionFormatPartitions.size()); + + for (ConnectorSplit split : splits) { + HiveSplit hiveSplit = HiveSplitWrapper.getOnlyHiveSplit(split); + + List partitionKeys = hiveSplit.getPartitionKeys(); + String ds = partitionKeys.get(0).getValue(); + String fileFormat = partitionKeys.get(1).getValue(); + HiveStorageFormat fileType = HiveStorageFormat.valueOf(fileFormat.toUpperCase()); + int dummyPartition = Integer.parseInt(partitionKeys.get(2).getValue()); + + long rowNumber = 0; + try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) { + assertPageSourceType(pageSource, fileType); + MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles)); + for (MaterializedRow row : result) { + rowNumber++; + + assertEquals(row.getField(columnIndex.get("t_double")), 6.2 + rowNumber); + assertEquals(row.getField(columnIndex.get("ds")), ds); + assertEquals(row.getField(columnIndex.get("file_format")), fileFormat); + assertEquals(row.getField(columnIndex.get("dummy")), dummyPartition); + } + } + assertEquals(rowNumber, 100); + } + } + } + + @Test + public void testGetRecordsUnpartitioned() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableUnpartitioned); + List columnHandles = 
ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + Map columnIndex = indexColumns(columnHandles); + + List splits = getAllSplits(tableHandle, transaction, session); + assertThat(splits).hasSameSizeAs(tableUnpartitionedPartitions); + + for (ConnectorSplit split : splits) { + HiveSplit hiveSplit = HiveSplitWrapper.getOnlyHiveSplit(split); + + assertEquals(hiveSplit.getPartitionKeys(), ImmutableList.of()); + + long rowNumber = 0; + try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) { + assertPageSourceType(pageSource, TEXTFILE); + MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles)); + + for (MaterializedRow row : result) { + rowNumber++; + + if (rowNumber % 19 == 0) { + assertNull(row.getField(columnIndex.get("t_string"))); + } + else if (rowNumber % 19 == 1) { + assertEquals(row.getField(columnIndex.get("t_string")), ""); + } + else { + assertEquals(row.getField(columnIndex.get("t_string")), "unpartitioned"); + } + + assertEquals(row.getField(columnIndex.get("t_tinyint")), (byte) (1 + rowNumber)); + } + } + assertEquals(rowNumber, 100); + } + } + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = ".*" + INVALID_COLUMN + ".*") + public void testGetRecordsInvalidColumn() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata connectorMetadata = transaction.getMetadata(); + ConnectorTableHandle table = getTableHandle(connectorMetadata, tableUnpartitioned); + ConnectorSession session = newSession(); + connectorMetadata.beginQuery(session); + readTable(transaction, table, ImmutableList.of(invalidColumnHandle), session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + } + } + + @Test(expectedExceptions = PrestoException.class, expectedExceptionsMessageRegExp = ".*The column 't_data' in table '.*\\.presto_test_partition_schema_change' is declared as type 'double', but partition 'ds=2012-12-29' declared column 't_data' as type 'string'.") + public void testPartitionSchemaMismatch() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle table = getTableHandle(metadata, tablePartitionSchemaChange); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + readTable(transaction, table, ImmutableList.of(dsColumn), session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + } + } + + // TODO coercion of non-canonical values should be supported + @Test(enabled = false) + public void testPartitionSchemaNonCanonical() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + ConnectorTableHandle table = getTableHandle(metadata, tablePartitionSchemaChangeNonCanonical); + ColumnHandle column = metadata.getColumnHandles(session, table).get("t_boolean"); + + Constraint constraint = new Constraint(TupleDomain.fromFixedValues(ImmutableMap.of(column, NullableValue.of(BOOLEAN, false)))); + table = applyFilter(metadata, table, constraint); + HivePartition partition = getOnlyElement(((HiveTableHandle) table).getPartitions().orElseThrow(AssertionError::new)); + assertEquals(getPartitionId(partition), "t_boolean=0"); + + ConnectorSplitSource splitSource = 
splitManager.getSplits(transaction.getTransactionHandle(), session, table, UNGROUPED_SCHEDULING); + ConnectorSplit split = getOnlyElement(getAllSplits(splitSource)); + + ImmutableList columnHandles = ImmutableList.of(column); + try (ConnectorPageSource ignored = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, table, columnHandles)) { + fail("expected exception"); + } + catch (PrestoException e) { + assertEquals(e.getErrorCode(), HiveErrorCode.HIVE_INVALID_PARTITION_VALUE.toErrorCode()); + } + } + } + + @Test + public void testTypesTextFile() + throws Exception + { + assertGetRecords("presto_test_types_textfile", TEXTFILE); + } + + @Test + public void testTypesSequenceFile() + throws Exception + { + assertGetRecords("presto_test_types_sequencefile", SEQUENCEFILE); + } + + @Test + public void testTypesRcText() + throws Exception + { + assertGetRecords("presto_test_types_rctext", RCTEXT); + } + + @Test + public void testTypesRcBinary() + throws Exception + { + assertGetRecords("presto_test_types_rcbinary", RCBINARY); + } + + @Test + public void testTypesOrc() + throws Exception + { + assertGetRecords("presto_test_types_orc", ORC); + } + + @Test + public void testTypesParquet() + throws Exception + { + assertGetRecords("presto_test_types_parquet", PARQUET); + } + + @Test + public void testEmptyTextFile() + throws Exception + { + assertEmptyFile(TEXTFILE); + } + + @Test(expectedExceptions = PrestoException.class, expectedExceptionsMessageRegExp = "Error opening Hive split .* not a SequenceFile") + public void testEmptySequenceFile() + throws Exception + { + assertEmptyFile(SEQUENCEFILE); + } + + @Test(expectedExceptions = PrestoException.class, expectedExceptionsMessageRegExp = "RCFile is empty: .*") + public void testEmptyRcTextFile() + throws Exception + { + assertEmptyFile(RCTEXT); + } + + @Test(expectedExceptions = PrestoException.class, expectedExceptionsMessageRegExp = "RCFile is empty: .*") + public void testEmptyRcBinaryFile() + throws Exception + { + assertEmptyFile(RCBINARY); + } + + @Test + public void testEmptyOrcFile() + throws Exception + { + assertEmptyFile(ORC); + } + + private void assertEmptyFile(HiveStorageFormat format) + throws Exception + { + SchemaTableName tableName = temporaryTable("empty_file"); + try { + List columns = ImmutableList.of(new Column("test", HIVE_STRING, Optional.empty())); + createEmptyTable(tableName, format, columns, ImmutableList.of()); + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + Table table = transaction.getMetastore(tableName.getSchemaName()) + .getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(AssertionError::new); + + // verify directory is empty + HdfsContext context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + Path location = new Path(table.getStorage().getLocation()); + assertTrue(listDirectory(context, location).isEmpty()); + + // read table with empty directory + readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.of(0), Optional.of(ORC)); + + // create empty file + FileSystem fileSystem = hdfsEnvironment.getFileSystem(context, location); + 
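// a zero-length file is expected to read as zero rows for the formats that tolerate it (e.g. TEXTFILE, ORC),
+ // while the SequenceFile and RCFile variants fail during the read (see the expectedExceptions tests above)
+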
assertTrue(fileSystem.createNewFile(new Path(location, "empty-file"))); + assertEquals(listDirectory(context, location), ImmutableList.of("empty-file")); + + // read table with empty file + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.of(1), Optional.empty()); + assertEquals(result.getRowCount(), 0); + } + } + finally { + dropTable(tableName); + } + } + + @Test + public void testHiveViewsAreNotSupported() + { + try (Transaction transaction = newTransaction()) { + try { + ConnectorMetadata metadata = transaction.getMetadata(); + getTableHandle(metadata, view); + fail("Expected HiveViewNotSupportedException"); + } + catch (HiveViewNotSupportedException e) { + assertEquals(e.getTableName(), view); + } + } + } + + @Test + public void testHiveViewsHaveNoColumns() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + assertEquals(metadata.listTableColumns(newSession(), new SchemaTablePrefix(view.getSchemaName(), view.getTableName())), ImmutableMap.of()); + } + } + + @Test + public void testRenameTable() + { + SchemaTableName temporaryRenameTableOld = temporaryTable("rename_old"); + SchemaTableName temporaryRenameTableNew = temporaryTable("rename_new"); + try { + createDummyTable(temporaryRenameTableOld); + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + metadata.renameTable(session, getTableHandle(metadata, temporaryRenameTableOld), temporaryRenameTableNew); + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + assertNull(metadata.getTableHandle(session, temporaryRenameTableOld)); + assertNotNull(metadata.getTableHandle(session, temporaryRenameTableNew)); + } + } + finally { + dropTable(temporaryRenameTableOld); + dropTable(temporaryRenameTableNew); + } + } + + @Test + public void testTableCreation() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryCreateTable = temporaryTable("create"); + try { + doCreateTable(temporaryCreateTable, storageFormat); + } + finally { + dropTable(temporaryCreateTable); + } + } + } + + @Test + public void testTableCreationRollback() + throws Exception + { + SchemaTableName temporaryCreateRollbackTable = temporaryTable("create_rollback"); + try { + Path stagingPathRoot; + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + // begin creating the table + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(temporaryCreateRollbackTable, CREATE_TABLE_COLUMNS, createTableProperties(RCBINARY)); + + ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + + // write the data + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle); + sink.appendPage(CREATE_TABLE_DATA.toPage()); + getFutureValue(sink.finish()); + + // verify we have data files + stagingPathRoot = getStagingPathRoot(outputHandle); + HdfsContext context = new HdfsContext(session, temporaryCreateRollbackTable.getSchemaName(), temporaryCreateRollbackTable.getTableName()); + assertFalse(listAllDataFiles(context, stagingPathRoot).isEmpty()); + + // 
rollback the table + transaction.rollback(); + } + + // verify all files have been deleted + HdfsContext context = new HdfsContext(newSession(), temporaryCreateRollbackTable.getSchemaName(), temporaryCreateRollbackTable.getTableName()); + assertTrue(listAllDataFiles(context, stagingPathRoot).isEmpty()); + + // verify table is not in the metastore + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + assertNull(metadata.getTableHandle(session, temporaryCreateRollbackTable)); + } + } + finally { + dropTable(temporaryCreateRollbackTable); + } + } + + @Test + public void testTableCreationIgnoreExisting() + { + List columns = ImmutableList.of(new Column("dummy", HiveType.valueOf("uniontype"), Optional.empty())); + SchemaTableName schemaTableName = temporaryTable("create"); + ConnectorSession session = newSession(); + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + PrincipalPrivileges privileges = testingPrincipalPrivilege(session); + Path targetPath; + try { + try (Transaction transaction = newTransaction()) { + LocationService locationService = getLocationService(); + LocationHandle locationHandle = locationService.forNewTable(transaction.getMetastore(schemaName), session, schemaName, tableName, Optional.empty(), Optional.empty(), HiveWriteUtils.OpertionType.CREATE_TABLE); + targetPath = locationService.getQueryWriteInfo(locationHandle).getTargetPath(); + Table table = createSimpleTable(schemaTableName, columns, session, targetPath, "q1"); + transaction.getMetastore(schemaName) + .createTable(session, table, privileges, Optional.empty(), false, EMPTY_TABLE_STATISTICS); + Optional
<Table>
tableHandle = transaction.getMetastore(schemaName).getTable(new HiveIdentity(session), schemaName, tableName); + assertTrue(tableHandle.isPresent()); + transaction.commit(); + } + + // try creating it again from another transaction with ignoreExisting=false + try (Transaction transaction = newTransaction()) { + Table table = createSimpleTable(schemaTableName, columns, session, targetPath.suffix("_2"), "q2"); + transaction.getMetastore(schemaName) + .createTable(session, table, privileges, Optional.empty(), false, EMPTY_TABLE_STATISTICS); + transaction.commit(); + fail("Expected exception"); + } + catch (PrestoException e) { + assertInstanceOf(e, TableAlreadyExistsException.class); + } + + // try creating it again from another transaction with ignoreExisting=true + try (Transaction transaction = newTransaction()) { + Table table = createSimpleTable(schemaTableName, columns, session, targetPath.suffix("_3"), "q3"); + transaction.getMetastore(schemaName) + .createTable(session, table, privileges, Optional.empty(), true, EMPTY_TABLE_STATISTICS); + transaction.commit(); + } + + // at this point the table should exist, now try creating the table again with a different table definition + columns = ImmutableList.of(new Column("new_column", HiveType.valueOf("string"), Optional.empty())); + try (Transaction transaction = newTransaction()) { + Table table = createSimpleTable(schemaTableName, columns, session, targetPath.suffix("_4"), "q4"); + transaction.getMetastore(schemaName) + .createTable(session, table, privileges, Optional.empty(), true, EMPTY_TABLE_STATISTICS); + transaction.commit(); + fail("Expected exception"); + } + catch (PrestoException e) { + assertEquals(e.getErrorCode(), TRANSACTION_CONFLICT.toErrorCode()); + assertEquals(e.getMessage(), format("Table already exists with a different schema: '%s'", schemaTableName.getTableName())); + } + } + finally { + dropTable(schemaTableName); + } + } + + private static Table createSimpleTable(SchemaTableName schemaTableName, List columns, ConnectorSession session, Path targetPath, String queryId) + { + String tableOwner = session.getUser(); + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + return Table.builder() + .setDatabaseName(schemaName) + .setTableName(tableName) + .setOwner(tableOwner) + .setTableType(TableType.MANAGED_TABLE.name()) + .setParameters(ImmutableMap.of( + PRESTO_VERSION_NAME, TEST_SERVER_VERSION, + PRESTO_QUERY_ID_NAME, queryId)) + .setDataColumns(columns) + .withStorage(storage -> storage + .setLocation(targetPath.toString()) + .setStorageFormat(StorageFormat.fromHiveStorageFormat(ORC)) + .setSerdeParameters(ImmutableMap.of())) + .build(); + } + + @Test + public void testBucketSortedTables() + throws Exception + { + SchemaTableName table = temporaryTable("create_sorted"); + try { + doTestBucketSortedTables(table); + } + finally { + dropTable(table); + } + } + + private void doTestBucketSortedTables(SchemaTableName table) + throws IOException + { + int bucketCount = 3; + int expectedRowCount = 0; + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + // begin creating the table + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata( + table, + ImmutableList.builder() + .add(new ColumnMetadata("id", VARCHAR)) + .add(new ColumnMetadata("value_asc", VARCHAR)) + .add(new ColumnMetadata("value_desc", BIGINT)) + .add(new ColumnMetadata("ds", VARCHAR)) + 
.build(), + ImmutableMap.builder() + .put(STORAGE_FORMAT_PROPERTY, RCBINARY) + .put(PARTITIONED_BY_PROPERTY, ImmutableList.of("ds")) + .put(BUCKETED_BY_PROPERTY, ImmutableList.of("id")) + .put(BUCKET_COUNT_PROPERTY, bucketCount) + .put(SORTED_BY_PROPERTY, ImmutableList.builder() + .add(new SortingColumn("value_asc", SortingColumn.Order.ASCENDING)) + .add(new SortingColumn("value_desc", SortingColumn.Order.DESCENDING)) + .build()) + .build()); + + ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + + // write the data + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle); + List types = tableMetadata.getColumns().stream() + .map(ColumnMetadata::getType) + .collect(toList()); + ThreadLocalRandom random = ThreadLocalRandom.current(); + for (int i = 0; i < 50; i++) { + MaterializedResult.Builder builder = MaterializedResult.resultBuilder(session, types); + for (int j = 0; j < 1000; j++) { + builder.row( + sha256().hashLong(random.nextLong()).toString(), + "test" + random.nextInt(100), + random.nextLong(100_000), + "2018-04-01"); + expectedRowCount++; + } + sink.appendPage(builder.build().toPage()); + } + + // verify we have enough temporary files per bucket to require multiple passes + Path stagingPathRoot = getStagingPathRoot(outputHandle); + HdfsContext context = new HdfsContext(session, table.getSchemaName(), table.getTableName()); + assertThat(listAllDataFiles(context, stagingPathRoot)) + .filteredOn(file -> file.contains(".tmp-sort.")) + .size().isGreaterThan(bucketCount * getHiveConfig().getMaxOpenSortFiles() * 2); + + // finish the write + Collection fragments = getFutureValue(sink.finish()); + + // verify there are no temporary files + for (String file : listAllDataFiles(context, stagingPathRoot)) { + assertThat(file).doesNotContain(".tmp-sort."); + } + + // finish creating table + metadata.finishCreateTable(session, outputHandle, fragments, ImmutableList.of()); + + transaction.commit(); + } + + // verify that bucket files are sorted + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, table); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + + List splits = getAllSplits(tableHandle, transaction, session); + assertThat(splits).hasSize(bucketCount); + + int actualRowCount = 0; + for (ConnectorSplit split : splits) { + try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) { + String lastValueAsc = null; + long lastValueDesc = -1; + + while (!pageSource.isFinished()) { + Page page = pageSource.getNextPage(); + if (page == null) { + continue; + } + for (int i = 0; i < page.getPositionCount(); i++) { + Block blockAsc = page.getBlock(1); + Block blockDesc = page.getBlock(2); + assertFalse(blockAsc.isNull(i)); + assertFalse(blockDesc.isNull(i)); + + String valueAsc = VARCHAR.getSlice(blockAsc, i).toStringUtf8(); + if (lastValueAsc != null) { + assertGreaterThanOrEqual(valueAsc, lastValueAsc); + if (valueAsc.equals(lastValueAsc)) { + long valueDesc = BIGINT.getLong(blockDesc, i); + if (lastValueDesc != -1) { + assertLessThanOrEqual(valueDesc, lastValueDesc); + } + lastValueDesc = valueDesc; + } + else { + lastValueDesc = -1; 
+ } + } + lastValueAsc = valueAsc; + actualRowCount++; + } + } + } + } + assertThat(actualRowCount).isEqualTo(expectedRowCount); + } + } + + @Test + public void testInsert() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryInsertTable = temporaryTable("insert"); + try { + doInsert(storageFormat, temporaryInsertTable); + } + finally { + dropTable(temporaryInsertTable); + } + } + } + + @Test(enabled = false) + public void testInsertOverwriteUnpartitioned() + throws Exception + { + SchemaTableName table = temporaryTable("insert_overwrite"); + try { + doInsertOverwriteUnpartitioned(table); + } + finally { + dropTable(table); + } + } + + @Test + public void testInsertIntoNewPartition() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryInsertIntoNewPartitionTable = temporaryTable("insert_new_partitioned"); + try { + doInsertIntoNewPartition(storageFormat, temporaryInsertIntoNewPartitionTable); + } + finally { + dropTable(temporaryInsertIntoNewPartitionTable); + } + } + } + + @Test + public void testInsertIntoExistingPartition() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryInsertIntoExistingPartitionTable = temporaryTable("insert_existing_partitioned"); + try { + doInsertIntoExistingPartition(storageFormat, temporaryInsertIntoExistingPartitionTable); + } + finally { + dropTable(temporaryInsertIntoExistingPartitionTable); + } + } + } + + @Test + public void testInsertIntoExistingPartitionEmptyStatistics() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryInsertIntoExistingPartitionTable = temporaryTable("insert_existing_partitioned_empty_statistics"); + try { + doInsertIntoExistingPartitionEmptyStatistics(storageFormat, temporaryInsertIntoExistingPartitionTable); + } + finally { + dropTable(temporaryInsertIntoExistingPartitionTable); + } + } + } + + @Test + public void testInsertUnsupportedWriteType() + throws Exception + { + SchemaTableName temporaryInsertUnsupportedWriteType = temporaryTable("insert_unsupported_type"); + try { + doInsertUnsupportedWriteType(ORC, temporaryInsertUnsupportedWriteType); + } + finally { + dropTable(temporaryInsertUnsupportedWriteType); + } + } + + @Test + public void testMetadataDelete() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryMetadataDeleteTable = temporaryTable("metadata_delete"); + try { + doTestMetadataDelete(storageFormat, temporaryMetadataDeleteTable); + } + finally { + dropTable(temporaryMetadataDeleteTable); + } + } + } + + @Test + public void testEmptyTableCreation() + throws Exception + { + for (HiveStorageFormat storageFormat : createTableFormats) { + SchemaTableName temporaryCreateEmptyTable = temporaryTable("create_empty"); + try { + doCreateEmptyTable(temporaryCreateEmptyTable, storageFormat, CREATE_TABLE_COLUMNS); + } + finally { + dropTable(temporaryCreateEmptyTable); + } + } + } + + @Test + public void testTransactionalTableCreation() + { + for (HiveStorageFormat storageFormat : createTableFormats) { + boolean toDropTableAfterCreated = false; + SchemaTableName tableName = new SchemaTableName(database, "test_transactional_tbl_" + storageFormat.name()); + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + List columns = 
ImmutableList.of(new ColumnMetadata("dummy", INTEGER)); + Map tableProperty = ImmutableMap.builder() + .put(STORAGE_FORMAT_PROPERTY, storageFormat) + .put(PARTITIONED_BY_PROPERTY, ImmutableList.copyOf(ImmutableList.of())) + .put(BUCKETED_BY_PROPERTY, ImmutableList.of()) + .put(BUCKET_COUNT_PROPERTY, 0) + .put(SORTED_BY_PROPERTY, ImmutableList.of()) + .put(TRANSACTIONAL, true) + .build(); + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, tableProperty); + metadata.createTable(session, tableMetadata, false); + transaction.commit(); + toDropTableAfterCreated = true; + if (!HiveStorageFormat.ORC.equals(storageFormat)) { + fail("create table other than ORC storage format should go to Unsupported error."); + } + } + catch (PrestoException e) { + assertEquals(e.getErrorCode(), NOT_SUPPORTED.toErrorCode()); + } + finally { + // only ORC will create table successfully + if (toDropTableAfterCreated) { + dropTable(tableName); + } + } + } + } + + @Test + public void testViewCreation() + { + SchemaTableName temporaryCreateView = temporaryTable("create_view"); + try { + verifyViewCreation(temporaryCreateView); + } + finally { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.dropView(newSession(), temporaryCreateView); + transaction.commit(); + } + catch (RuntimeException e) { + // this usually occurs because the view was not created + } + } + } + + @Test + public void testCreateManagedTableWithLocation() throws IOException + { + String location = createTempDirectory(getClass().getName()).toString(); + Map tableProperty; + Transaction transaction = newTransaction(); + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + List columns = ImmutableList.of(new ColumnMetadata("dummy", createUnboundedVarcharType())); + + // test creating managed table with location property + SchemaTableName tableName = new SchemaTableName("presto", "create_managed_table"); + try { + tableProperty = ImmutableMap.builder() + .put(STORAGE_FORMAT_PROPERTY, TEXTFILE) + .put(PARTITIONED_BY_PROPERTY, ImmutableList.copyOf(ImmutableList.of())) + .put(BUCKETED_BY_PROPERTY, ImmutableList.of()) + .put(BUCKET_COUNT_PROPERTY, 0) + .put(SORTED_BY_PROPERTY, ImmutableList.of()) + .put(LOCATION_PROPERTY, location) + .put(IS_EXTERNAL_TABLE, false) + .build(); + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, tableProperty); + metadata.createTable(session, tableMetadata, true); + } + finally { + dropTable(tableName); + } + + // test creating managed table with location and subquery + tableName = new SchemaTableName("presto", "create_subquery_table"); + try { + tableProperty = ImmutableMap.builder() + .put(STORAGE_FORMAT_PROPERTY, TEXTFILE) + .put(PARTITIONED_BY_PROPERTY, ImmutableList.copyOf(ImmutableList.of())) + .put(BUCKETED_BY_PROPERTY, ImmutableList.of()) + .put(BUCKET_COUNT_PROPERTY, 0) + .put(SORTED_BY_PROPERTY, ImmutableList.of()) + .put(LOCATION_PROPERTY, location) + .put(IS_EXTERNAL_TABLE, false) + .build(); + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, tableProperty); + metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + } + finally { + dropTable(tableName); + } + } + + @Test + public void testCreateExternalTableWithLocation() throws IOException + { + String location = createTempDirectory(getClass().getName()).toString(); + Map tableProperty; + Transaction transaction = 
newTransaction(); + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + List columns = ImmutableList.of(new ColumnMetadata("dummy", createUnboundedVarcharType())); + + // test creating external table with location property + SchemaTableName tableName = new SchemaTableName("presto", "create_external_table"); + try { + tableProperty = ImmutableMap.builder() + .put(STORAGE_FORMAT_PROPERTY, TEXTFILE) + .put(PARTITIONED_BY_PROPERTY, ImmutableList.copyOf(ImmutableList.of())) + .put(BUCKETED_BY_PROPERTY, ImmutableList.of()) + .put(BUCKET_COUNT_PROPERTY, 0) + .put(SORTED_BY_PROPERTY, ImmutableList.of()) + .put(LOCATION_PROPERTY, location) + .put(IS_EXTERNAL_TABLE, true) + .build(); + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, tableProperty); + metadata.createTable(session, tableMetadata, true); + } + finally { + dropTable(tableName); + } + + //test creating external table with location and subquery + tableName = new SchemaTableName("presto", "create_illegal_table"); + try { + tableProperty = ImmutableMap.builder() + .put(STORAGE_FORMAT_PROPERTY, TEXTFILE) + .put(PARTITIONED_BY_PROPERTY, ImmutableList.copyOf(ImmutableList.of())) + .put(BUCKETED_BY_PROPERTY, ImmutableList.of()) + .put(BUCKET_COUNT_PROPERTY, 0) + .put(SORTED_BY_PROPERTY, ImmutableList.of()) + .put(LOCATION_PROPERTY, location) + .put(IS_EXTERNAL_TABLE, true) + .build(); + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, tableProperty); + metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + fail("create external table with subquery should fail"); + } + catch (PrestoException e) { + assertEquals(e.getErrorCode(), NOT_SUPPORTED.toErrorCode()); + } + } + + @Test + public void testCreateTableUnsupportedType() + { + for (HiveStorageFormat storageFormat : createTableFormats) { + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + List columns = ImmutableList.of(new ColumnMetadata("dummy", HYPER_LOG_LOG)); + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(invalidTable, columns, createTableProperties(storageFormat)); + metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + fail("create table with unsupported type should fail for storage format " + storageFormat); + } + catch (PrestoException e) { + assertEquals(e.getErrorCode(), NOT_SUPPORTED.toErrorCode()); + } + } + } + + @Test + public void testUpdateBasicTableStatistics() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_basic_table_statistics"); + try { + doCreateEmptyTable(tableName, ORC, STATISTICS_TABLE_COLUMNS); + testUpdateTableStatistics(tableName, EMPTY_TABLE_STATISTICS, BASIC_STATISTICS_1, BASIC_STATISTICS_2); + } + finally { + dropTable(tableName); + } + } + + @Test + public void testUpdateTableColumnStatistics() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_table_column_statistics"); + try { + doCreateEmptyTable(tableName, ORC, STATISTICS_TABLE_COLUMNS); + testUpdateTableStatistics(tableName, EMPTY_TABLE_STATISTICS, STATISTICS_1_1, STATISTICS_1_2, STATISTICS_2); + } + finally { + dropTable(tableName); + } + } + + @Test + public void testUpdateTableColumnStatisticsEmptyOptionalFields() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_table_column_statistics_empty_optional_fields"); + try { + doCreateEmptyTable(tableName, 
ORC, STATISTICS_TABLE_COLUMNS); + testUpdateTableStatistics(tableName, EMPTY_TABLE_STATISTICS, STATISTICS_EMPTY_OPTIONAL_FIELDS); + } + finally { + dropTable(tableName); + } + } + + protected void testUpdateTableStatistics(SchemaTableName tableName, PartitionStatistics initialStatistics, PartitionStatistics... statistics) + { + HiveMetastoreClosure metastoreClient = new HiveMetastoreClosure(getMetastoreClient()); + HiveIdentity identity = new HiveIdentity(SESSION); + assertThat(metastoreClient.getTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName())) + .isEqualTo(initialStatistics); + + AtomicReference expectedStatistics = new AtomicReference<>(initialStatistics); + for (PartitionStatistics partitionStatistics : statistics) { + metastoreClient.updateTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), actualStatistics -> { + assertThat(actualStatistics).isEqualTo(expectedStatistics.get()); + return partitionStatistics; + }); + assertThat(metastoreClient.getTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName())) + .isEqualTo(partitionStatistics); + expectedStatistics.set(partitionStatistics); + } + + assertThat(metastoreClient.getTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName())) + .isEqualTo(expectedStatistics.get()); + + metastoreClient.updateTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), actualStatistics -> { + assertThat(actualStatistics).isEqualTo(expectedStatistics.get()); + return initialStatistics; + }); + + assertThat(metastoreClient.getTableStatistics(identity, tableName.getSchemaName(), tableName.getTableName())) + .isEqualTo(initialStatistics); + } + + @Test + public void testUpdateBasicPartitionStatistics() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_basic_partition_statistics"); + try { + createDummyPartitionedTable(tableName, STATISTICS_PARTITIONED_TABLE_COLUMNS); + testUpdatePartitionStatistics( + tableName, + EMPTY_TABLE_STATISTICS, + ImmutableList.of(BASIC_STATISTICS_1, BASIC_STATISTICS_2), + ImmutableList.of(BASIC_STATISTICS_2, BASIC_STATISTICS_1)); + } + finally { + dropTable(tableName); + } + } + + @Test + public void testUpdatePartitionColumnStatistics() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_partition_column_statistics"); + try { + createDummyPartitionedTable(tableName, STATISTICS_PARTITIONED_TABLE_COLUMNS); + testUpdatePartitionStatistics( + tableName, + EMPTY_TABLE_STATISTICS, + ImmutableList.of(STATISTICS_1_1, STATISTICS_1_2, STATISTICS_2), + ImmutableList.of(STATISTICS_1_2, STATISTICS_1_1, STATISTICS_2)); + } + finally { + dropTable(tableName); + } + } + + @Test + public void testUpdatePartitionColumnStatisticsEmptyOptionalFields() + throws Exception + { + SchemaTableName tableName = temporaryTable("update_partition_column_statistics"); + try { + createDummyPartitionedTable(tableName, STATISTICS_PARTITIONED_TABLE_COLUMNS); + testUpdatePartitionStatistics( + tableName, + EMPTY_TABLE_STATISTICS, + ImmutableList.of(STATISTICS_EMPTY_OPTIONAL_FIELDS), + ImmutableList.of(STATISTICS_EMPTY_OPTIONAL_FIELDS)); + } + finally { + dropTable(tableName); + } + } + + /** + * During table scan, the illegal storage format for some specific table should not fail the whole table scan + */ + @Test + public void testIllegalStorageFormatDuringTableScan() + { + SchemaTableName schemaTableName = temporaryTable("test_illegal_storage_format"); + try (Transaction transaction = 
newTransaction()) { + ConnectorSession session = newSession(); + List columns = ImmutableList.of(new Column("pk", HIVE_STRING, Optional.empty())); + String tableOwner = session.getUser(); + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + LocationHandle locationHandle = locationService.forNewTable(transaction.getMetastore(schemaName), session, schemaName, tableName, Optional.empty(), Optional.empty(), HiveWriteUtils.OpertionType.CREATE_TABLE); + Path targetPath = locationService.getQueryWriteInfo(locationHandle).getTargetPath(); + //create table whose storage format is null + Table.Builder tableBuilder = Table.builder() + .setDatabaseName(schemaName) + .setTableName(tableName) + .setOwner(tableOwner) + .setTableType(TableType.MANAGED_TABLE.name()) + .setParameters(ImmutableMap.of( + PRESTO_VERSION_NAME, TEST_SERVER_VERSION, + PRESTO_QUERY_ID_NAME, session.getQueryId())) + .setDataColumns(columns) + .withStorage(storage -> storage + .setLocation(targetPath.toString()) + .setStorageFormat(StorageFormat.createNullable(null, null, null)) + .setSerdeParameters(ImmutableMap.of())); + PrincipalPrivileges principalPrivileges = testingPrincipalPrivilege(tableOwner, session.getUser()); + transaction.getMetastore(schemaName).createTable(session, tableBuilder.build(), principalPrivileges, Optional.empty(), true, EMPTY_TABLE_STATISTICS); + transaction.commit(); + } + + // We retrieve the table whose storageFormat has null serde/inputFormat/outputFormat + // to make sure it can still be retrieved instead of throwing exception. + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + Map> allColumns = metadata.listTableColumns(newSession(), new SchemaTablePrefix(schemaTableName.getSchemaName())); + assertTrue(allColumns.containsKey(schemaTableName)); + } + finally { + try (Transaction transaction1 = newTransaction()) { + ConnectorMetadata metadata = transaction1.getMetadata(); + if (metadata instanceof HiveMetadata) { + ConnectorSession session = newSession(); + SemiTransactionalHiveMetastore semiTransactionalHiveMetastore = transaction1.getMetastore(schemaTableName.getSchemaName()); + semiTransactionalHiveMetastore.dropTable(session, schemaTableName.getSchemaName(), schemaTableName.getTableName()); + transaction1.commit(); + } + else { + dropTable(schemaTableName); + } + } + } + } + + private void createDummyTable(SchemaTableName tableName) + { + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + List columns = ImmutableList.of(new ColumnMetadata("dummy", createUnboundedVarcharType())); + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, createTableProperties(TEXTFILE)); + ConnectorOutputTableHandle handle = metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + metadata.finishCreateTable(session, handle, ImmutableList.of(), ImmutableList.of()); + + transaction.commit(); + } + } + + protected void createDummyPartitionedTable(SchemaTableName tableName, List columns) + throws Exception + { + doCreateEmptyTable(tableName, ORC, columns); + + HiveMetastore metastoreClient = getMetastoreClient(); + HiveIdentity identity = new HiveIdentity(SESSION); + Table table = metastoreClient.getTable(identity, tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + + List firstPartitionValues = 
ImmutableList.of("2016-01-01"); + List secondPartitionValues = ImmutableList.of("2016-01-02"); + + String firstPartitionName = makePartName(ImmutableList.of("ds"), firstPartitionValues); + String secondPartitionName = makePartName(ImmutableList.of("ds"), secondPartitionValues); + + List partitions = ImmutableList.of(firstPartitionName, secondPartitionName) + .stream() + .map(partitionName -> new PartitionWithStatistics(createDummyPartition(table, partitionName), partitionName, PartitionStatistics.empty())) + .collect(toImmutableList()); + metastoreClient.addPartitions(identity, tableName.getSchemaName(), tableName.getTableName(), partitions); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), firstPartitionName, currentStatistics -> EMPTY_TABLE_STATISTICS); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), secondPartitionName, currentStatistics -> EMPTY_TABLE_STATISTICS); + } + + protected void testUpdatePartitionStatistics( + SchemaTableName tableName, + PartitionStatistics initialStatistics, + List firstPartitionStatistics, + List secondPartitionStatistics) + { + verify(firstPartitionStatistics.size() == secondPartitionStatistics.size()); + + String firstPartitionName = "ds=2016-01-01"; + String secondPartitionName = "ds=2016-01-02"; + + HiveMetastoreClosure metastoreClient = new HiveMetastoreClosure(getMetastoreClient()); + HiveIdentity identity = new HiveIdentity(SESSION); + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(firstPartitionName, secondPartitionName))) + .isEqualTo(ImmutableMap.of(firstPartitionName, initialStatistics, secondPartitionName, initialStatistics)); + + AtomicReference expectedStatisticsPartition1 = new AtomicReference<>(initialStatistics); + AtomicReference expectedStatisticsPartition2 = new AtomicReference<>(initialStatistics); + + for (int i = 0; i < firstPartitionStatistics.size(); i++) { + PartitionStatistics statisticsPartition1 = firstPartitionStatistics.get(i); + PartitionStatistics statisticsPartition2 = secondPartitionStatistics.get(i); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), firstPartitionName, actualStatistics -> { + assertThat(actualStatistics).isEqualTo(expectedStatisticsPartition1.get()); + return statisticsPartition1; + }); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), secondPartitionName, actualStatistics -> { + assertThat(actualStatistics).isEqualTo(expectedStatisticsPartition2.get()); + return statisticsPartition2; + }); + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(firstPartitionName, secondPartitionName))) + .isEqualTo(ImmutableMap.of(firstPartitionName, statisticsPartition1, secondPartitionName, statisticsPartition2)); + expectedStatisticsPartition1.set(statisticsPartition1); + expectedStatisticsPartition2.set(statisticsPartition2); + } + + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(firstPartitionName, secondPartitionName))) + .isEqualTo(ImmutableMap.of(firstPartitionName, expectedStatisticsPartition1.get(), secondPartitionName, expectedStatisticsPartition2.get())); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), 
tableName.getTableName(), firstPartitionName, currentStatistics -> { + assertThat(currentStatistics).isEqualTo(expectedStatisticsPartition1.get()); + return initialStatistics; + }); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), secondPartitionName, currentStatistics -> { + assertThat(currentStatistics).isEqualTo(expectedStatisticsPartition2.get()); + return initialStatistics; + }); + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(firstPartitionName, secondPartitionName))) + .isEqualTo(ImmutableMap.of(firstPartitionName, initialStatistics, secondPartitionName, initialStatistics)); + } + + @Test + public void testStorePartitionWithStatistics() + throws Exception + { + testStorePartitionWithStatistics(STATISTICS_PARTITIONED_TABLE_COLUMNS, STATISTICS_1, STATISTICS_2, STATISTICS_1_1, EMPTY_TABLE_STATISTICS); + } + + protected void testStorePartitionWithStatistics( + List columns, + PartitionStatistics statsForAllColumns1, + PartitionStatistics statsForAllColumns2, + PartitionStatistics statsForSubsetOfColumns, + PartitionStatistics emptyStatistics) + throws Exception + { + SchemaTableName tableName = temporaryTable("store_partition_with_statistics"); + try { + doCreateEmptyTable(tableName, ORC, columns); + + HiveMetastoreClosure metastoreClient = new HiveMetastoreClosure(getMetastoreClient()); + HiveIdentity identity = new HiveIdentity(SESSION); + Table table = metastoreClient.getTable(identity, tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(tableName)); + + List partitionValues = ImmutableList.of("2016-01-01"); + String partitionName = makePartName(ImmutableList.of("ds"), partitionValues); + + Partition partition = createDummyPartition(table, partitionName); + + // create partition with stats for all columns + metastoreClient.addPartitions(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableList.of(new PartitionWithStatistics(partition, partitionName, statsForAllColumns1))); + assertEquals( + metastoreClient.getPartition(identity, tableName.getSchemaName(), tableName.getTableName(), partitionValues).get().getStorage().getStorageFormat(), + StorageFormat.fromHiveStorageFormat(ORC)); + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(partitionName))) + .isEqualTo(ImmutableMap.of(partitionName, statsForAllColumns1)); + + // alter the partition into one with other stats + Partition modifiedPartition = Partition.builder(partition) + .withStorage(storage -> storage + .setStorageFormat(StorageFormat.fromHiveStorageFormat(RCBINARY)) + .setLocation(partitionTargetPath(tableName, partitionName))) + .build(); + metastoreClient.alterPartition(identity, tableName.getSchemaName(), tableName.getTableName(), new PartitionWithStatistics(modifiedPartition, partitionName, statsForAllColumns2)); + assertEquals( + metastoreClient.getPartition(identity, tableName.getSchemaName(), tableName.getTableName(), partitionValues).get().getStorage().getStorageFormat(), + StorageFormat.fromHiveStorageFormat(RCBINARY)); + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(partitionName))) + .isEqualTo(ImmutableMap.of(partitionName, statsForAllColumns2)); + + // alter the partition into one with stats for only subset of columns + modifiedPartition = 
Partition.builder(partition) + .withStorage(storage -> storage + .setStorageFormat(StorageFormat.fromHiveStorageFormat(TEXTFILE)) + .setLocation(partitionTargetPath(tableName, partitionName))) + .build(); + metastoreClient.alterPartition(identity, tableName.getSchemaName(), tableName.getTableName(), new PartitionWithStatistics(modifiedPartition, partitionName, statsForSubsetOfColumns)); + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(partitionName))) + .isEqualTo(ImmutableMap.of(partitionName, statsForSubsetOfColumns)); + + // alter the partition into one without stats + modifiedPartition = Partition.builder(partition) + .withStorage(storage -> storage + .setStorageFormat(StorageFormat.fromHiveStorageFormat(TEXTFILE)) + .setLocation(partitionTargetPath(tableName, partitionName))) + .build(); + metastoreClient.alterPartition(identity, tableName.getSchemaName(), tableName.getTableName(), new PartitionWithStatistics(modifiedPartition, partitionName, emptyStatistics)); + assertThat(metastoreClient.getPartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), ImmutableSet.of(partitionName))) + .isEqualTo(ImmutableMap.of(partitionName, emptyStatistics)); + } + finally { + dropTable(tableName); + } + } + + protected Partition createDummyPartition(Table table, String partitionName) + { + return Partition.builder() + .setDatabaseName(table.getDatabaseName()) + .setTableName(table.getTableName()) + .setColumns(table.getDataColumns()) + .setValues(toPartitionValues(partitionName)) + .withStorage(storage -> storage + .setStorageFormat(StorageFormat.fromHiveStorageFormat(HiveStorageFormat.ORC)) + .setLocation(partitionTargetPath(new SchemaTableName(table.getDatabaseName(), table.getTableName()), partitionName))) + .setParameters(ImmutableMap.of( + PRESTO_VERSION_NAME, "testversion", + PRESTO_QUERY_ID_NAME, "20180101_123456_00001_x1y2z")) + .build(); + } + + protected String partitionTargetPath(SchemaTableName schemaTableName, String partitionName) + { + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + SemiTransactionalHiveMetastore metastore = transaction.getMetastore(schemaTableName.getSchemaName()); + LocationService locationService = getLocationService(); + Table table = metastore.getTable(new HiveIdentity(session), schemaTableName.getSchemaName(), schemaTableName.getTableName()).get(); + LocationHandle handle = locationService.forExistingTable(metastore, session, table, Optional.empty(), HiveWriteUtils.OpertionType.INSERT); + return locationService.getPartitionWriteInfo(handle, Optional.empty(), partitionName).getTargetPath().toString(); + } + } + + /** + * This test creates 2 identical partitions and verifies that the statistics projected based on + * a single partition sample are equal to the statistics computed in a fair way + */ + @Test + public void testPartitionStatisticsSampling() + throws Exception + { + testPartitionStatisticsSampling(STATISTICS_PARTITIONED_TABLE_COLUMNS, STATISTICS_1); + } + + protected void testPartitionStatisticsSampling(List columns, PartitionStatistics statistics) + throws Exception + { + SchemaTableName tableName = temporaryTable("test_partition_statistics_sampling"); + + try { + createDummyPartitionedTable(tableName, columns); + HiveMetastore metastoreClient = getMetastoreClient(); + HiveIdentity identity = new HiveIdentity(SESSION); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), 
tableName.getTableName(), "ds=2016-01-01", actualStatistics -> statistics); + metastoreClient.updatePartitionStatistics(identity, tableName.getSchemaName(), tableName.getTableName(), "ds=2016-01-02", actualStatistics -> statistics); + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + ConnectorTableHandle tableHandle = metadata.getTableHandle(session, tableName); + TableStatistics unsampledStatistics = metadata.getTableStatistics(sampleSize(2), tableHandle, Constraint.alwaysTrue(), true); + TableStatistics sampledStatistics = metadata.getTableStatistics(sampleSize(1), tableHandle, Constraint.alwaysTrue(), true); + assertEquals(sampledStatistics, unsampledStatistics); + } + } + finally { + dropTable(tableName); + } + } + + private ConnectorSession sampleSize(int sampleSize) + { + HiveSessionProperties properties = new HiveSessionProperties( + getHiveConfig().setPartitionStatisticsSampleSize(sampleSize), + new OrcFileWriterConfig(), new ParquetFileWriterConfig()); + return new TestingConnectorSession(properties.getSessionProperties()); + } + + private void verifyViewCreation(SchemaTableName temporaryCreateView) + { + // replace works for new view + doCreateView(temporaryCreateView, true); + + // replace works for existing view + doCreateView(temporaryCreateView, true); + + // create fails for existing view + try { + doCreateView(temporaryCreateView, false); + fail("create existing should fail"); + } + catch (ViewAlreadyExistsException e) { + assertEquals(e.getViewName(), temporaryCreateView); + } + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + // drop works when view exists + metadata.dropView(newSession(), temporaryCreateView); + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + assertThat(metadata.getView(newSession(), temporaryCreateView)) + .isEmpty(); + assertThat(metadata.getViews(newSession(), Optional.of(temporaryCreateView.getSchemaName()))) + .doesNotContainKey(temporaryCreateView); + assertThat(metadata.listViews(newSession(), Optional.of(temporaryCreateView.getSchemaName()))) + .doesNotContain(temporaryCreateView); + } + + // drop fails when view does not exist + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.dropView(newSession(), temporaryCreateView); + fail("drop non-existing should fail"); + } + catch (ViewNotFoundException e) { + assertEquals(e.getViewName(), temporaryCreateView); + } + + // create works for new view + doCreateView(temporaryCreateView, false); + } + + private void doCreateView(SchemaTableName viewName, boolean replace) + { + String viewData = "test data"; + ConnectorViewDefinition definition = new ConnectorViewDefinition( + viewData, + Optional.empty(), + Optional.empty(), + ImmutableList.of(new ViewColumn("test", BIGINT.getTypeSignature())), + Optional.empty(), + true); + + try (Transaction transaction = newTransaction()) { + transaction.getMetadata().createView(newSession(), viewName, definition, replace); + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + + assertThat(metadata.getView(newSession(), viewName)) + .map(ConnectorViewDefinition::getOriginalSql) + .contains(viewData); + + Map views = metadata.getViews(newSession(), 
Optional.of(viewName.getSchemaName())); + assertEquals(views.size(), 1); + assertEquals(views.get(viewName).getOriginalSql(), definition.getOriginalSql()); + + assertTrue(metadata.listViews(newSession(), Optional.of(viewName.getSchemaName())).contains(viewName)); + } + } + + protected void doCreateTable(SchemaTableName tableName, HiveStorageFormat storageFormat) + throws Exception + { + String queryId; + long creationTime; + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, CREATE_TABLE_COLUMNS, createTableProperties(storageFormat)); + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + queryId = session.getQueryId(); + + // begin creating the table + ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + metadata.finishCreateTable(session, outputHandle, ImmutableList.of(), ImmutableList.of()); + + // commit table creation + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + creationTime = metadata.getTableModificationTime(session, metadata.getTableHandle(session, tableName)); + + transaction.commit(); + } + + // We use this to add a second's gap between table creation and modification. + // The Hadoop FileStatus API seems to return times at the granularity of seconds. + Thread.sleep(1001); + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + // write the data + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + sink.appendPage(CREATE_TABLE_DATA.toPage()); + Collection fragments = getFutureValue(sink.finish()); + + // verify all new files start with the unique prefix + HdfsContext context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + for (String filePath : listAllDataFiles(context, getStagingPathRoot(insertTableHandle))) { + assertThat(new Path(filePath).getName()).startsWith(session.getQueryId()); + } + + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + + // commit table insertion + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + // load the new table + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the metadata + tableMetadata = metadata.getTableMetadata(session, getTableHandle(metadata, tableName)); + assertEquals(filterNonHiddenColumnMetadata(tableMetadata.getColumns()), CREATE_TABLE_COLUMNS); + + // verify the data + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.of(storageFormat)); + assertEqualsIgnoreOrder(result.getMaterializedRows(), CREATE_TABLE_DATA.getMaterializedRows()); + + // verify the node version and query ID in table + Table table = 
getMetastoreClient().getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()).get(); + assertEquals(table.getParameters().get(PRESTO_VERSION_NAME), TEST_SERVER_VERSION); + assertEquals(table.getParameters().get(PRESTO_QUERY_ID_NAME), queryId); + + // verify basic statistics + HiveBasicStatistics statistics = getBasicStatisticsForTable(session, transaction, tableName); + assertEquals(statistics.getRowCount().getAsLong(), CREATE_TABLE_DATA.getRowCount()); + assertEquals(statistics.getFileCount().getAsLong(), 1L); + assertGreaterThan(statistics.getInMemoryDataSizeInBytes().getAsLong(), 0L); + assertGreaterThan(statistics.getOnDiskDataSizeInBytes().getAsLong(), 0L); + + // verify the modification time + long modificationTime = metadata.getTableModificationTime(session, getTableHandle(metadata, tableName)); + assertGreaterThan(modificationTime, creationTime); + } + } + + protected void doCreateEmptyTable(SchemaTableName tableName, HiveStorageFormat storageFormat, List createTableColumns) + throws Exception + { + List partitionedBy = createTableColumns.stream() + .filter(column -> column.getName().equals("ds")) + .map(ColumnMetadata::getName) + .collect(toList()); + + String queryId; + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + queryId = session.getQueryId(); + + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, createTableColumns, createTableProperties(storageFormat, partitionedBy)); + metadata.createTable(session, tableMetadata, false); + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + // load the new table + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + // verify the metadata + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, getTableHandle(metadata, tableName)); + + List expectedColumns = createTableColumns.stream() + .map(column -> new ColumnMetadata( + column.getName(), + column.getType(), + true, + column.getComment(), + columnExtraInfo(partitionedBy.contains(column.getName())), + false, + emptyMap(), + partitionedBy.contains(column.getName()))) + .collect(toList()); + assertEquals(filterNonHiddenColumnMetadata(tableMetadata.getColumns()), expectedColumns); + + // verify table format + Table table = transaction.getMetastore(tableName.getSchemaName()).getTable(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()).get(); + assertEquals(table.getStorage().getStorageFormat().getInputFormat(), storageFormat.getInputFormat()); + + // verify the node version and query ID + assertEquals(table.getParameters().get(PRESTO_VERSION_NAME), TEST_SERVER_VERSION); + assertEquals(table.getParameters().get(PRESTO_QUERY_ID_NAME), queryId); + + // verify the table is empty + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.of(storageFormat)); + assertEquals(result.getRowCount(), 0); + + // verify basic statistics + if (partitionedBy.isEmpty()) { + HiveBasicStatistics statistics = getBasicStatisticsForTable(session, transaction, tableName); + 
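+                // a freshly created empty, unpartitioned table is expected to report zeroed
+                // basic statistics: no rows, no files, and no data in memory or on disk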
assertEquals(statistics.getRowCount().getAsLong(), 0L); + assertEquals(statistics.getFileCount().getAsLong(), 0L); + assertEquals(statistics.getInMemoryDataSizeInBytes().getAsLong(), 0L); + assertEquals(statistics.getOnDiskDataSizeInBytes().getAsLong(), 0L); + } + } + } + + private void doInsert(HiveStorageFormat storageFormat, SchemaTableName tableName) + throws Exception + { + // creating the table + doCreateEmptyTable(tableName, storageFormat, CREATE_TABLE_COLUMNS); + + MaterializedResult.Builder resultBuilder = MaterializedResult.resultBuilder(SESSION, CREATE_TABLE_DATA.getTypes()); + for (int i = 0; i < 3; i++) { + insertData(tableName, CREATE_TABLE_DATA); + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + // load the new table + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the metadata + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, getTableHandle(metadata, tableName)); + assertEquals(filterNonHiddenColumnMetadata(tableMetadata.getColumns()), CREATE_TABLE_COLUMNS); + + // verify the data + resultBuilder.rows(CREATE_TABLE_DATA.getMaterializedRows()); + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), resultBuilder.build().getMaterializedRows()); + + // statistics + HiveBasicStatistics tableStatistics = getBasicStatisticsForTable(session, transaction, tableName); + assertEquals(tableStatistics.getRowCount().getAsLong(), CREATE_TABLE_DATA.getRowCount() * (i + 1)); + assertEquals(tableStatistics.getFileCount().getAsLong(), i + 1L); + assertGreaterThan(tableStatistics.getInMemoryDataSizeInBytes().getAsLong(), 0L); + assertGreaterThan(tableStatistics.getOnDiskDataSizeInBytes().getAsLong(), 0L); + } + } + + // test rollback + Set existingFiles; + try (Transaction transaction = newTransaction()) { + existingFiles = listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()); + assertFalse(existingFiles.isEmpty()); + } + + Path stagingPathRoot; + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + // "stage" insert data + metadata.beginQuery(session); + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + sink.appendPage(CREATE_TABLE_DATA.toPage()); + sink.appendPage(CREATE_TABLE_DATA.toPage()); + Collection fragments = getFutureValue(sink.finish()); + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + + // statistics, visible from within transaction + HiveBasicStatistics tableStatistics = getBasicStatisticsForTable(session, transaction, tableName); + assertEquals(tableStatistics.getRowCount().getAsLong(), CREATE_TABLE_DATA.getRowCount() * 5L); + + try (Transaction otherTransaction = newTransaction()) { + // statistics, not visible from outside transaction + HiveBasicStatistics otherTableStatistics = 
getBasicStatisticsForTable(session, otherTransaction, tableName); + assertEquals(otherTableStatistics.getRowCount().getAsLong(), CREATE_TABLE_DATA.getRowCount() * 3L); + } + + // verify all temp files start with the unique prefix + stagingPathRoot = getStagingPathRoot(insertTableHandle); + HdfsContext context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + Set tempFiles = listAllDataFiles(context, stagingPathRoot); + assertTrue(!tempFiles.isEmpty()); + for (String filePath : tempFiles) { + assertThat(new Path(filePath).getName()).startsWith(session.getQueryId()); + } + + // rollback insert + transaction.rollback(); + } + + // verify temp directory is empty + HdfsContext context = new HdfsContext(newSession(), tableName.getSchemaName(), tableName.getTableName()); + assertTrue(listAllDataFiles(context, stagingPathRoot).isEmpty()); + + // verify the data is unchanged + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), resultBuilder.build().getMaterializedRows()); + + // verify we did not modify the table directory + assertEquals(listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()), existingFiles); + } + + // verify statistics unchanged + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + HiveBasicStatistics statistics = getBasicStatisticsForTable(session, transaction, tableName); + assertEquals(statistics.getRowCount().getAsLong(), CREATE_TABLE_DATA.getRowCount() * 3L); + assertEquals(statistics.getFileCount().getAsLong(), 3L); + } + } + + private void doInsertOverwriteUnpartitioned(SchemaTableName tableName) + throws Exception + { + // create table with data + doCreateEmptyTable(tableName, ORC, CREATE_TABLE_COLUMNS); + insertData(tableName, CREATE_TABLE_DATA); + + // overwrite table with new data + MaterializedResult.Builder overwriteDataBuilder = MaterializedResult.resultBuilder(SESSION, CREATE_TABLE_DATA.getTypes()); + MaterializedResult overwriteData = null; + + Map overwriteProperties = ImmutableMap.of("insert_existing_partitions_behavior", "OVERWRITE"); + + for (int i = 0; i < 3; i++) { + overwriteDataBuilder.rows(reverse(CREATE_TABLE_DATA.getMaterializedRows())); + overwriteData = overwriteDataBuilder.build(); + + if (i == 0) { + // change the insert behavior to insert overwrite via session property + insertData(tableName, overwriteData, overwriteProperties); + } + else { + // do insert overwrite via "INSERT OVERWRITE" syntax + insertOverwriteData(tableName, overwriteData, ImmutableMap.of()); + } + + // verify overwrite + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + // load the new table + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the metadata + ConnectorTableMetadata 
tableMetadata = metadata.getTableMetadata(session, getTableHandle(metadata, tableName)); + assertEquals(filterNonHiddenColumnMetadata(tableMetadata.getColumns()), CREATE_TABLE_COLUMNS); + + // verify the data + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), overwriteData.getMaterializedRows()); + + // statistics + HiveBasicStatistics tableStatistics = getBasicStatisticsForTable(session, transaction, tableName); + assertEquals(tableStatistics.getRowCount().getAsLong(), overwriteData.getRowCount()); + assertEquals(tableStatistics.getFileCount().getAsLong(), 1L); + assertGreaterThan(tableStatistics.getInMemoryDataSizeInBytes().getAsLong(), 0L); + assertGreaterThan(tableStatistics.getOnDiskDataSizeInBytes().getAsLong(), 0L); + } + } + + // test rollback + Set existingFiles; + try (Transaction transaction = newTransaction()) { + existingFiles = listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()); + assertFalse(existingFiles.isEmpty()); + } + + Path stagingPathRoot; + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(overwriteProperties); + ConnectorMetadata metadata = transaction.getMetadata(); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + // "stage" insert data + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + for (int i = 0; i < 4; i++) { + sink.appendPage(overwriteData.toPage()); + } + Collection fragments = getFutureValue(sink.finish()); + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + + // statistics, visible from within transaction + HiveBasicStatistics tableStatistics = getBasicStatisticsForTable(session, transaction, tableName); + assertEquals(tableStatistics.getRowCount().getAsLong(), overwriteData.getRowCount() * 4L); + + try (Transaction otherTransaction = newTransaction()) { + // statistics, not visible from outside transaction + HiveBasicStatistics otherTableStatistics = getBasicStatisticsForTable(session, otherTransaction, tableName); + assertEquals(otherTableStatistics.getRowCount().getAsLong(), overwriteData.getRowCount()); + } + + // verify we did not modify the table directory + assertEquals(listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()), existingFiles); + + // verify all temp files start with the unique prefix + stagingPathRoot = getStagingPathRoot(insertTableHandle); + HdfsContext context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + Set tempFiles = listAllDataFiles(context, stagingPathRoot); + assertTrue(!tempFiles.isEmpty()); + for (String filePath : tempFiles) { + assertThat(new Path(filePath).getName()).startsWith(session.getQueryId()); + } + + // rollback insert + transaction.rollback(); + } + + // verify temp directory is empty + HdfsContext context = new HdfsContext(newSession(), tableName.getSchemaName(), tableName.getTableName()); + assertTrue(listAllDataFiles(context, stagingPathRoot).isEmpty()); + + // verify the data is unchanged + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + 
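+            // after the rolled-back staged insert, the table should still contain only the rows
+            // written by the last committed insert overwrite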
ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), overwriteData.getMaterializedRows()); + + // verify we did not modify the table directory + assertEquals(listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()), existingFiles); + } + + // verify statistics unchanged + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + HiveBasicStatistics statistics = getBasicStatisticsForTable(session, transaction, tableName); + assertEquals(statistics.getRowCount().getAsLong(), overwriteData.getRowCount()); + assertEquals(statistics.getFileCount().getAsLong(), 1L); + } + } + + // These are protected so extensions to the hive connector can replace the handle classes + protected Path getStagingPathRoot(ConnectorInsertTableHandle insertTableHandle) + { + HiveInsertTableHandle handle = (HiveInsertTableHandle) insertTableHandle; + LocationService.WriteInfo writeInfo = getLocationService().getQueryWriteInfo(handle.getLocationHandle()); + if (writeInfo.getWriteMode() != STAGE_AND_MOVE_TO_TARGET_DIRECTORY) { + throw new AssertionError("writeMode is not STAGE_AND_MOVE_TO_TARGET_DIRECTORY"); + } + return writeInfo.getWritePath(); + } + + protected Path getStagingPathRoot(ConnectorOutputTableHandle outputTableHandle) + { + HiveOutputTableHandle handle = (HiveOutputTableHandle) outputTableHandle; + return getLocationService() + .getQueryWriteInfo(handle.getLocationHandle()) + .getWritePath(); + } + + protected Path getTargetPathRoot(ConnectorInsertTableHandle insertTableHandle) + { + HiveInsertTableHandle hiveInsertTableHandle = (HiveInsertTableHandle) insertTableHandle; + + return getLocationService() + .getQueryWriteInfo(hiveInsertTableHandle.getLocationHandle()) + .getTargetPath(); + } + + protected Set listAllDataFiles(Transaction transaction, String schemaName, String tableName) + throws IOException + { + HdfsContext hdfsContext = new HdfsContext(newSession(), schemaName, tableName); + HiveIdentity identity = new HiveIdentity(newSession()); + Set existingFiles = new HashSet<>(); + for (String location : listAllDataPaths(identity, transaction.getMetastore(schemaName), schemaName, tableName)) { + existingFiles.addAll(listAllDataFiles(hdfsContext, new Path(location))); + } + return existingFiles; + } + + public static List listAllDataPaths(HiveIdentity identity, SemiTransactionalHiveMetastore metastore, String schemaName, String tableName) + { + ImmutableList.Builder locations = ImmutableList.builder(); + Table table = metastore.getTable(identity, schemaName, tableName).get(); + if (table.getStorage().getLocation() != null) { + // For partitioned table, there should be nothing directly under this directory. + // But including this location in the set makes the directory content assert more + // extensive, which is desirable. 
+ locations.add(table.getStorage().getLocation()); + } + + Optional> partitionNames = metastore.getPartitionNames(identity, schemaName, tableName); + if (partitionNames.isPresent()) { + metastore.getPartitionsByNames(identity, schemaName, tableName, partitionNames.get()).values().stream() + .map(Optional::get) + .map(partition -> partition.getStorage().getLocation()) + .filter(location -> !location.startsWith(table.getStorage().getLocation())) + .forEach(locations::add); + } + + return locations.build(); + } + + protected Set listAllDataFiles(HdfsContext context, Path path) + throws IOException + { + Set result = new HashSet<>(); + FileSystem fileSystem = hdfsEnvironment.getFileSystem(context, path); + if (fileSystem.exists(path)) { + for (FileStatus fileStatus : fileSystem.listStatus(path)) { + if (fileStatus.getPath().getName().startsWith(".presto")) { + // skip hidden files + } + else if (fileStatus.isFile()) { + result.add(fileStatus.getPath().toString()); + } + else if (fileStatus.isDirectory()) { + result.addAll(listAllDataFiles(context, fileStatus.getPath())); + } + } + } + return result; + } + + private void doInsertIntoNewPartition(HiveStorageFormat storageFormat, SchemaTableName tableName) + throws Exception + { + // creating the table + doCreateEmptyTable(tableName, storageFormat, CREATE_TABLE_COLUMNS_PARTITIONED); + + // insert the data + String queryId = insertData(tableName, CREATE_TABLE_PARTITIONED_DATA); + + Set existingFiles; + try (Transaction transaction = newTransaction()) { + // verify partitions were created + HiveIdentity identity = new HiveIdentity(newSession()); + List partitionNames = transaction.getMetastore(tableName.getSchemaName()).getPartitionNames(identity, tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new AssertionError("Table does not exist: " + tableName)); + assertEqualsIgnoreOrder(partitionNames, CREATE_TABLE_PARTITIONED_DATA.getMaterializedRows().stream() + .map(row -> "ds=" + row.getField(CREATE_TABLE_PARTITIONED_DATA.getTypes().size() - 1)) + .collect(toList())); + + // verify the node versions in partitions + Map> partitions = getMetastoreClient().getPartitionsByNames(identity, tableName.getSchemaName(), tableName.getTableName(), partitionNames); + assertEquals(partitions.size(), partitionNames.size()); + for (String partitionName : partitionNames) { + Partition partition = partitions.get(partitionName).get(); + assertEquals(partition.getParameters().get(PRESTO_VERSION_NAME), TEST_SERVER_VERSION); + assertEquals(partition.getParameters().get(PRESTO_QUERY_ID_NAME), queryId); + } + + // load the new table + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the data + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.of(storageFormat)); + assertEqualsIgnoreOrder(result.getMaterializedRows(), CREATE_TABLE_PARTITIONED_DATA.getMaterializedRows()); + + // test rollback + existingFiles = listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()); + assertFalse(existingFiles.isEmpty()); + + // test statistics + for (String partitionName : partitionNames) { + HiveBasicStatistics partitionStatistics = getBasicStatisticsForPartition(session, 
transaction, tableName, partitionName); + assertEquals(partitionStatistics.getRowCount().getAsLong(), 1L); + assertEquals(partitionStatistics.getFileCount().getAsLong(), 1L); + assertGreaterThan(partitionStatistics.getInMemoryDataSizeInBytes().getAsLong(), 0L); + assertGreaterThan(partitionStatistics.getOnDiskDataSizeInBytes().getAsLong(), 0L); + } + } + + Path stagingPathRoot; + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + metadata.beginQuery(session); + // "stage" insert data + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + stagingPathRoot = getStagingPathRoot(insertTableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + sink.appendPage(CREATE_TABLE_PARTITIONED_DATA_2ND.toPage()); + Collection fragments = getFutureValue(sink.finish()); + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + + // verify all temp files start with the unique prefix + HdfsContext context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + Set tempFiles = listAllDataFiles(context, getStagingPathRoot(insertTableHandle)); + assertTrue(!tempFiles.isEmpty()); + for (String filePath : tempFiles) { + assertThat(new Path(filePath).getName()).startsWith(session.getQueryId()); + } + + // rollback insert + transaction.rollback(); + } + + // verify the data is unchanged + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), CREATE_TABLE_PARTITIONED_DATA.getMaterializedRows()); + + // verify we did not modify the table directory + assertEquals(listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()), existingFiles); + + // verify temp directory is empty + HdfsContext context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + assertTrue(listAllDataFiles(context, stagingPathRoot).isEmpty()); + } + } + + private void doInsertUnsupportedWriteType(HiveStorageFormat storageFormat, SchemaTableName tableName) + throws Exception + { + List columns = ImmutableList.of(new Column("dummy", HiveType.valueOf("uniontype"), Optional.empty())); + List partitionColumns = ImmutableList.of(new Column("name", HIVE_STRING, Optional.empty())); + + createEmptyTable(tableName, storageFormat, columns, partitionColumns); + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + metadata.beginInsert(session, tableHandle); + fail("expected failure"); + } + catch (PrestoException e) { + assertThat(e).hasMessageMatching("Inserting into Hive table .* with column type uniontype not supported"); + } + } + + 
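+    /**
+     * Exercises repeated inserts into existing partitions: the same partitioned data set is inserted
+     * three times, and after each pass the accumulated rows and the per-partition basic statistics
+     * (row and file counts) are verified, followed by a rollback test of a staged insert.
+     */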
private void doInsertIntoExistingPartition(HiveStorageFormat storageFormat, SchemaTableName tableName) + throws Exception + { + // creating the table + doCreateEmptyTable(tableName, storageFormat, CREATE_TABLE_COLUMNS_PARTITIONED); + + MaterializedResult.Builder resultBuilder = MaterializedResult.resultBuilder(SESSION, CREATE_TABLE_PARTITIONED_DATA.getTypes()); + for (int i = 0; i < 3; i++) { + // insert the data + insertData(tableName, CREATE_TABLE_PARTITIONED_DATA); + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + // verify partitions were created + List partitionNames = transaction.getMetastore(tableName.getSchemaName()).getPartitionNames(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new AssertionError("Table does not exist: " + tableName)); + assertEqualsIgnoreOrder(partitionNames, CREATE_TABLE_PARTITIONED_DATA.getMaterializedRows().stream() + .map(row -> "ds=" + row.getField(CREATE_TABLE_PARTITIONED_DATA.getTypes().size() - 1)) + .collect(toList())); + + // load the new table + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the data + resultBuilder.rows(CREATE_TABLE_PARTITIONED_DATA.getMaterializedRows()); + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.of(storageFormat)); + assertEqualsIgnoreOrder(result.getMaterializedRows(), resultBuilder.build().getMaterializedRows()); + + // test statistics + for (String partitionName : partitionNames) { + HiveBasicStatistics statistics = getBasicStatisticsForPartition(session, transaction, tableName, partitionName); + assertEquals(statistics.getRowCount().getAsLong(), i + 1L); + assertEquals(statistics.getFileCount().getAsLong(), i + 1L); + assertGreaterThan(statistics.getInMemoryDataSizeInBytes().getAsLong(), 0L); + assertGreaterThan(statistics.getOnDiskDataSizeInBytes().getAsLong(), 0L); + } + } + } + + // test rollback + Set existingFiles; + Path stagingPathRoot; + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + + existingFiles = listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()); + assertFalse(existingFiles.isEmpty()); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + + metadata.beginQuery(session); + // "stage" insert data + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + stagingPathRoot = getStagingPathRoot(insertTableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + sink.appendPage(CREATE_TABLE_PARTITIONED_DATA.toPage()); + sink.appendPage(CREATE_TABLE_PARTITIONED_DATA.toPage()); + Collection fragments = getFutureValue(sink.finish()); + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + + // verify all temp files start with the unique prefix + HdfsContext context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + Set tempFiles = listAllDataFiles(context, getStagingPathRoot(insertTableHandle)); + assertTrue(!tempFiles.isEmpty()); + for (String filePath : 
tempFiles) { + assertThat(new Path(filePath).getName()).startsWith(session.getQueryId()); + } + + // verify statistics are visible from within of the current transaction + List partitionNames = transaction.getMetastore(tableName.getSchemaName()).getPartitionNames(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new AssertionError("Table does not exist: " + tableName)); + for (String partitionName : partitionNames) { + HiveBasicStatistics partitionStatistics = getBasicStatisticsForPartition(session, transaction, tableName, partitionName); + assertEquals(partitionStatistics.getRowCount().getAsLong(), 5L); + } + + // rollback insert + transaction.rollback(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the data is unchanged + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.empty()); + assertEqualsIgnoreOrder(result.getMaterializedRows(), resultBuilder.build().getMaterializedRows()); + + // verify we did not modify the table directory + assertEquals(listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()), existingFiles); + + // verify temp directory is empty + HdfsContext hdfsContext = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + assertTrue(listAllDataFiles(hdfsContext, stagingPathRoot).isEmpty()); + + // verify statistics have been rolled back + HiveIdentity identity = new HiveIdentity(session); + List partitionNames = transaction.getMetastore(tableName.getSchemaName()).getPartitionNames(identity, tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new AssertionError("Table does not exist: " + tableName)); + for (String partitionName : partitionNames) { + HiveBasicStatistics partitionStatistics = getBasicStatisticsForPartition(session, transaction, tableName, partitionName); + assertEquals(partitionStatistics.getRowCount().getAsLong(), 3L); + } + } + } + + private void doInsertIntoExistingPartitionEmptyStatistics(HiveStorageFormat storageFormat, SchemaTableName tableName) + throws Exception + { + ConnectorSession session = newSession(); + doCreateEmptyTable(tableName, storageFormat, CREATE_TABLE_COLUMNS_PARTITIONED); + insertData(tableName, CREATE_TABLE_PARTITIONED_DATA); + + eraseStatistics(tableName); + + insertData(tableName, CREATE_TABLE_PARTITIONED_DATA); + + try (Transaction transaction = newTransaction()) { + List partitionNames = transaction.getMetastore(tableName.getSchemaName()).getPartitionNames(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new AssertionError("Table does not exist: " + tableName)); + + for (String partitionName : partitionNames) { + HiveBasicStatistics statistics = getBasicStatisticsForPartition(session, transaction, tableName, partitionName); + assertThat(statistics.getRowCount()).isNotPresent(); + assertThat(statistics.getInMemoryDataSizeInBytes()).isNotPresent(); + // fileCount and rawSize statistics are computed on the fly by the metastore, thus cannot be erased + } + } + } + + private static HiveBasicStatistics 
getBasicStatisticsForTable(ConnectorSession session, Transaction transaction, SchemaTableName table) + { + return transaction + .getMetastore(table.getSchemaName()) + .getTableStatistics(new HiveIdentity(session), table.getSchemaName(), table.getTableName()) + .getBasicStatistics(); + } + + private static HiveBasicStatistics getBasicStatisticsForPartition(ConnectorSession session, Transaction transaction, SchemaTableName table, String partitionName) + { + HiveIdentity identity = new HiveIdentity(session); + return transaction + .getMetastore(table.getSchemaName()) + .getPartitionStatistics(identity, table.getSchemaName(), table.getTableName(), ImmutableSet.of(partitionName), transaction.getMetastore(table.getSchemaName()).getTable(identity, table.getSchemaName(), table.getTableName())) + .get(partitionName) + .getBasicStatistics(); + } + + private void eraseStatistics(SchemaTableName schemaTableName) + { + HiveMetastore metastoreClient = getMetastoreClient(); + HiveIdentity identity = new HiveIdentity(SESSION); + metastoreClient.updateTableStatistics(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName(), statistics -> new PartitionStatistics(createEmptyStatistics(), ImmutableMap.of())); + Table table = metastoreClient.getTable(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName()) + .orElseThrow(() -> new TableNotFoundException(schemaTableName)); + List partitionColumns = table.getPartitionColumns().stream() + .map(Column::getName) + .collect(toImmutableList()); + if (!table.getPartitionColumns().isEmpty()) { + List partitionNames = metastoreClient.getPartitionNames(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName()) + .orElse(ImmutableList.of()); + List partitions = metastoreClient + .getPartitionsByNames(identity, schemaTableName.getSchemaName(), schemaTableName.getTableName(), partitionNames) + .entrySet() + .stream() + .map(Map.Entry::getValue) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(toImmutableList()); + for (Partition partition : partitions) { + metastoreClient.updatePartitionStatistics( + identity, + schemaTableName.getSchemaName(), + schemaTableName.getTableName(), + makePartName(partitionColumns, partition.getValues()), + statistics -> new PartitionStatistics(createEmptyStatistics(), ImmutableMap.of())); + } + } + } + + /** + * @return query id + */ + private String insertData(SchemaTableName tableName, MaterializedResult data) + throws Exception + { + return insertData(tableName, data, ImmutableMap.of()); + } + + private String insertOverwriteData(SchemaTableName tableName, MaterializedResult data, Map sessionProperties) + throws Exception + { + Path writePath; + Path targetPath; + String queryId; + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(sessionProperties); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + metadata.beginQuery(session); + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle, true); + queryId = session.getQueryId(); + writePath = getStagingPathRoot(insertTableHandle); + targetPath = getTargetPathRoot(insertTableHandle); + + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + + // write data + sink.appendPage(data.toPage()); + Collection fragments = getFutureValue(sink.finish()); + + // commit the insert + metadata.finishInsert(session, 
insertTableHandle, fragments, ImmutableList.of()); + transaction.commit(); + } + + // check that temporary files are removed + if (!writePath.equals(targetPath)) { + HdfsContext context = new HdfsContext(newSession(), tableName.getSchemaName(), tableName.getTableName()); + FileSystem fileSystem = hdfsEnvironment.getFileSystem(context, writePath); + assertFalse(fileSystem.exists(writePath)); + } + + return queryId; + } + + private String insertData(SchemaTableName tableName, MaterializedResult data, Map sessionProperties) + throws Exception + { + Path writePath; + Path targetPath; + String queryId; + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(sessionProperties); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + metadata.beginQuery(session); + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + queryId = session.getQueryId(); + writePath = getStagingPathRoot(insertTableHandle); + targetPath = getTargetPathRoot(insertTableHandle); + + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + + // write data + sink.appendPage(data.toPage()); + Collection fragments = getFutureValue(sink.finish()); + + // commit the insert + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + transaction.commit(); + } + + // check that temporary files are removed + if (!writePath.equals(targetPath)) { + HdfsContext context = new HdfsContext(newSession(), tableName.getSchemaName(), tableName.getTableName()); + FileSystem fileSystem = hdfsEnvironment.getFileSystem(context, writePath); + assertFalse(fileSystem.exists(writePath)); + } + + return queryId; + } + + private void doTestMetadataDelete(HiveStorageFormat storageFormat, SchemaTableName tableName) + throws Exception + { + // creating the table + doCreateEmptyTable(tableName, storageFormat, CREATE_TABLE_COLUMNS_PARTITIONED); + + insertData(tableName, CREATE_TABLE_PARTITIONED_DATA); + + MaterializedResult.Builder expectedResultBuilder = MaterializedResult.resultBuilder(SESSION, CREATE_TABLE_PARTITIONED_DATA.getTypes()); + expectedResultBuilder.rows(CREATE_TABLE_PARTITIONED_DATA.getMaterializedRows()); + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + // verify partitions were created + List partitionNames = transaction.getMetastore(tableName.getSchemaName()).getPartitionNames(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new AssertionError("Table does not exist: " + tableName)); + assertEqualsIgnoreOrder(partitionNames, CREATE_TABLE_PARTITIONED_DATA.getMaterializedRows().stream() + .map(row -> "ds=" + row.getField(CREATE_TABLE_PARTITIONED_DATA.getTypes().size() - 1)) + .collect(toList())); + + // verify table directory is not empty + Set filesAfterInsert = listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()); + assertFalse(filesAfterInsert.isEmpty()); + + // verify the data + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), 
OptionalInt.empty(), Optional.of(storageFormat)); + assertEqualsIgnoreOrder(result.getMaterializedRows(), expectedResultBuilder.build().getMaterializedRows()); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + + // get ds column handle + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + HiveColumnHandle dsColumnHandle = (HiveColumnHandle) metadata.getColumnHandles(session, tableHandle).get("ds"); + + // delete ds=2015-07-03 + session = newSession(); + TupleDomain tupleDomain = TupleDomain.fromFixedValues(ImmutableMap.of(dsColumnHandle, NullableValue.of(createUnboundedVarcharType(), utf8Slice("2015-07-03")))); + Constraint constraint = new Constraint(tupleDomain, convertToPredicate(tupleDomain)); + tableHandle = applyFilter(metadata, tableHandle, constraint); + tableHandle = metadata.applyDelete(session, tableHandle).get(); + metadata.executeDelete(session, tableHandle); + + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + HiveColumnHandle dsColumnHandle = (HiveColumnHandle) metadata.getColumnHandles(session, tableHandle).get("ds"); + int dsColumnOrdinalPosition = columnHandles.indexOf(dsColumnHandle); + + // verify the data + ImmutableList expectedRows = expectedResultBuilder.build().getMaterializedRows().stream() + .filter(row -> !"2015-07-03".equals(row.getField(dsColumnOrdinalPosition))) + .collect(toImmutableList()); + MaterializedResult actualAfterDelete = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.of(storageFormat)); + assertEqualsIgnoreOrder(actualAfterDelete.getMaterializedRows(), expectedRows); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + HiveColumnHandle dsColumnHandle = (HiveColumnHandle) metadata.getColumnHandles(session, tableHandle).get("ds"); + + // delete ds=2015-07-01 and 2015-07-02 + session = newSession(); + TupleDomain tupleDomain2 = TupleDomain.withColumnDomains( + ImmutableMap.of(dsColumnHandle, Domain.create(ValueSet.ofRanges(Range.range(createUnboundedVarcharType(), utf8Slice("2015-07-01"), true, utf8Slice("2015-07-02"), true)), false))); + Constraint constraint2 = new Constraint(tupleDomain2, convertToPredicate(tupleDomain2)); + tableHandle = applyFilter(metadata, tableHandle, constraint2); + tableHandle = metadata.applyDelete(session, tableHandle).get(); + metadata.executeDelete(session, tableHandle); + + transaction.commit(); + } + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the data + session = newSession(); + MaterializedResult actualAfterDelete2 = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), 
OptionalInt.empty(), Optional.of(storageFormat)); + assertEqualsIgnoreOrder(actualAfterDelete2.getMaterializedRows(), ImmutableList.of()); + + // verify table directory is empty + Set filesAfterDelete = listAllDataFiles(transaction, tableName.getSchemaName(), tableName.getTableName()); + assertTrue(filesAfterDelete.isEmpty()); + } + } + + protected void assertGetRecords(String tableName, HiveStorageFormat hiveStorageFormat) + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + + ConnectorTableHandle tableHandle = getTableHandle(metadata, new SchemaTableName(database, tableName)); + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, tableHandle); + ConnectorSplit split = getHiveSplit(tableHandle, transaction, session); + + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, tableHandle).values()); + + ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles); + HiveSplit hiveSplit = getOnlyElement(((HiveSplitWrapper) split).getSplits()); + assertGetRecords(hiveStorageFormat, tableMetadata, hiveSplit, pageSource, columnHandles); + } + } + + protected ConnectorSplit getHiveSplit(ConnectorTableHandle tableHandle, Transaction transaction, ConnectorSession session) + { + List splits = getAllSplits(tableHandle, transaction, session); + assertEquals(splits.size(), 1); + return getOnlyElement(splits); + } + + protected void assertGetRecords( + HiveStorageFormat hiveStorageFormat, + ConnectorTableMetadata tableMetadata, + HiveSplit hiveSplit, + ConnectorPageSource pageSource, + List columnHandles) + throws IOException + { + try { + MaterializedResult result = materializeSourceDataStream(newSession(), pageSource, getTypes(columnHandles)); + + assertPageSourceType(pageSource, hiveStorageFormat); + + ImmutableMap columnIndex = indexColumns(tableMetadata); + + long rowNumber = 0; + long completedBytes = 0; + for (MaterializedRow row : result) { + try { + assertValueTypes(row, tableMetadata.getColumns()); + } + catch (RuntimeException e) { + throw new RuntimeException("row " + rowNumber, e); + } + + rowNumber++; + Integer index; + Object value; + + // STRING + index = columnIndex.get("t_string"); + value = row.getField(index); + if (rowNumber % 19 == 0) { + assertNull(value); + } + else if (rowNumber % 19 == 1) { + assertEquals(value, ""); + } + else { + assertEquals(value, "test"); + } + + // NUMBERS + assertEquals(row.getField(columnIndex.get("t_tinyint")), (byte) (1 + rowNumber)); + assertEquals(row.getField(columnIndex.get("t_smallint")), (short) (2 + rowNumber)); + assertEquals(row.getField(columnIndex.get("t_int")), (int) (3 + rowNumber)); + + index = columnIndex.get("t_bigint"); + if ((rowNumber % 13) == 0) { + assertNull(row.getField(index)); + } + else { + assertEquals(row.getField(index), 4 + rowNumber); + } + + assertEquals((Float) row.getField(columnIndex.get("t_float")), 5.1f + rowNumber, 0.001); + assertEquals(row.getField(columnIndex.get("t_double")), 6.2 + rowNumber); + + // BOOLEAN + index = columnIndex.get("t_boolean"); + if ((rowNumber % 3) == 2) { + assertNull(row.getField(index)); + } + else { + assertEquals(row.getField(index), (rowNumber % 3) != 0); + } + + // TIMESTAMP + index = columnIndex.get("t_timestamp"); + if (index != null) { + if ((rowNumber % 17) == 0) { + 
assertNull(row.getField(index)); + } + else { + SqlTimestamp expected = sqlTimestampOf(2011, 5, 6, 7, 8, 9, 123); + assertEquals(row.getField(index), expected); + } + } + + // BINARY + index = columnIndex.get("t_binary"); + if (index != null) { + if ((rowNumber % 23) == 0) { + assertNull(row.getField(index)); + } + else { + assertEquals(row.getField(index), new SqlVarbinary("test binary".getBytes(UTF_8))); + } + } + + // DATE + index = columnIndex.get("t_date"); + if (index != null) { + if ((rowNumber % 37) == 0) { + assertNull(row.getField(index)); + } + else { + SqlDate expected = new SqlDate(toIntExact(MILLISECONDS.toDays(new DateTime(2013, 8, 9, 0, 0, 0, UTC).getMillis()))); + assertEquals(row.getField(index), expected); + } + } + + // VARCHAR(50) + index = columnIndex.get("t_varchar"); + if (index != null) { + value = row.getField(index); + if (rowNumber % 39 == 0) { + assertNull(value); + } + else if (rowNumber % 39 == 1) { + // https://issues.apache.org/jira/browse/HIVE-13289 + // RCBINARY reads empty VARCHAR as null + if (hiveStorageFormat == RCBINARY) { + assertNull(value); + } + else { + assertEquals(value, ""); + } + } + else { + assertEquals(value, "test varchar"); + } + } + + //CHAR(25) + index = columnIndex.get("t_char"); + if (index != null) { + value = row.getField(index); + if ((rowNumber % 41) == 0) { + assertNull(value); + } + else { + assertEquals(value, (rowNumber % 41) == 1 ? " " : "test char "); + } + } + + // MAP + index = columnIndex.get("t_map"); + if (index != null) { + if ((rowNumber % 27) == 0) { + assertNull(row.getField(index)); + } + else { + assertEquals(row.getField(index), ImmutableMap.of("test key", "test value")); + } + } + + // ARRAY + index = columnIndex.get("t_array_string"); + if (index != null) { + if ((rowNumber % 29) == 0) { + assertNull(row.getField(index)); + } + else { + assertEquals(row.getField(index), ImmutableList.of("abc", "xyz", "data")); + } + } + + // ARRAY + index = columnIndex.get("t_array_timestamp"); + if (index != null) { + if ((rowNumber % 43) == 0) { + assertNull(row.getField(index)); + } + else { + SqlTimestamp expected = sqlTimestampOf(LocalDateTime.of(2011, 5, 6, 7, 8, 9, 123_000_000)); + assertEquals(row.getField(index), ImmutableList.of(expected)); + } + } + + // ARRAY> + index = columnIndex.get("t_array_struct"); + if (index != null) { + if ((rowNumber % 31) == 0) { + assertNull(row.getField(index)); + } + else { + List expected1 = ImmutableList.of("test abc", 0.1); + List expected2 = ImmutableList.of("test xyz", 0.2); + assertEquals(row.getField(index), ImmutableList.of(expected1, expected2)); + } + } + + // STRUCT + index = columnIndex.get("t_struct"); + if (index != null) { + if ((rowNumber % 31) == 0) { + assertNull(row.getField(index)); + } + else { + assertTrue(row.getField(index) instanceof List); + List values = (List) row.getField(index); + assertEquals(values.size(), 2); + assertEquals(values.get(0), "test abc"); + assertEquals(values.get(1), 0.1); + } + } + + // MAP>> + index = columnIndex.get("t_complex"); + if (index != null) { + if ((rowNumber % 33) == 0) { + assertNull(row.getField(index)); + } + else { + List expected1 = ImmutableList.of("test abc", 0.1); + List expected2 = ImmutableList.of("test xyz", 0.2); + assertEquals(row.getField(index), ImmutableMap.of(1, ImmutableList.of(expected1, expected2))); + } + } + + // NEW COLUMN + assertNull(row.getField(columnIndex.get("new_column"))); + + long newCompletedBytes = pageSource.getCompletedBytes(); + assertTrue(newCompletedBytes >= completedBytes); + 
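+ // the page source may not report more completed bytes than the length of the split being read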
assertTrue(newCompletedBytes <= hiveSplit.getLength()); + completedBytes = newCompletedBytes; + } + + assertTrue(completedBytes <= hiveSplit.getLength()); + assertEquals(rowNumber, 100); + } + finally { + pageSource.close(); + } + } + + protected void dropTable(SchemaTableName table) + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + + ConnectorTableHandle handle = metadata.getTableHandle(session, table); + if (handle == null) { + return; + } + + metadata.dropTable(session, handle); + try { + // todo I have no idea why this is needed... maybe there is a propagation delay in the metastore? + metadata.dropTable(session, handle); + fail("expected NotFoundException"); + } + catch (TableNotFoundException expected) { + } + + transaction.commit(); + } + catch (Exception e) { + Logger.get(getClass()).warn(e, "failed to drop table"); + } + } + + protected ConnectorTableHandle getTableHandle(ConnectorMetadata metadata, SchemaTableName tableName) + { + ConnectorTableHandle handle = metadata.getTableHandle(newSession(), tableName); + checkArgument(handle != null, "table not found: %s", tableName); + return handle; + } + + private ConnectorTableHandle applyFilter(ConnectorMetadata metadata, ConnectorTableHandle tableHandle, Constraint constraint) + { + return metadata.applyFilter(newSession(), tableHandle, constraint) + .map(ConstraintApplicationResult::getHandle) + .orElseThrow(AssertionError::new); + } + + private MaterializedResult readTable( + Transaction transaction, + ConnectorTableHandle tableHandle, + List columnHandles, + ConnectorSession session, + TupleDomain tupleDomain, + OptionalInt expectedSplitCount, + Optional expectedStorageFormat) + throws Exception + { + tableHandle = applyFilter(transaction.getMetadata(), tableHandle, new Constraint(tupleDomain)); + List splits = getAllSplits(splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING)); + if (expectedSplitCount.isPresent()) { + assertEquals(splits.size(), expectedSplitCount.getAsInt()); + } + + ImmutableList.Builder allRows = ImmutableList.builder(); + for (ConnectorSplit split : splits) { + try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) { + expectedStorageFormat.ifPresent(format -> assertPageSourceType(pageSource, format)); + MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles)); + allRows.addAll(result.getMaterializedRows()); + } + } + return new MaterializedResult(allRows.build(), getTypes(columnHandles)); + } + + protected HiveMetastore getMetastoreClient() + { + return metastoreClient; + } + + protected LocationService getLocationService() + { + return locationService; + } + + protected static int getSplitCount(ConnectorSplitSource splitSource) + { + int splitCount = 0; + while (!splitSource.isFinished()) { + splitCount += getFutureValue(splitSource.getNextBatch(NOT_PARTITIONED, 1000)).getSplits().size(); + } + return splitCount; + } + + private List getAllSplits(ConnectorTableHandle tableHandle, Transaction transaction, ConnectorSession session) + { + return getAllSplits(splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING)); + } + + protected static List getAllSplits(ConnectorSplitSource splitSource) + { + ImmutableList.Builder splits = ImmutableList.builder(); + while 
(!splitSource.isFinished()) { + splits.addAll(getFutureValue(splitSource.getNextBatch(NOT_PARTITIONED, 1000)).getSplits()); + } + return splits.build(); + } + + protected String getPartitionId(Object partition) + { + return ((HivePartition) partition).getPartitionId(); + } + + protected static void assertPageSourceType(ConnectorPageSource pageSource, HiveStorageFormat hiveStorageFormat) + { + if (pageSource instanceof OrcConcatPageSource) { + pageSource = ((OrcConcatPageSource) pageSource).getConnectorPageSource(); + } + + if (pageSource instanceof RecordPageSource) { + RecordCursor hiveRecordCursor = ((RecordPageSource) pageSource).getCursor(); + hiveRecordCursor = ((HiveRecordCursor) hiveRecordCursor).getRegularColumnRecordCursor(); + if (hiveRecordCursor instanceof HiveCoercionRecordCursor) { + hiveRecordCursor = ((HiveCoercionRecordCursor) hiveRecordCursor).getRegularColumnRecordCursor(); + } + assertInstanceOf(hiveRecordCursor, recordCursorType(hiveStorageFormat), hiveStorageFormat.name()); + } + else { + assertInstanceOf(((HivePageSource) pageSource).getPageSource(), pageSourceType(hiveStorageFormat), hiveStorageFormat.name()); + } + } + + private static Class recordCursorType(HiveStorageFormat hiveStorageFormat) + { + return GenericHiveRecordCursor.class; + } + + private static Class pageSourceType(HiveStorageFormat hiveStorageFormat) + { + switch (hiveStorageFormat) { + case RCTEXT: + case RCBINARY: + return RcFilePageSource.class; + case ORC: + return OrcPageSource.class; + case PARQUET: + return ParquetPageSource.class; + default: + throw new AssertionError("File type does not use a PageSource: " + hiveStorageFormat); + } + } + + private static void assertValueTypes(MaterializedRow row, List schema) + { + for (int columnIndex = 0; columnIndex < schema.size(); columnIndex++) { + ColumnMetadata column = schema.get(columnIndex); + Object value = row.getField(columnIndex); + if (value != null) { + if (BOOLEAN.equals(column.getType())) { + assertInstanceOf(value, Boolean.class); + } + else if (TINYINT.equals(column.getType())) { + assertInstanceOf(value, Byte.class); + } + else if (SMALLINT.equals(column.getType())) { + assertInstanceOf(value, Short.class); + } + else if (INTEGER.equals(column.getType())) { + assertInstanceOf(value, Integer.class); + } + else if (BIGINT.equals(column.getType())) { + assertInstanceOf(value, Long.class); + } + else if (DOUBLE.equals(column.getType())) { + assertInstanceOf(value, Double.class); + } + else if (REAL.equals(column.getType())) { + assertInstanceOf(value, Float.class); + } + else if (isVarcharType(column.getType())) { + assertInstanceOf(value, String.class); + } + else if (isCharType(column.getType())) { + assertInstanceOf(value, String.class); + } + else if (VARBINARY.equals(column.getType())) { + assertInstanceOf(value, SqlVarbinary.class); + } + else if (TIMESTAMP.equals(column.getType())) { + assertInstanceOf(value, SqlTimestamp.class); + } + else if (DATE.equals(column.getType())) { + assertInstanceOf(value, SqlDate.class); + } + else if (column.getType() instanceof ArrayType || column.getType() instanceof RowType) { + assertInstanceOf(value, List.class); + } + else if (column.getType() instanceof MapType) { + assertInstanceOf(value, Map.class); + } + else { + fail("Unknown primitive type " + columnIndex); + } + } + } + } + + private static void assertPrimitiveField(Map map, String name, Type type, boolean partitionKey) + { + assertTrue(map.containsKey(name)); + ColumnMetadata column = map.get(name); + assertEquals(column.getType(), 
type, name); + assertEquals(column.getExtraInfo(), columnExtraInfo(partitionKey)); + } + + protected static ImmutableMap indexColumns(List columnHandles) + { + ImmutableMap.Builder index = ImmutableMap.builder(); + int i = 0; + for (ColumnHandle columnHandle : columnHandles) { + HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) columnHandle; + index.put(hiveColumnHandle.getName(), i); + i++; + } + return index.build(); + } + + protected static ImmutableMap indexColumns(ConnectorTableMetadata tableMetadata) + { + ImmutableMap.Builder index = ImmutableMap.builder(); + int i = 0; + for (ColumnMetadata columnMetadata : tableMetadata.getColumns()) { + index.put(columnMetadata.getName(), i); + i++; + } + return index.build(); + } + + protected SchemaTableName temporaryTable(String tableName) + { + return temporaryTable(database, tableName); + } + + protected static SchemaTableName temporaryTable(String database, String tableName) + { + String randomName = UUID.randomUUID().toString().toLowerCase(ENGLISH).replace("-", ""); + return new SchemaTableName(database, TEMPORARY_TABLE_PREFIX + tableName + "_" + randomName); + } + + protected static Map createTableProperties(HiveStorageFormat storageFormat) + { + return createTableProperties(storageFormat, ImmutableList.of()); + } + + private static Map createTableProperties(HiveStorageFormat storageFormat, Iterable parititonedBy) + { + return ImmutableMap.builder() + .put(STORAGE_FORMAT_PROPERTY, storageFormat) + .put(PARTITIONED_BY_PROPERTY, ImmutableList.copyOf(parititonedBy)) + .put(BUCKETED_BY_PROPERTY, ImmutableList.of()) + .put(BUCKET_COUNT_PROPERTY, 0) + .put(SORTED_BY_PROPERTY, ImmutableList.of()) + .build(); + } + + protected static List filterNonHiddenColumnHandles(Collection columnHandles) + { + return columnHandles.stream() + .filter(columnHandle -> !((HiveColumnHandle) columnHandle).isHidden()) + .collect(toList()); + } + + protected static List filterNonHiddenColumnMetadata(Collection columnMetadatas) + { + return columnMetadatas.stream() + .filter(columnMetadata -> !columnMetadata.isHidden()) + .collect(toList()); + } + + private void createEmptyTable(SchemaTableName schemaTableName, HiveStorageFormat hiveStorageFormat, List columns, List partitionColumns) + throws Exception + { + createEmptyTable(schemaTableName, hiveStorageFormat, columns, partitionColumns, Optional.empty()); + } + + private void createEmptyTable(SchemaTableName schemaTableName, HiveStorageFormat hiveStorageFormat, List columns, List partitionColumns, Optional bucketProperty) + throws Exception + { + Path targetPath; + + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + + String tableOwner = session.getUser(); + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + + LocationService locationService = getLocationService(); + LocationHandle locationHandle = locationService.forNewTable(transaction.getMetastore(schemaName), session, schemaName, tableName, Optional.empty(), Optional.empty(), HiveWriteUtils.OpertionType.CREATE_TABLE); + targetPath = locationService.getQueryWriteInfo(locationHandle).getTargetPath(); + + Table.Builder tableBuilder = Table.builder() + .setDatabaseName(schemaName) + .setTableName(tableName) + .setOwner(tableOwner) + .setTableType(TableType.MANAGED_TABLE.name()) + .setParameters(ImmutableMap.of( + PRESTO_VERSION_NAME, TEST_SERVER_VERSION, + PRESTO_QUERY_ID_NAME, session.getQueryId())) + .setDataColumns(columns) + 
.setPartitionColumns(partitionColumns); + + tableBuilder.getStorageBuilder() + .setLocation(targetPath.toString()) + .setStorageFormat(StorageFormat.create(hiveStorageFormat.getSerDe(), hiveStorageFormat.getInputFormat(), hiveStorageFormat.getOutputFormat())) + .setBucketProperty(bucketProperty) + .setSerdeParameters(ImmutableMap.of()); + + PrincipalPrivileges principalPrivileges = testingPrincipalPrivilege(tableOwner, session.getUser()); + transaction.getMetastore(schemaName).createTable(session, tableBuilder.build(), principalPrivileges, Optional.empty(), true, EMPTY_TABLE_STATISTICS); + + transaction.commit(); + } + + HdfsContext context = new HdfsContext(newSession(), schemaTableName.getSchemaName(), schemaTableName.getTableName()); + List targetDirectoryList = listDirectory(context, targetPath); + assertEquals(targetDirectoryList, ImmutableList.of()); + } + + private void alterBucketProperty(SchemaTableName schemaTableName, Optional bucketProperty) + { + try (Transaction transaction = newTransaction()) { + ConnectorSession session = newSession(); + + String tableOwner = session.getUser(); + String schemaName = schemaTableName.getSchemaName(); + String tableName = schemaTableName.getTableName(); + + Optional
table = transaction.getMetastore(schemaName).getTable(new HiveIdentity(session), schemaName, tableName); + Table.Builder tableBuilder = Table.builder(table.get()); + tableBuilder.getStorageBuilder().setBucketProperty(bucketProperty); + PrincipalPrivileges principalPrivileges = testingPrincipalPrivilege(tableOwner, session.getUser()); + // hack: replaceView can be used as replaceTable despite its name + transaction.getMetastore(schemaName).replaceView(new HiveIdentity(session), schemaName, tableName, tableBuilder.build(), principalPrivileges); + + transaction.commit(); + } + } + + private PrincipalPrivileges testingPrincipalPrivilege(ConnectorSession session) + { + return testingPrincipalPrivilege(session.getUser(), session.getUser()); + } + + private PrincipalPrivileges testingPrincipalPrivilege(String tableOwner, String grantor) + { + return new PrincipalPrivileges( + ImmutableMultimap.builder() + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.SELECT, true, new HivePrincipal(USER, grantor), new HivePrincipal(USER, grantor))) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.INSERT, true, new HivePrincipal(USER, grantor), new HivePrincipal(USER, grantor))) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.UPDATE, true, new HivePrincipal(USER, grantor), new HivePrincipal(USER, grantor))) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.DELETE, true, new HivePrincipal(USER, grantor), new HivePrincipal(USER, grantor))) + .build(), + ImmutableMultimap.of()); + } + + private List listDirectory(HdfsContext context, Path path) + throws IOException + { + FileSystem fileSystem = hdfsEnvironment.getFileSystem(context, path); + return Arrays.stream(fileSystem.listStatus(path)) + .map(FileStatus::getPath) + .map(Path::getName) + .filter(name -> !name.startsWith(".presto")) + .collect(toList()); + } + + @Test + public void testTransactionDeleteInsert() + throws Exception + { + doTestTransactionDeleteInsert( + RCBINARY, + true, + ImmutableList.builder() + .add(new TransactionDeleteInsertTestCase(false, false, ROLLBACK_RIGHT_AWAY, Optional.empty())) + .add(new TransactionDeleteInsertTestCase(false, false, ROLLBACK_AFTER_DELETE, Optional.empty())) + .add(new TransactionDeleteInsertTestCase(false, false, ROLLBACK_AFTER_BEGIN_INSERT, Optional.empty())) + .add(new TransactionDeleteInsertTestCase(false, false, ROLLBACK_AFTER_APPEND_PAGE, Optional.empty())) + .add(new TransactionDeleteInsertTestCase(false, false, ROLLBACK_AFTER_SINK_FINISH, Optional.empty())) + .add(new TransactionDeleteInsertTestCase(false, false, ROLLBACK_AFTER_FINISH_INSERT, Optional.empty())) + .add(new TransactionDeleteInsertTestCase(false, false, COMMIT, Optional.of(new AddPartitionFailure()))) + .add(new TransactionDeleteInsertTestCase(false, false, COMMIT, Optional.of(new DirectoryRenameFailure()))) + .add(new TransactionDeleteInsertTestCase(false, false, COMMIT, Optional.of(new FileRenameFailure()))) + .add(new TransactionDeleteInsertTestCase(true, false, COMMIT, Optional.of(new DropPartitionFailure()))) + .add(new TransactionDeleteInsertTestCase(true, true, COMMIT, Optional.empty())) + .build()); + } + + @Test + public void testPreferredInsertLayout() + throws Exception + { + SchemaTableName tableName = temporaryTable("empty_partitioned_table"); + + try { + Column partitioningColumn = new Column("column2", HIVE_STRING, Optional.empty()); + List columns = ImmutableList.of( + new Column("column1", HIVE_STRING, Optional.empty()), + 
partitioningColumn); + createEmptyTable(tableName, ORC, columns, ImmutableList.of(partitioningColumn)); + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(ImmutableMap.of("write_partition_distribution", true)); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + Optional insertLayout = metadata.getInsertLayout(session, tableHandle); + assertTrue(insertLayout.isPresent()); + assertFalse(insertLayout.get().getPartitioning().isPresent()); + assertEquals(insertLayout.get().getPartitionColumns(), ImmutableList.of(partitioningColumn.getName())); + } + } + finally { + dropTable(tableName); + } + } + + @Test + public void testInsertLayoutWithWritePartitionDistributionDisabled() + throws Exception + { + SchemaTableName tableName = temporaryTable("empty_partitioned_table"); + + try { + Column partitioningColumn = new Column("column2", HIVE_STRING, Optional.empty()); + List columns = ImmutableList.of( + new Column("column1", HIVE_STRING, Optional.empty()), + partitioningColumn); + createEmptyTable(tableName, ORC, columns, ImmutableList.of(partitioningColumn)); + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + Optional insertLayout = metadata.getInsertLayout(session, tableHandle); + assertFalse(insertLayout.isPresent()); + } + } + finally { + dropTable(tableName); + } + } + + @Test + public void testPreferredCreateTableLayout() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(ImmutableMap.of("write_partition_distribution", true)); + Optional newTableLayout = metadata.getNewTableLayout( + session, + new ConnectorTableMetadata( + new SchemaTableName("schema", "table"), + ImmutableList.of( + new ColumnMetadata("column1", BIGINT), + new ColumnMetadata("column2", BIGINT)), + ImmutableMap.of( + PARTITIONED_BY_PROPERTY, ImmutableList.of("column2"), + BUCKETED_BY_PROPERTY, ImmutableList.of(), + BUCKET_COUNT_PROPERTY, 0, + SORTED_BY_PROPERTY, ImmutableList.of()))); + assertTrue(newTableLayout.isPresent()); + assertFalse(newTableLayout.get().getPartitioning().isPresent()); + assertEquals(newTableLayout.get().getPartitionColumns(), ImmutableList.of("column2")); + } + } + + @Test + public void testCreateTableLayoutWithWriteDistributionDisabled() + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + Optional newTableLayout = metadata.getNewTableLayout( + session, + new ConnectorTableMetadata( + new SchemaTableName("schema", "table"), + ImmutableList.of( + new ColumnMetadata("column1", BIGINT), + new ColumnMetadata("column2", BIGINT)), + ImmutableMap.of( + PARTITIONED_BY_PROPERTY, ImmutableList.of("column2"), + BUCKETED_BY_PROPERTY, ImmutableList.of(), + BUCKET_COUNT_PROPERTY, 0, + SORTED_BY_PROPERTY, ImmutableList.of()))); + assertFalse(newTableLayout.isPresent()); + } + } + + @Test + public void testUpdateLayout() + throws Exception + { + SchemaTableName tableName = temporaryTable("empty_partitioned_table"); + + try { + Column partitioningColumn = new Column("column2", HIVE_STRING, Optional.empty()); + List columns = ImmutableList.of( + new Column("column1", HIVE_STRING, Optional.empty()), + 
partitioningColumn); + createEmptyTable(tableName, ORC, columns, ImmutableList.of(partitioningColumn)); + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + Optional updateLayout = metadata.getUpdateLayout(session, tableHandle); + assertTrue(updateLayout.isPresent()); + assertTrue(updateLayout.get().getPartitioning().isPresent()); + assertEquals(updateLayout.get().getPartitionColumns(), ImmutableList.of(partitioningColumn.getName(), HiveColumnHandle.UPDATE_ROW_ID_COLUMN_NAME.toLowerCase(ENGLISH))); + } + } + finally { + dropTable(tableName); + } + } + + protected void doTestTransactionDeleteInsert(HiveStorageFormat storageFormat, boolean allowInsertExisting, List testCases) + throws Exception + { + // There are 4 types of operations on a partition: add, drop, alter (drop then add), insert existing. + // There are 12 partitions in this test, 3 for each type. + // 3 is chosen to verify that cleanups, commit aborts, rollbacks are always as complete as possible regardless of failure. + MaterializedResult beforeData = + MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()) + .row(110L, "a", "alter1") + .row(120L, "a", "insert1") + .row(140L, "a", "drop1") + .row(210L, "b", "drop2") + .row(310L, "c", "alter2") + .row(320L, "c", "alter3") + .row(510L, "e", "drop3") + .row(610L, "f", "insert2") + .row(620L, "f", "insert3") + .build(); + Domain domainToDrop = Domain.create(ValueSet.of( + createUnboundedVarcharType(), + utf8Slice("alter1"), utf8Slice("alter2"), utf8Slice("alter3"), utf8Slice("drop1"), utf8Slice("drop2"), utf8Slice("drop3")), + false); + List extraRowsForInsertExisting = ImmutableList.of(); + if (allowInsertExisting) { + extraRowsForInsertExisting = MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()) + .row(121L, "a", "insert1") + .row(611L, "f", "insert2") + .row(621L, "f", "insert3") + .build() + .getMaterializedRows(); + } + MaterializedResult insertData = + MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()) + .row(111L, "a", "alter1") + .row(131L, "a", "add1") + .row(221L, "b", "add2") + .row(311L, "c", "alter2") + .row(321L, "c", "alter3") + .row(411L, "d", "add3") + .rows(extraRowsForInsertExisting) + .build(); + MaterializedResult afterData = + MaterializedResult.resultBuilder(SESSION, BIGINT, createUnboundedVarcharType(), createUnboundedVarcharType()) + .row(120L, "a", "insert1") + .row(610L, "f", "insert2") + .row(620L, "f", "insert3") + .rows(insertData.getMaterializedRows()) + .build(); + + for (TransactionDeleteInsertTestCase testCase : testCases) { + SchemaTableName temporaryDeleteInsert = temporaryTable("delete_insert"); + try { + createEmptyTable( + temporaryDeleteInsert, + storageFormat, + ImmutableList.of(new Column("col1", HIVE_LONG, Optional.empty())), + ImmutableList.of(new Column("pk1", HIVE_STRING, Optional.empty()), new Column("pk2", HIVE_STRING, Optional.empty()))); + insertData(temporaryDeleteInsert, beforeData); + try { + doTestTransactionDeleteInsert( + storageFormat, + temporaryDeleteInsert, + domainToDrop, + insertData, + testCase.isExpectCommitedData() ? 
afterData : beforeData, + testCase.getTag(), + testCase.isExpectQuerySucceed(), + testCase.getConflictTrigger()); + } + catch (AssertionError e) { + throw new AssertionError(format("Test case: %s", testCase.toString()), e); + } + } + finally { + dropTable(temporaryDeleteInsert); + } + } + } + + private void doTestTransactionDeleteInsert( + HiveStorageFormat storageFormat, + SchemaTableName tableName, + Domain domainToDrop, + MaterializedResult insertData, + MaterializedResult expectedData, + TransactionDeleteInsertTestTag tag, + boolean expectQuerySucceed, + Optional conflictTrigger) + throws Exception + { + Path writePath = null; + Path targetPath = null; + + try (Transaction transaction = newTransaction()) { + try { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + ConnectorSession session; + rollbackIfEquals(tag, ROLLBACK_RIGHT_AWAY); + + // Query 1: delete + session = newSession(); + HiveColumnHandle dsColumnHandle = (HiveColumnHandle) metadata.getColumnHandles(session, tableHandle).get("pk2"); + TupleDomain tupleDomain = TupleDomain.withColumnDomains(ImmutableMap.of( + dsColumnHandle, domainToDrop)); + Constraint constraint = new Constraint(tupleDomain, convertToPredicate(tupleDomain)); + tableHandle = applyFilter(metadata, tableHandle, constraint); + tableHandle = metadata.applyDelete(session, tableHandle).get(); + metadata.executeDelete(session, tableHandle); + rollbackIfEquals(tag, ROLLBACK_AFTER_DELETE); + + // Query 2: insert + session = newSession(); + ConnectorInsertTableHandle insertTableHandle = metadata.beginInsert(session, tableHandle); + rollbackIfEquals(tag, ROLLBACK_AFTER_BEGIN_INSERT); + writePath = getStagingPathRoot(insertTableHandle); + targetPath = getTargetPathRoot(insertTableHandle); + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, insertTableHandle); + sink.appendPage(insertData.toPage()); + rollbackIfEquals(tag, ROLLBACK_AFTER_APPEND_PAGE); + Collection fragments = getFutureValue(sink.finish()); + rollbackIfEquals(tag, ROLLBACK_AFTER_SINK_FINISH); + metadata.finishInsert(session, insertTableHandle, fragments, ImmutableList.of()); + rollbackIfEquals(tag, ROLLBACK_AFTER_FINISH_INSERT); + + assertEquals(tag, COMMIT); + + if (conflictTrigger.isPresent()) { + JsonCodec partitionUpdateCodec = JsonCodec.jsonCodec(PartitionUpdate.class); + List partitionUpdates = fragments.stream() + .map(Slice::getBytes) + .map(partitionUpdateCodec::fromJson) + .collect(toList()); + conflictTrigger.get().triggerConflict(session, tableName, insertTableHandle, partitionUpdates); + } + transaction.commit(); + if (conflictTrigger.isPresent()) { + assertTrue(expectQuerySucceed); + conflictTrigger.get().verifyAndCleanup(session, tableName); + } + } + catch (TestingRollbackException e) { + transaction.rollback(); + } + catch (PrestoException e) { + assertFalse(expectQuerySucceed); + if (conflictTrigger.isPresent()) { + conflictTrigger.get().verifyAndCleanup(newSession(), tableName); + } + } + } + + // check that temporary files are removed + if (writePath != null && !writePath.equals(targetPath)) { + HdfsContext context = new HdfsContext(newSession(), tableName.getSchemaName(), tableName.getTableName()); + FileSystem fileSystem = hdfsEnvironment.getFileSystem(context, writePath); + assertFalse(fileSystem.exists(writePath)); + } + + try (Transaction transaction = newTransaction()) { + // verify partitions + List partitionNames = 
transaction.getMetastore(tableName.getSchemaName()) + .getPartitionNames(new HiveIdentity(newSession()), tableName.getSchemaName(), tableName.getTableName()) + .orElseThrow(() -> new AssertionError("Table does not exist: " + tableName)); + assertEqualsIgnoreOrder( + partitionNames, + expectedData.getMaterializedRows().stream() + .map(row -> format("pk1=%s/pk2=%s", row.getField(1), row.getField(2))) + .distinct() + .collect(toList())); + + // load the new table + ConnectorSession session = newSession(); + ConnectorMetadata metadata = transaction.getMetadata(); + metadata.beginQuery(session); + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the data + MaterializedResult result = readTable(transaction, tableHandle, columnHandles, session, TupleDomain.all(), OptionalInt.empty(), Optional.of(storageFormat)); + assertEqualsIgnoreOrder(result.getMaterializedRows(), expectedData.getMaterializedRows()); + } + } + + private static void rollbackIfEquals(TransactionDeleteInsertTestTag tag, TransactionDeleteInsertTestTag expectedTag) + { + if (expectedTag == tag) { + throw new TestingRollbackException(); + } + } + + private static class TestingRollbackException + extends RuntimeException + { + } + + protected static class TransactionDeleteInsertTestCase + { + private final boolean expectCommitedData; + private final boolean expectQuerySucceed; + private final TransactionDeleteInsertTestTag tag; + private final Optional conflictTrigger; + + public TransactionDeleteInsertTestCase(boolean expectCommitedData, boolean expectQuerySucceed, TransactionDeleteInsertTestTag tag, Optional conflictTrigger) + { + this.expectCommitedData = expectCommitedData; + this.expectQuerySucceed = expectQuerySucceed; + this.tag = tag; + this.conflictTrigger = conflictTrigger; + } + + public boolean isExpectCommitedData() + { + return expectCommitedData; + } + + public boolean isExpectQuerySucceed() + { + return expectQuerySucceed; + } + + public TransactionDeleteInsertTestTag getTag() + { + return tag; + } + + public Optional getConflictTrigger() + { + return conflictTrigger; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("tag", tag) + .add("conflictTrigger", conflictTrigger.map(conflictTrigger -> conflictTrigger.getClass().getName())) + .add("expectCommitedData", expectCommitedData) + .add("expectQuerySucceed", expectQuerySucceed) + .toString(); + } + } + + protected enum TransactionDeleteInsertTestTag + { + ROLLBACK_RIGHT_AWAY, + ROLLBACK_AFTER_DELETE, + ROLLBACK_AFTER_BEGIN_INSERT, + ROLLBACK_AFTER_APPEND_PAGE, + ROLLBACK_AFTER_SINK_FINISH, + ROLLBACK_AFTER_FINISH_INSERT, + COMMIT, + } + + protected interface ConflictTrigger + { + void triggerConflict(ConnectorSession session, SchemaTableName tableName, ConnectorInsertTableHandle insertTableHandle, List partitionUpdates) + throws IOException; + + void verifyAndCleanup(ConnectorSession session, SchemaTableName tableName) + throws IOException; + } + + protected class AddPartitionFailure + implements ConflictTrigger + { + private final ImmutableList copyPartitionFrom = ImmutableList.of("a", "insert1"); + private final String partitionNameToConflict = "pk1=b/pk2=add2"; + private Partition conflictPartition; + + @Override + public void triggerConflict(ConnectorSession session, SchemaTableName tableName, ConnectorInsertTableHandle insertTableHandle, List partitionUpdates) + { + // This method 
bypasses transaction interface because this method is inherently hacky and doesn't work well with the transaction abstraction. + // Additionally, this method is not part of a test. Its purpose is to set up an environment for another test. + HiveMetastore metastoreClient = getMetastoreClient(); + Optional partition = metastoreClient.getPartition(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName(), copyPartitionFrom); + conflictPartition = Partition.builder(partition.get()) + .setValues(toPartitionValues(partitionNameToConflict)) + .build(); + metastoreClient.addPartitions( + new HiveIdentity(session), + tableName.getSchemaName(), + tableName.getTableName(), + ImmutableList.of(new PartitionWithStatistics(conflictPartition, partitionNameToConflict, PartitionStatistics.empty()))); + } + + @Override + public void verifyAndCleanup(ConnectorSession session, SchemaTableName tableName) + { + // This method bypasses transaction interface because this method is inherently hacky and doesn't work well with the transaction abstraction. + // Additionally, this method is not part of a test. Its purpose is to set up an environment for another test. + HiveMetastore metastoreClient = getMetastoreClient(); + Optional actualPartition = metastoreClient.getPartition(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName(), toPartitionValues(partitionNameToConflict)); + // Make sure the partition inserted to trigger conflict was not overwritten + // Checking storage location is sufficient because implement never uses .../pk1=a/pk2=a2 as the directory for partition [b, b2]. + assertEquals(actualPartition.get().getStorage().getLocation(), conflictPartition.getStorage().getLocation()); + metastoreClient.dropPartition(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName(), conflictPartition.getValues(), false); + } + } + + protected class DropPartitionFailure + implements ConflictTrigger + { + private final ImmutableList partitionValueToConflict = ImmutableList.of("b", "drop2"); + + @Override + public void triggerConflict(ConnectorSession session, SchemaTableName tableName, ConnectorInsertTableHandle insertTableHandle, List partitionUpdates) + { + // This method bypasses transaction interface because this method is inherently hacky and doesn't work well with the transaction abstraction. + // Additionally, this method is not part of a test. Its purpose is to set up an environment for another test. + HiveMetastore metastoreClient = getMetastoreClient(); + metastoreClient.dropPartition(new HiveIdentity(session), tableName.getSchemaName(), tableName.getTableName(), partitionValueToConflict, false); + } + + @Override + public void verifyAndCleanup(ConnectorSession session, SchemaTableName tableName) + { + // Do not add back the deleted partition because the implementation is expected to move forward instead of backward when delete fails + } + } + + protected class DirectoryRenameFailure + implements ConflictTrigger + { + private HdfsContext context; + private Path path; + + @Override + public void triggerConflict(ConnectorSession session, SchemaTableName tableName, ConnectorInsertTableHandle insertTableHandle, List partitionUpdates) + { + Path writePath = getStagingPathRoot(insertTableHandle); + Path targetPath = getTargetPathRoot(insertTableHandle); + if (writePath.equals(targetPath)) { + // This conflict does not apply. Trigger a rollback right away so that this test case passes. 
+ throw new TestingRollbackException(); + } + path = new Path(targetPath + "/pk1=b/pk2=add2"); + context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + createDirectory(context, hdfsEnvironment, path); + } + + @Override + public void verifyAndCleanup(ConnectorSession session, SchemaTableName tableName) + throws IOException + { + assertEquals(listDirectory(context, path), ImmutableList.of()); + hdfsEnvironment.getFileSystem(context, path).delete(path, false); + } + } + + protected class FileRenameFailure + implements ConflictTrigger + { + private HdfsContext context; + private Path path; + + @Override + public void triggerConflict(ConnectorSession session, SchemaTableName tableName, ConnectorInsertTableHandle insertTableHandle, List<PartitionUpdate> partitionUpdates) + throws IOException + { + for (PartitionUpdate partitionUpdate : partitionUpdates) { + if ("pk2=insert2".equals(partitionUpdate.getTargetPath().getName())) { + path = new Path(partitionUpdate.getTargetPath(), partitionUpdate.getFileNames().get(0)); + break; + } + } + assertNotNull(path); + + context = new HdfsContext(session, tableName.getSchemaName(), tableName.getTableName()); + FileSystem fileSystem = hdfsEnvironment.getFileSystem(context, path); + fileSystem.createNewFile(path); + } + + @Override + public void verifyAndCleanup(ConnectorSession session, SchemaTableName tableName) + throws IOException + { + // The file we added to trigger a conflict was cleaned up because it matches the query prefix. + // Consider this the same as a network failure that caused the successful creation of the file not to be reported to the caller. + assertFalse(hdfsEnvironment.getFileSystem(context, path).exists(path)); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileFormats.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileFormats.java new file mode 100644 index 00000000..4cca5c90 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileFormats.java @@ -0,0 +1,947 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DateType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Decimals; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.SqlDate; +import io.prestosql.spi.type.SqlDecimal; +import io.prestosql.spi.type.SqlTimestamp; +import io.prestosql.spi.type.SqlVarbinary; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.Type; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.testing.MaterializedRow; +import io.prestosql.tests.StructuralTestUtil; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.serde2.Serializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.lang.invoke.MethodHandle; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.concurrent.TimeUnit; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Predicates.not; +import static com.google.common.base.Strings.padEnd; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Iterables.filter; +import static com.google.common.collect.Iterables.transform; +import static io.prestosql.plugin.hive.HdfsConfigurationInitializer.configureCompression; +import 
static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION; +import static io.prestosql.plugin.hive.HiveTestUtils.SESSION; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.plugin.hive.HiveTestUtils.isDistinctFrom; +import static io.prestosql.plugin.hive.HiveTestUtils.mapType; +import static io.prestosql.plugin.hive.HiveUtil.isStructuralType; +import static io.prestosql.plugin.hive.util.SerDeUtils.serializeObject; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.CharType.createCharType; +import static io.prestosql.spi.type.Chars.isCharType; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.spi.type.VarcharType.createVarcharType; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static io.prestosql.testing.DateTimeTestingUtils.sqlTimestampOf; +import static io.prestosql.testing.MaterializedResult.materializeSourceDataStream; +import static io.prestosql.tests.StructuralTestUtil.arrayBlockOf; +import static io.prestosql.tests.StructuralTestUtil.decimalArrayBlockOf; +import static io.prestosql.tests.StructuralTestUtil.decimalMapBlockOf; +import static io.prestosql.tests.StructuralTestUtil.mapBlockOf; +import static io.prestosql.tests.StructuralTestUtil.rowBlockOf; +import static java.lang.Float.intBitsToFloat; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Arrays.fill; +import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardMapObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaHiveVarcharObjectInspector; +import static 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getCharTypeInfo; +import static org.joda.time.DateTimeZone.UTC; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +@Test(groups = "hive") +public abstract class AbstractTestHiveFileFormats +{ + protected static final DateTimeZone HIVE_STORAGE_TIME_ZONE = DateTimeZone.forID("America/Bahia_Banderas"); + + private static final double EPSILON = 0.001; + + private static final long DATE_MILLIS_UTC = new DateTime(2011, 5, 6, 0, 0, UTC).getMillis(); + private static final long DATE_DAYS = TimeUnit.MILLISECONDS.toDays(DATE_MILLIS_UTC); + private static final String DATE_STRING = DateTimeFormat.forPattern("yyyy-MM-dd").withZoneUTC().print( + DATE_MILLIS_UTC); + private static final Date HIVE_DATE = Date.ofEpochMilli(DATE_MILLIS_UTC); + + private static final long TIMESTAMP = new DateTime(2011, 5, 6, 7, 8, 9, 123).getMillis(); + private static final String TIMESTAMP_STRING = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS").withZoneUTC().print( + TIMESTAMP); + private static final Timestamp HIVE_TIMESTAMP = Timestamp.ofEpochMilli(TIMESTAMP); + + private static final String VARCHAR_MAX_LENGTH_STRING; + + static { + char[] varcharMaxLengthCharArray = new char[HiveVarchar.MAX_VARCHAR_LENGTH]; + fill(varcharMaxLengthCharArray, 'a'); + VARCHAR_MAX_LENGTH_STRING = new String(varcharMaxLengthCharArray); + } + + private static final JavaHiveDecimalObjectInspector DECIMAL_INSPECTOR_PRECISION_2 = + new JavaHiveDecimalObjectInspector(new DecimalTypeInfo(2, 1)); + private static final JavaHiveDecimalObjectInspector DECIMAL_INSPECTOR_PRECISION_4 = + new JavaHiveDecimalObjectInspector(new DecimalTypeInfo(4, 2)); + private static final JavaHiveDecimalObjectInspector DECIMAL_INSPECTOR_PRECISION_8 = + new JavaHiveDecimalObjectInspector(new DecimalTypeInfo(8, 4)); + private static final JavaHiveDecimalObjectInspector DECIMAL_INSPECTOR_PRECISION_17 = + new JavaHiveDecimalObjectInspector(new DecimalTypeInfo(17, 8)); + private static final JavaHiveDecimalObjectInspector DECIMAL_INSPECTOR_PRECISION_18 = + new JavaHiveDecimalObjectInspector(new DecimalTypeInfo(18, 8)); + private static final JavaHiveDecimalObjectInspector DECIMAL_INSPECTOR_PRECISION_38 = + new JavaHiveDecimalObjectInspector(new DecimalTypeInfo(38, 16)); + + private static final DecimalType DECIMAL_TYPE_PRECISION_2 = DecimalType.createDecimalType(2, 1); + private static final DecimalType DECIMAL_TYPE_PRECISION_4 = DecimalType.createDecimalType(4, 2); + private static final DecimalType DECIMAL_TYPE_PRECISION_8 = DecimalType.createDecimalType(8, 4); + private static final DecimalType DECIMAL_TYPE_PRECISION_17 = DecimalType.createDecimalType(17, 8); + private static final DecimalType DECIMAL_TYPE_PRECISION_18 = DecimalType.createDecimalType(18, 8); + private static final 
DecimalType DECIMAL_TYPE_PRECISION_38 = DecimalType.createDecimalType(38, 16); + + private static final HiveDecimal WRITE_DECIMAL_PRECISION_2 = HiveDecimal.create(new BigDecimal("-1.2")); + private static final HiveDecimal WRITE_DECIMAL_PRECISION_4 = HiveDecimal.create(new BigDecimal("12.3")); + private static final HiveDecimal WRITE_DECIMAL_PRECISION_8 = HiveDecimal.create(new BigDecimal("-1234.5678")); + private static final HiveDecimal WRITE_DECIMAL_PRECISION_17 = HiveDecimal.create(new BigDecimal("123456789.1234")); + private static final HiveDecimal WRITE_DECIMAL_PRECISION_18 = HiveDecimal.create( + new BigDecimal("-1234567890.12345678")); + private static final HiveDecimal WRITE_DECIMAL_PRECISION_38 = HiveDecimal.create( + new BigDecimal("1234567890123456789012.12345678")); + + private static final BigDecimal EXPECTED_DECIMAL_PRECISION_2 = new BigDecimal("-1.2"); + private static final BigDecimal EXPECTED_DECIMAL_PRECISION_4 = new BigDecimal("12.30"); + private static final BigDecimal EXPECTED_DECIMAL_PRECISION_8 = new BigDecimal("-1234.5678"); + private static final BigDecimal EXPECTED_DECIMAL_PRECISION_17 = new BigDecimal("123456789.12340000"); + private static final BigDecimal EXPECTED_DECIMAL_PRECISION_18 = new BigDecimal("-1234567890.12345678"); + private static final BigDecimal EXPECTED_DECIMAL_PRECISION_38 = new BigDecimal( + "1234567890123456789012.1234567800000000"); + + private static final JavaHiveCharObjectInspector CHAR_INSPECTOR_LENGTH_10 = + new JavaHiveCharObjectInspector(getCharTypeInfo(10)); + + // TODO: support null values and determine if timestamp and binary are allowed as partition keys + public static final List TEST_COLUMNS = ImmutableList.builder() + .add(new TestColumn("p_empty_string", javaStringObjectInspector, "", Slices.EMPTY_SLICE, true)) + .add(new TestColumn("p_string", javaStringObjectInspector, "test", Slices.utf8Slice("test"), true)) + .add(new TestColumn("p_empty_varchar", javaHiveVarcharObjectInspector, "", Slices.EMPTY_SLICE, true)) + .add(new TestColumn("p_varchar", javaHiveVarcharObjectInspector, "test", Slices.utf8Slice("test"), true)) + .add(new TestColumn("p_varchar_max_length", javaHiveVarcharObjectInspector, VARCHAR_MAX_LENGTH_STRING, + Slices.utf8Slice(VARCHAR_MAX_LENGTH_STRING), true)) + .add(new TestColumn("p_char_10", CHAR_INSPECTOR_LENGTH_10, "test", Slices.utf8Slice("test"), true)) + .add(new TestColumn("p_tinyint", javaByteObjectInspector, "1", (byte) 1, true)) + .add(new TestColumn("p_smallint", javaShortObjectInspector, "2", (short) 2, true)) + .add(new TestColumn("p_int", javaIntObjectInspector, "3", 3, true)) + .add(new TestColumn("p_bigint", javaLongObjectInspector, "4", 4L, true)) + .add(new TestColumn("p_float", javaFloatObjectInspector, "5.1", 5.1f, true)) + .add(new TestColumn("p_double", javaDoubleObjectInspector, "6.2", 6.2, true)) + .add(new TestColumn("p_boolean", javaBooleanObjectInspector, "true", true, true)) + .add(new TestColumn("p_date", javaDateObjectInspector, DATE_STRING, DATE_DAYS, true)) + .add(new TestColumn("p_timestamp", javaTimestampObjectInspector, TIMESTAMP_STRING, TIMESTAMP, true)) + .add(new TestColumn("p_decimal_precision_2", DECIMAL_INSPECTOR_PRECISION_2, + WRITE_DECIMAL_PRECISION_2.toString(), EXPECTED_DECIMAL_PRECISION_2, true)) + .add(new TestColumn("p_decimal_precision_4", DECIMAL_INSPECTOR_PRECISION_4, + WRITE_DECIMAL_PRECISION_4.toString(), EXPECTED_DECIMAL_PRECISION_4, true)) + .add(new TestColumn("p_decimal_precision_8", DECIMAL_INSPECTOR_PRECISION_8, + WRITE_DECIMAL_PRECISION_8.toString(), 
EXPECTED_DECIMAL_PRECISION_8, true)) + .add(new TestColumn("p_decimal_precision_17", DECIMAL_INSPECTOR_PRECISION_17, + WRITE_DECIMAL_PRECISION_17.toString(), EXPECTED_DECIMAL_PRECISION_17, true)) + .add(new TestColumn("p_decimal_precision_18", DECIMAL_INSPECTOR_PRECISION_18, + WRITE_DECIMAL_PRECISION_18.toString(), EXPECTED_DECIMAL_PRECISION_18, true)) + .add(new TestColumn("p_decimal_precision_38", DECIMAL_INSPECTOR_PRECISION_38, + WRITE_DECIMAL_PRECISION_38.toString() + "BD", EXPECTED_DECIMAL_PRECISION_38, true)) +// .add(new TestColumn("p_binary", javaByteArrayObjectInspector, "test2", Slices.utf8Slice("test2"), true)) + .add(new TestColumn("p_null_string", javaStringObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_varchar", javaHiveVarcharObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, + true)) + .add(new TestColumn("p_null_char", CHAR_INSPECTOR_LENGTH_10, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_tinyint", javaByteObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_smallint", javaShortObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, + true)) + .add(new TestColumn("p_null_int", javaIntObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_bigint", javaLongObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_float", javaFloatObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_double", javaDoubleObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_boolean", javaBooleanObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, + true)) + .add(new TestColumn("p_null_date", javaDateObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_timestamp", javaTimestampObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, + true)) + .add(new TestColumn("p_null_decimal_precision_2", DECIMAL_INSPECTOR_PRECISION_2, + HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_decimal_precision_4", DECIMAL_INSPECTOR_PRECISION_4, + HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_decimal_precision_8", DECIMAL_INSPECTOR_PRECISION_8, + HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_decimal_precision_17", DECIMAL_INSPECTOR_PRECISION_17, + HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_decimal_precision_18", DECIMAL_INSPECTOR_PRECISION_18, + HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("p_null_decimal_precision_38", DECIMAL_INSPECTOR_PRECISION_38, + HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + +// .add(new TestColumn("p_null_binary", javaByteArrayObjectInspector, HIVE_DEFAULT_DYNAMIC_PARTITION, null, true)) + .add(new TestColumn("t_null_string", javaStringObjectInspector, null, null)) + .add(new TestColumn("t_null_varchar", javaHiveVarcharObjectInspector, null, null)) + .add(new TestColumn("t_null_char", CHAR_INSPECTOR_LENGTH_10, null, null)) + .add(new TestColumn("t_null_array_int", getStandardListObjectInspector(javaIntObjectInspector), null, null)) + .add(new TestColumn("t_null_decimal_precision_2", DECIMAL_INSPECTOR_PRECISION_2, null, null)) + .add(new TestColumn("t_null_decimal_precision_4", DECIMAL_INSPECTOR_PRECISION_4, null, null)) + .add(new TestColumn("t_null_decimal_precision_8", DECIMAL_INSPECTOR_PRECISION_8, null, null)) + .add(new 
TestColumn("t_null_decimal_precision_17", DECIMAL_INSPECTOR_PRECISION_17, null, null)) + .add(new TestColumn("t_null_decimal_precision_18", DECIMAL_INSPECTOR_PRECISION_18, null, null)) + .add(new TestColumn("t_null_decimal_precision_38", DECIMAL_INSPECTOR_PRECISION_38, null, null)) + .add(new TestColumn("t_empty_string", javaStringObjectInspector, "", Slices.EMPTY_SLICE)) + .add(new TestColumn("t_string", javaStringObjectInspector, "test", Slices.utf8Slice("test"))) + .add(new TestColumn("t_empty_varchar", javaHiveVarcharObjectInspector, + new HiveVarchar("", HiveVarchar.MAX_VARCHAR_LENGTH), Slices.EMPTY_SLICE)) + .add(new TestColumn("t_varchar", javaHiveVarcharObjectInspector, + new HiveVarchar("test", HiveVarchar.MAX_VARCHAR_LENGTH), Slices.utf8Slice("test"))) + .add(new TestColumn("t_varchar_max_length", javaHiveVarcharObjectInspector, + new HiveVarchar(VARCHAR_MAX_LENGTH_STRING, HiveVarchar.MAX_VARCHAR_LENGTH), + Slices.utf8Slice(VARCHAR_MAX_LENGTH_STRING))) + .add(new TestColumn("t_char", CHAR_INSPECTOR_LENGTH_10, "test", Slices.utf8Slice("test"), true)) + .add(new TestColumn("t_tinyint", javaByteObjectInspector, (byte) 1, (byte) 1)) + .add(new TestColumn("t_smallint", javaShortObjectInspector, (short) 2, (short) 2)) + .add(new TestColumn("t_int", javaIntObjectInspector, 3, 3)) + .add(new TestColumn("t_bigint", javaLongObjectInspector, 4L, 4L)) + .add(new TestColumn("t_float", javaFloatObjectInspector, 5.1f, 5.1f)) + .add(new TestColumn("t_double", javaDoubleObjectInspector, 6.2, 6.2)) + .add(new TestColumn("t_boolean_true", javaBooleanObjectInspector, true, true)) + .add(new TestColumn("t_boolean_false", javaBooleanObjectInspector, false, false)) + .add(new TestColumn("t_date", javaDateObjectInspector, HIVE_DATE, DATE_DAYS)) + .add(new TestColumn("t_timestamp", javaTimestampObjectInspector, HIVE_TIMESTAMP, TIMESTAMP)) + .add(new TestColumn("t_decimal_precision_2", DECIMAL_INSPECTOR_PRECISION_2, WRITE_DECIMAL_PRECISION_2, + EXPECTED_DECIMAL_PRECISION_2)) + .add(new TestColumn("t_decimal_precision_4", DECIMAL_INSPECTOR_PRECISION_4, WRITE_DECIMAL_PRECISION_4, + EXPECTED_DECIMAL_PRECISION_4)) + .add(new TestColumn("t_decimal_precision_8", DECIMAL_INSPECTOR_PRECISION_8, WRITE_DECIMAL_PRECISION_8, + EXPECTED_DECIMAL_PRECISION_8)) + .add(new TestColumn("t_decimal_precision_17", DECIMAL_INSPECTOR_PRECISION_17, WRITE_DECIMAL_PRECISION_17, + EXPECTED_DECIMAL_PRECISION_17)) + .add(new TestColumn("t_decimal_precision_18", DECIMAL_INSPECTOR_PRECISION_18, WRITE_DECIMAL_PRECISION_18, + EXPECTED_DECIMAL_PRECISION_18)) + .add(new TestColumn("t_decimal_precision_38", DECIMAL_INSPECTOR_PRECISION_38, WRITE_DECIMAL_PRECISION_38, + EXPECTED_DECIMAL_PRECISION_38)) + .add(new TestColumn("t_binary", javaByteArrayObjectInspector, Slices.utf8Slice("test2").getBytes(), + Slices.utf8Slice("test2"))) + .add(new TestColumn("t_map_string", + getStandardMapObjectInspector(javaStringObjectInspector, javaStringObjectInspector), + ImmutableMap.of("test", "test"), + mapBlockOf(createUnboundedVarcharType(), createUnboundedVarcharType(), "test", "test"))) + .add(new TestColumn("t_map_tinyint", + getStandardMapObjectInspector(javaByteObjectInspector, javaByteObjectInspector), + ImmutableMap.of((byte) 1, (byte) 1), + mapBlockOf(TINYINT, TINYINT, (byte) 1, (byte) 1))) + .add(new TestColumn("t_map_varchar", + getStandardMapObjectInspector(javaHiveVarcharObjectInspector, javaHiveVarcharObjectInspector), + ImmutableMap.of(new HiveVarchar("test", HiveVarchar.MAX_VARCHAR_LENGTH), + new HiveVarchar("test", 
HiveVarchar.MAX_VARCHAR_LENGTH)), + mapBlockOf(createVarcharType(HiveVarchar.MAX_VARCHAR_LENGTH), + createVarcharType(HiveVarchar.MAX_VARCHAR_LENGTH), "test", "test"))) + .add(new TestColumn("t_map_char", + getStandardMapObjectInspector(CHAR_INSPECTOR_LENGTH_10, CHAR_INSPECTOR_LENGTH_10), + ImmutableMap.of(new HiveChar("test", 10), new HiveChar("test", 10)), + mapBlockOf(createCharType(10), createCharType(10), "test", "test"))) + .add(new TestColumn("t_map_smallint", + getStandardMapObjectInspector(javaShortObjectInspector, javaShortObjectInspector), + ImmutableMap.of((short) 2, (short) 2), + mapBlockOf(SMALLINT, SMALLINT, (short) 2, (short) 2))) + .add(new TestColumn("t_map_null_key", + getStandardMapObjectInspector(javaLongObjectInspector, javaLongObjectInspector), + asMap(new Long[]{null, 2L}, new Long[]{0L, 3L}), + mapBlockOf(BIGINT, BIGINT, 2, 3))) + .add(new TestColumn("t_map_int", + getStandardMapObjectInspector(javaIntObjectInspector, javaIntObjectInspector), + ImmutableMap.of(3, 3), + mapBlockOf(INTEGER, INTEGER, 3, 3))) + .add(new TestColumn("t_map_bigint", + getStandardMapObjectInspector(javaLongObjectInspector, javaLongObjectInspector), + ImmutableMap.of(4L, 4L), + mapBlockOf(BIGINT, BIGINT, 4L, 4L))) + .add(new TestColumn("t_map_float", + getStandardMapObjectInspector(javaFloatObjectInspector, javaFloatObjectInspector), + ImmutableMap.of(5.0f, 5.0f), mapBlockOf(REAL, REAL, 5.0f, 5.0f))) + .add(new TestColumn("t_map_double", + getStandardMapObjectInspector(javaDoubleObjectInspector, javaDoubleObjectInspector), + ImmutableMap.of(6.0, 6.0), mapBlockOf(DOUBLE, DOUBLE, 6.0, 6.0))) + .add(new TestColumn("t_map_boolean", + getStandardMapObjectInspector(javaBooleanObjectInspector, javaBooleanObjectInspector), + ImmutableMap.of(true, true), + mapBlockOf(BOOLEAN, BOOLEAN, true, true))) + .add(new TestColumn("t_map_date", + getStandardMapObjectInspector(javaDateObjectInspector, javaDateObjectInspector), + ImmutableMap.of(HIVE_DATE, HIVE_DATE), + mapBlockOf(DateType.DATE, DateType.DATE, DATE_DAYS, DATE_DAYS))) + .add(new TestColumn("t_map_timestamp", + getStandardMapObjectInspector(javaTimestampObjectInspector, javaTimestampObjectInspector), + ImmutableMap.of(HIVE_TIMESTAMP, HIVE_TIMESTAMP), + mapBlockOf(TimestampType.TIMESTAMP, TimestampType.TIMESTAMP, TIMESTAMP, TIMESTAMP))) + .add(new TestColumn("t_map_decimal_precision_2", + getStandardMapObjectInspector(DECIMAL_INSPECTOR_PRECISION_2, DECIMAL_INSPECTOR_PRECISION_2), + ImmutableMap.of(WRITE_DECIMAL_PRECISION_2, WRITE_DECIMAL_PRECISION_2), + StructuralTestUtil.decimalMapBlockOf(DECIMAL_TYPE_PRECISION_2, EXPECTED_DECIMAL_PRECISION_2))) + .add(new TestColumn("t_map_decimal_precision_4", + getStandardMapObjectInspector(DECIMAL_INSPECTOR_PRECISION_4, DECIMAL_INSPECTOR_PRECISION_4), + ImmutableMap.of(WRITE_DECIMAL_PRECISION_4, WRITE_DECIMAL_PRECISION_4), + decimalMapBlockOf(DECIMAL_TYPE_PRECISION_4, EXPECTED_DECIMAL_PRECISION_4))) + .add(new TestColumn("t_map_decimal_precision_8", + getStandardMapObjectInspector(DECIMAL_INSPECTOR_PRECISION_8, DECIMAL_INSPECTOR_PRECISION_8), + ImmutableMap.of(WRITE_DECIMAL_PRECISION_8, WRITE_DECIMAL_PRECISION_8), + decimalMapBlockOf(DECIMAL_TYPE_PRECISION_8, EXPECTED_DECIMAL_PRECISION_8))) + .add(new TestColumn("t_map_decimal_precision_17", + getStandardMapObjectInspector(DECIMAL_INSPECTOR_PRECISION_17, DECIMAL_INSPECTOR_PRECISION_17), + ImmutableMap.of(WRITE_DECIMAL_PRECISION_17, WRITE_DECIMAL_PRECISION_17), + decimalMapBlockOf(DECIMAL_TYPE_PRECISION_17, EXPECTED_DECIMAL_PRECISION_17))) + .add(new 
TestColumn("t_map_decimal_precision_18", + getStandardMapObjectInspector(DECIMAL_INSPECTOR_PRECISION_18, DECIMAL_INSPECTOR_PRECISION_18), + ImmutableMap.of(WRITE_DECIMAL_PRECISION_18, WRITE_DECIMAL_PRECISION_18), + decimalMapBlockOf(DECIMAL_TYPE_PRECISION_18, EXPECTED_DECIMAL_PRECISION_18))) + .add(new TestColumn("t_map_decimal_precision_38", + getStandardMapObjectInspector(DECIMAL_INSPECTOR_PRECISION_38, DECIMAL_INSPECTOR_PRECISION_38), + ImmutableMap.of(WRITE_DECIMAL_PRECISION_38, WRITE_DECIMAL_PRECISION_38), + decimalMapBlockOf(DECIMAL_TYPE_PRECISION_38, EXPECTED_DECIMAL_PRECISION_38))) + .add(new TestColumn("t_array_empty", getStandardListObjectInspector(javaStringObjectInspector), + ImmutableList.of(), arrayBlockOf(createUnboundedVarcharType()))) + .add(new TestColumn("t_array_string", getStandardListObjectInspector(javaStringObjectInspector), + ImmutableList.of("test"), arrayBlockOf(createUnboundedVarcharType(), "test"))) + .add(new TestColumn("t_array_tinyint", getStandardListObjectInspector(javaByteObjectInspector), + ImmutableList.of((byte) 1), arrayBlockOf(TINYINT, (byte) 1))) + .add(new TestColumn("t_array_smallint", getStandardListObjectInspector(javaShortObjectInspector), + ImmutableList.of((short) 2), arrayBlockOf(SMALLINT, (short) 2))) + .add(new TestColumn("t_array_int", getStandardListObjectInspector(javaIntObjectInspector), + ImmutableList.of(3), arrayBlockOf(INTEGER, 3))) + .add(new TestColumn("t_array_bigint", getStandardListObjectInspector(javaLongObjectInspector), + ImmutableList.of(4L), arrayBlockOf(BIGINT, 4L))) + .add(new TestColumn("t_array_float", getStandardListObjectInspector(javaFloatObjectInspector), + ImmutableList.of(5.0f), arrayBlockOf(REAL, 5.0f))) + .add(new TestColumn("t_array_double", getStandardListObjectInspector(javaDoubleObjectInspector), + ImmutableList.of(6.0), StructuralTestUtil.arrayBlockOf(DOUBLE, 6.0))) + .add(new TestColumn("t_array_boolean", getStandardListObjectInspector(javaBooleanObjectInspector), + ImmutableList.of(true), arrayBlockOf(BOOLEAN, true))) + .add(new TestColumn( + "t_array_varchar", + getStandardListObjectInspector(javaHiveVarcharObjectInspector), + ImmutableList.of(new HiveVarchar("test", HiveVarchar.MAX_VARCHAR_LENGTH)), + arrayBlockOf(createVarcharType(HiveVarchar.MAX_VARCHAR_LENGTH), "test"))) + .add(new TestColumn( + "t_array_char", + getStandardListObjectInspector(CHAR_INSPECTOR_LENGTH_10), + ImmutableList.of(new HiveChar("test", 10)), + arrayBlockOf(createCharType(10), "test"))) + .add(new TestColumn("t_array_date", + getStandardListObjectInspector(javaDateObjectInspector), + ImmutableList.of(HIVE_DATE), + arrayBlockOf(DateType.DATE, DATE_DAYS))) + .add(new TestColumn("t_array_timestamp", + getStandardListObjectInspector(javaTimestampObjectInspector), + ImmutableList.of(HIVE_TIMESTAMP), + StructuralTestUtil.arrayBlockOf(TimestampType.TIMESTAMP, TIMESTAMP))) + .add(new TestColumn("t_array_decimal_precision_2", + getStandardListObjectInspector(DECIMAL_INSPECTOR_PRECISION_2), + ImmutableList.of(WRITE_DECIMAL_PRECISION_2), + decimalArrayBlockOf(DECIMAL_TYPE_PRECISION_2, EXPECTED_DECIMAL_PRECISION_2))) + .add(new TestColumn("t_array_decimal_precision_4", + getStandardListObjectInspector(DECIMAL_INSPECTOR_PRECISION_4), + ImmutableList.of(WRITE_DECIMAL_PRECISION_4), + decimalArrayBlockOf(DECIMAL_TYPE_PRECISION_4, EXPECTED_DECIMAL_PRECISION_4))) + .add(new TestColumn("t_array_decimal_precision_8", + getStandardListObjectInspector(DECIMAL_INSPECTOR_PRECISION_8), + ImmutableList.of(WRITE_DECIMAL_PRECISION_8), + 
decimalArrayBlockOf(DECIMAL_TYPE_PRECISION_8, EXPECTED_DECIMAL_PRECISION_8))) + .add(new TestColumn("t_array_decimal_precision_17", + getStandardListObjectInspector(DECIMAL_INSPECTOR_PRECISION_17), + ImmutableList.of(WRITE_DECIMAL_PRECISION_17), + decimalArrayBlockOf(DECIMAL_TYPE_PRECISION_17, EXPECTED_DECIMAL_PRECISION_17))) + .add(new TestColumn("t_array_decimal_precision_18", + getStandardListObjectInspector(DECIMAL_INSPECTOR_PRECISION_18), + ImmutableList.of(WRITE_DECIMAL_PRECISION_18), + decimalArrayBlockOf(DECIMAL_TYPE_PRECISION_18, EXPECTED_DECIMAL_PRECISION_18))) + .add(new TestColumn("t_array_decimal_precision_38", + getStandardListObjectInspector(DECIMAL_INSPECTOR_PRECISION_38), + ImmutableList.of(WRITE_DECIMAL_PRECISION_38), + decimalArrayBlockOf(DECIMAL_TYPE_PRECISION_38, EXPECTED_DECIMAL_PRECISION_38))) + .add(new TestColumn("t_struct_bigint", + getStandardStructObjectInspector(ImmutableList.of("s_bigint"), + ImmutableList.of(javaLongObjectInspector)), + new Long[]{1L}, + rowBlockOf(ImmutableList.of(BIGINT), 1))) + .add(new TestColumn("t_complex", + getStandardMapObjectInspector( + javaStringObjectInspector, + getStandardListObjectInspector( + getStandardStructObjectInspector( + ImmutableList.of("s_int"), + ImmutableList.of(javaIntObjectInspector)))), + ImmutableMap.of("test", ImmutableList.of(new Integer[]{1})), + mapBlockOf(createUnboundedVarcharType(), + new ArrayType(RowType.anonymous(ImmutableList.of(INTEGER))), + "test", arrayBlockOf(RowType.anonymous(ImmutableList.of(INTEGER)), + rowBlockOf(ImmutableList.of(INTEGER), 1L))))) + .add(new TestColumn("t_map_null_key_complex_value", + getStandardMapObjectInspector( + javaStringObjectInspector, + getStandardMapObjectInspector(javaLongObjectInspector, javaBooleanObjectInspector)), + asMap(new String[]{null, "k"}, + new ImmutableMap[]{ImmutableMap.of(15L, true), ImmutableMap.of(16L, false)}), + mapBlockOf(createUnboundedVarcharType(), mapType(BIGINT, BOOLEAN), "k", + mapBlockOf(BIGINT, BOOLEAN, 16L, false)))) + .add(new TestColumn("t_map_null_key_complex_key_value", + getStandardMapObjectInspector( + getStandardListObjectInspector(javaStringObjectInspector), + getStandardMapObjectInspector(javaLongObjectInspector, javaBooleanObjectInspector)), + asMap(new ImmutableList[]{null, ImmutableList.of("k", "ka")}, + new ImmutableMap[]{ImmutableMap.of(15L, true), ImmutableMap.of(16L, false)}), + mapBlockOf(new ArrayType(createUnboundedVarcharType()), mapType(BIGINT, BOOLEAN), + arrayBlockOf(createUnboundedVarcharType(), "k", "ka"), + mapBlockOf(BIGINT, BOOLEAN, 16L, false)))) + .add(new TestColumn("t_struct_nested", getStandardStructObjectInspector(ImmutableList.of("struct_field"), + ImmutableList.of(getStandardListObjectInspector(javaStringObjectInspector))), + ImmutableList.of(ImmutableList.of("1", "2", "3")), + rowBlockOf(ImmutableList.of(new ArrayType(createUnboundedVarcharType())), + arrayBlockOf(createUnboundedVarcharType(), "1", "2", "3")))) + .add(new TestColumn("t_struct_null", + getStandardStructObjectInspector(ImmutableList.of("struct_field_null", "struct_field_null2"), + ImmutableList.of(javaStringObjectInspector, javaStringObjectInspector)), + Arrays.asList(null, null), + rowBlockOf(ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType()), null, + null))) + .add(new TestColumn("t_struct_non_nulls_after_nulls", getStandardStructObjectInspector( + ImmutableList.of("struct_non_nulls_after_nulls1", "struct_non_nulls_after_nulls2"), + ImmutableList.of(javaIntObjectInspector, javaStringObjectInspector)), + 
Arrays.asList(null, "some string"), + rowBlockOf(ImmutableList.of(INTEGER, createUnboundedVarcharType()), null, "some string"))) + .add(new TestColumn("t_nested_struct_non_nulls_after_nulls", + getStandardStructObjectInspector( + ImmutableList.of("struct_field1", "struct_field2", "strict_field3"), + ImmutableList.of( + javaIntObjectInspector, + javaStringObjectInspector, + getStandardStructObjectInspector( + ImmutableList.of("nested_struct_field1", "nested_struct_field2"), + ImmutableList.of(javaIntObjectInspector, javaStringObjectInspector)))), + Arrays.asList(null, "some string", Arrays.asList(null, "nested_string2")), + rowBlockOf( + ImmutableList.of( + INTEGER, + createUnboundedVarcharType(), + RowType.anonymous(ImmutableList.of(INTEGER, createUnboundedVarcharType()))), + null, "some string", + rowBlockOf(ImmutableList.of(INTEGER, createUnboundedVarcharType()), null, + "nested_string2")))) + .add(new TestColumn("t_map_null_value", + getStandardMapObjectInspector(javaStringObjectInspector, javaStringObjectInspector), + asMap(new String[]{"k1", "k2", "k3"}, new String[]{"v1", null, "v3"}), + mapBlockOf(createUnboundedVarcharType(), createUnboundedVarcharType(), + new String[]{"k1", "k2", "k3"}, new String[]{"v1", null, "v3"}))) + .add(new TestColumn("t_array_string_starting_with_nulls", + getStandardListObjectInspector(javaStringObjectInspector), Arrays.asList(null, "test"), + arrayBlockOf(createUnboundedVarcharType(), null, "test"))) + .add(new TestColumn("t_array_string_with_nulls_in_between", + getStandardListObjectInspector(javaStringObjectInspector), Arrays.asList("test-1", null, "test-2"), + arrayBlockOf(createUnboundedVarcharType(), "test-1", null, "test-2"))) + .add(new TestColumn("t_array_string_ending_with_nulls", + getStandardListObjectInspector(javaStringObjectInspector), Arrays.asList("test", null), + arrayBlockOf(createUnboundedVarcharType(), "test", null))) + .add(new TestColumn("t_array_string_all_nulls", getStandardListObjectInspector(javaStringObjectInspector), + Arrays.asList(null, null, null), arrayBlockOf(createUnboundedVarcharType(), null, null, null))) + .build(); + + private static Map asMap(K[] keys, V[] values) + { + checkArgument(keys.length == values.length, "array lengths don't match"); + Map map = new HashMap<>(); + int len = keys.length; + for (int i = 0; i < len; i++) { + map.put(keys[i], values[i]); + } + return map; + } + + protected List getColumnHandles(List testColumns) + { + List columns = new ArrayList<>(); + int nextHiveColumnIndex = 0; + for (int i = 0; i < testColumns.size(); i++) { + TestColumn testColumn = testColumns.get(i); + int columnIndex = testColumn.isPartitionKey() ? -1 : nextHiveColumnIndex++; + + HiveType hiveType = HiveType.valueOf(testColumn.getObjectInspector().getTypeName()); + columns.add(new HiveColumnHandle(testColumn.getName(), hiveType, hiveType.getTypeSignature(), columnIndex, + testColumn.isPartitionKey() ? 
PARTITION_KEY : REGULAR, Optional.empty())); + } + return columns; + } + + public static FileSplit createTestFilePresto( + String filePath, + HiveStorageFormat storageFormat, + HiveCompressionCodec compressionCodec, + List testColumns, + ConnectorSession session, + int numRows, + HiveFileWriterFactory fileWriterFactory) + { + // filter out partition keys, which are not written to the file + testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey))); + + List types = testColumns.stream() + .map(TestColumn::getType) + .map(HiveType::valueOf) + .map(type -> type.getType(TYPE_MANAGER)) + .collect(toList()); + + PageBuilder pageBuilder = new PageBuilder(types); + + for (int rowNumber = 0; rowNumber < numRows; rowNumber++) { + pageBuilder.declarePosition(); + for (int columnNumber = 0; columnNumber < testColumns.size(); columnNumber++) { + serializeObject( + types.get(columnNumber), + pageBuilder.getBlockBuilder(columnNumber), + testColumns.get(columnNumber).getWriteValue(), + testColumns.get(columnNumber).getObjectInspector(), + false); + } + } + Page page = pageBuilder.build(); + + JobConf jobConf = new JobConf(); + configureCompression(jobConf, compressionCodec); + + Properties tableProperties = new Properties(); + tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName))); + tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType))); + + Optional fileWriter = fileWriterFactory.createFileWriter( + new Path(filePath), + testColumns.stream() + .map(TestColumn::getName) + .collect(toList()), + StorageFormat.fromHiveStorageFormat(storageFormat), + tableProperties, + jobConf, + session, Optional.empty(), Optional.empty()); + + HiveFileWriter hiveFileWriter = fileWriter.orElseThrow(() -> new IllegalArgumentException("fileWriterFactory")); + hiveFileWriter.appendRows(page); + hiveFileWriter.commit(); + + return new FileSplit(new Path(filePath), 0, new File(filePath).length(), new String[0]); + } + + public static FileSplit createTestFileHive( + String filePath, + HiveStorageFormat storageFormat, + HiveCompressionCodec compressionCodec, + List testColumns, + int numRows) + throws Exception + { + HiveOutputFormat outputFormat = newInstance(storageFormat.getOutputFormat(), HiveOutputFormat.class); + Serializer serializer = newInstance(storageFormat.getSerDe(), Serializer.class); + + // filter out partition keys, which are not written to the file + testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey))); + + Properties tableProperties = new Properties(); + tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName))); + tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType))); + serializer.initialize(new Configuration(), tableProperties); + + JobConf jobConf = new JobConf(); + configureCompression(jobConf, compressionCodec); + + RecordWriter recordWriter = outputFormat.getHiveRecordWriter( + jobConf, + new Path(filePath), + Text.class, + compressionCodec != HiveCompressionCodec.NONE, + tableProperties, + () -> {}); + + try { + serializer.initialize(new Configuration(), tableProperties); + + SettableStructObjectInspector objectInspector = getStandardStructObjectInspector( + ImmutableList.copyOf(transform(testColumns, TestColumn::getName)), + ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector))); + + Object row = 
objectInspector.create(); + + List fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs()); + + for (int rowNumber = 0; rowNumber < numRows; rowNumber++) { + for (int i = 0; i < testColumns.size(); i++) { + Object writeValue = testColumns.get(i).getWriteValue(); + if (writeValue instanceof Slice) { + writeValue = ((Slice) writeValue).getBytes(); + } + objectInspector.setStructFieldData(row, fields.get(i), writeValue); + } + + Writable record = serializer.serialize(row, objectInspector); + recordWriter.write(record); + } + } + finally { + recordWriter.close(false); + } + + // todo to test with compression, the file must be renamed with the compression extension + Path path = new Path(filePath); + path.getFileSystem(new Configuration()).setVerifyChecksum(true); + File file = new File(filePath); + return new FileSplit(path, 0, file.length(), new String[0]); + } + + private static T newInstance(String className, Class superType) + throws ReflectiveOperationException + { + return HiveStorageFormat.class.getClassLoader().loadClass(className).asSubclass( + superType).getConstructor().newInstance(); + } + + public static Object getFieldFromCursor(RecordCursor cursor, Type type, int field) + { + if (cursor.isNull(field)) { + return null; + } + if (BOOLEAN.equals(type)) { + return cursor.getBoolean(field); + } + if (TINYINT.equals(type)) { + return cursor.getLong(field); + } + if (SMALLINT.equals(type)) { + return cursor.getLong(field); + } + if (INTEGER.equals(type)) { + return (int) cursor.getLong(field); + } + if (BIGINT.equals(type)) { + return cursor.getLong(field); + } + if (REAL.equals(type)) { + return intBitsToFloat((int) cursor.getLong(field)); + } + if (DOUBLE.equals(type)) { + return cursor.getDouble(field); + } + if (isVarcharType(type) || isCharType(type) || VARBINARY.equals(type)) { + return cursor.getSlice(field); + } + if (DateType.DATE.equals(type)) { + return cursor.getLong(field); + } + if (TimestampType.TIMESTAMP.equals(type)) { + return cursor.getLong(field); + } + if (isStructuralType(type)) { + return cursor.getObject(field); + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + if (decimalType.isShort()) { + return BigInteger.valueOf(cursor.getLong(field)); + } + else { + return Decimals.decodeUnscaledValue(cursor.getSlice(field)); + } + } + throw new RuntimeException("unknown type"); + } + + protected void checkCursor(RecordCursor cursor, List testColumns, int rowCount) + { + List types = testColumns.stream() + .map(column -> column.getObjectInspector().getTypeName()) + .map(type -> HiveType.valueOf(type).getType(TYPE_MANAGER)) + .collect(toImmutableList()); + + Map distinctFromOperators = types.stream().distinct() + .collect(toImmutableMap(identity(), HiveTestUtils::distinctFromOperator)); + + for (int row = 0; row < rowCount; row++) { + assertTrue(cursor.advanceNextPosition()); + for (int i = 0, testColumnsSize = testColumns.size(); i < testColumnsSize; i++) { + TestColumn testColumn = testColumns.get(i); + + Type type = types.get(i); + Object fieldFromCursor = getFieldFromCursor(cursor, type, i); + if (fieldFromCursor == null) { + assertEquals(null, testColumn.getExpectedValue(), + "Expected null for column " + testColumn.getName()); + } + else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + fieldFromCursor = new BigDecimal((BigInteger) fieldFromCursor, decimalType.getScale()); + assertEquals(fieldFromCursor, testColumn.getExpectedValue(), + "Wrong value for column " + 
testColumn.getName()); + } + else if (testColumn.getObjectInspector().getTypeName().equals("float")) { + assertEquals((float) fieldFromCursor, (float) testColumn.getExpectedValue(), (float) EPSILON); + } + else if (testColumn.getObjectInspector().getTypeName().equals("double")) { + assertEquals((double) fieldFromCursor, (double) testColumn.getExpectedValue(), EPSILON); + } + else if (testColumn.getObjectInspector().getTypeName().equals("tinyint")) { + assertEquals(((Number) fieldFromCursor).byteValue(), testColumn.getExpectedValue()); + } + else if (testColumn.getObjectInspector().getTypeName().equals("smallint")) { + assertEquals(((Number) fieldFromCursor).shortValue(), testColumn.getExpectedValue()); + } + else if (testColumn.getObjectInspector().getTypeName().equals("int")) { + assertEquals(((Number) fieldFromCursor).intValue(), testColumn.getExpectedValue()); + } + else if (testColumn.getObjectInspector().getCategory() == Category.PRIMITIVE) { + assertEquals(fieldFromCursor, testColumn.getExpectedValue(), + "Wrong value for column " + testColumn.getName()); + } + else { + Block expected = (Block) testColumn.getExpectedValue(); + Block actual = (Block) fieldFromCursor; + boolean distinct = isDistinctFrom(distinctFromOperators.get(type), expected, actual); + assertFalse(distinct, "Wrong value for column: " + testColumn.getName()); + } + } + } + assertFalse(cursor.advanceNextPosition()); + } + + protected void checkPageSource(ConnectorPageSource pageSource, List testColumns, List types, + int rowCount) + throws IOException + { + try { + MaterializedResult result = materializeSourceDataStream(SESSION, pageSource, types); + assertEquals(result.getMaterializedRows().size(), rowCount); + for (MaterializedRow row : result) { + for (int i = 0, testColumnsSize = testColumns.size(); i < testColumnsSize; i++) { + TestColumn testColumn = testColumns.get(i); + Type type = types.get(i); + + Object actualValue = row.getField(i); + Object expectedValue = testColumn.getExpectedValue(); + + if (expectedValue instanceof Slice) { + expectedValue = ((Slice) expectedValue).toStringUtf8(); + } + + if (actualValue == null || expectedValue == null) { + assertEquals(actualValue, expectedValue, "Wrong value for column " + testColumn.getName()); + } + else if (testColumn.getObjectInspector().getTypeName().equals("float")) { + assertEquals((float) actualValue, (float) expectedValue, EPSILON, + "Wrong value for column " + testColumn.getName()); + } + else if (testColumn.getObjectInspector().getTypeName().equals("double")) { + assertEquals((double) actualValue, (double) expectedValue, EPSILON, + "Wrong value for column " + testColumn.getName()); + } + else if (testColumn.getObjectInspector().getTypeName().equals("date")) { + SqlDate expectedDate = new SqlDate(((Long) expectedValue).intValue()); + assertEquals(actualValue, expectedDate, "Wrong value for column " + testColumn.getName()); + } + else if (testColumn.getObjectInspector().getTypeName().equals("int") || + testColumn.getObjectInspector().getTypeName().equals("smallint") || + testColumn.getObjectInspector().getTypeName().equals("tinyint")) { + assertEquals(actualValue, expectedValue); + } + else if (testColumn.getObjectInspector().getTypeName().equals("timestamp")) { + SqlTimestamp expectedTimestamp = sqlTimestampOf((Long) expectedValue); + assertEquals(actualValue, expectedTimestamp, "Wrong value for column " + testColumn.getName()); + } + else if (testColumn.getObjectInspector().getTypeName().startsWith("char")) { + assertEquals(actualValue, padEnd((String) 
expectedValue, ((CharType) type).getLength(), ' '), + "Wrong value for column " + testColumn.getName()); + } + else if (testColumn.getObjectInspector().getCategory() == Category.PRIMITIVE) { + if (expectedValue instanceof Slice) { + expectedValue = ((Slice) expectedValue).toStringUtf8(); + } + + if (actualValue instanceof Slice) { + actualValue = ((Slice) actualValue).toStringUtf8(); + } + if (actualValue instanceof SqlVarbinary) { + actualValue = new String(((SqlVarbinary) actualValue).getBytes(), UTF_8); + } + + if (actualValue instanceof SqlDecimal) { + actualValue = new BigDecimal(actualValue.toString()); + } + assertEquals(actualValue, expectedValue, "Wrong value for column " + testColumn.getName()); + } + else { + BlockBuilder builder = type.createBlockBuilder(null, 1); + type.writeObject(builder, expectedValue); + expectedValue = type.getObjectValue(SESSION, builder.build(), 0); + assertEquals(actualValue, expectedValue, "Wrong value for column " + testColumn.getName()); + } + } + } + } + finally { + pageSource.close(); + } + } + + public static final class TestColumn + { + private final String name; + private final ObjectInspector objectInspector; + private final Object writeValue; + private final Object expectedValue; + private final boolean partitionKey; + + public TestColumn(String name, ObjectInspector objectInspector, Object writeValue, Object expectedValue) + { + this(name, objectInspector, writeValue, expectedValue, false); + } + + public TestColumn(String name, ObjectInspector objectInspector, Object writeValue, Object expectedValue, + boolean partitionKey) + { + this.name = requireNonNull(name, "name is null"); + this.objectInspector = requireNonNull(objectInspector, "objectInspector is null"); + this.writeValue = writeValue; + this.expectedValue = expectedValue; + this.partitionKey = partitionKey; + } + + public String getName() + { + return name; + } + + public String getType() + { + return objectInspector.getTypeName(); + } + + public ObjectInspector getObjectInspector() + { + return objectInspector; + } + + public Object getWriteValue() + { + return writeValue; + } + + public Object getExpectedValue() + { + return expectedValue; + } + + public boolean isPartitionKey() + { + return partitionKey; + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder("TestColumn{"); + sb.append("name='").append(name).append('\''); + sb.append(", objectInspector=").append(objectInspector); + sb.append(", writeValue=").append(writeValue); + sb.append(", expectedValue=").append(expectedValue); + sb.append(", partitionKey=").append(partitionKey); + sb.append('}'); + return sb.toString(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileSystem.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileSystem.java new file mode 100644 index 00000000..555aa993 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveFileSystem.java @@ -0,0 +1,576 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Function; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.net.HostAndPort; +import io.airlift.concurrent.BoundedExecutor; +import io.airlift.json.JsonCodec; +import io.airlift.slice.Slice; +import io.airlift.stats.CounterStat; +import io.prestosql.GroupByHashPageIndexerFactory; +import io.prestosql.plugin.hive.AbstractTestHive.HiveTransaction; +import io.prestosql.plugin.hive.AbstractTestHive.Transaction; +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.CachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.thrift.BridgingHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.MetastoreLocator; +import io.prestosql.plugin.hive.metastore.thrift.TestingMetastoreLocator; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastoreConfig; +import io.prestosql.plugin.hive.security.SqlStandardAccessControlMetadata; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorOutputTableHandle; +import io.prestosql.spi.connector.ConnectorPageSink; +import io.prestosql.spi.connector.ConnectorPageSinkProvider; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorSplitManager; +import io.prestosql.spi.connector.ConnectorSplitSource; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.ConnectorIdentity; +import io.prestosql.sql.gen.JoinCompiler; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.testing.MaterializedRow; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.testing.TestingNodeManager; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; 
+import java.util.concurrent.Executor; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.util.concurrent.MoreExecutors.newDirectExecutorService; +import static io.airlift.concurrent.MoreFutures.getFutureValue; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.testing.Assertions.assertEqualsIgnoreOrder; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.AbstractTestHive.createTableProperties; +import static io.prestosql.plugin.hive.AbstractTestHive.filterNonHiddenColumnHandles; +import static io.prestosql.plugin.hive.AbstractTestHive.filterNonHiddenColumnMetadata; +import static io.prestosql.plugin.hive.AbstractTestHive.getAllSplits; +import static io.prestosql.plugin.hive.HiveTestUtils.PAGE_SORTER; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveDataStreamFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProvider; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveSelectiveFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultOrcFileWriterFactory; +import static io.prestosql.plugin.hive.HiveTestUtils.getNoOpIndexCache; +import static io.prestosql.plugin.hive.HiveTestUtils.getTypes; +import static io.prestosql.spi.connector.ConnectorSplitManager.SplitSchedulingStrategy.UNGROUPED_SCHEDULING; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.testing.MaterializedResult.materializeSourceDataStream; +import static java.util.Locale.ENGLISH; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static java.util.concurrent.Executors.newScheduledThreadPool; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public abstract class AbstractTestHiveFileSystem +{ + private static final HdfsContext TESTING_CONTEXT = new HdfsContext(new ConnectorIdentity("test", Optional.empty(), Optional.empty())); + + protected String database; + protected SchemaTableName table; + protected SchemaTableName temporaryCreateTable; + + protected HdfsEnvironment hdfsEnvironment; + protected LocationService locationService; + protected TestingHiveMetastore metastoreClient; + protected HiveMetadataFactory metadataFactory; + protected HiveTransactionManager transactionManager; + protected ConnectorSplitManager splitManager; + protected ConnectorPageSinkProvider pageSinkProvider; + protected ConnectorPageSourceProvider pageSourceProvider; + + private ExecutorService executor; + private HiveConfig config; + private ScheduledExecutorService heartbeatService; + private ScheduledExecutorService vacuumExecutorService; + + @BeforeClass + public void setUp() + { + executor = newCachedThreadPool(daemonThreadsNamed("hive-%s")); + heartbeatService = newScheduledThreadPool(1); + vacuumExecutorService = newScheduledThreadPool(1); + } + + @AfterClass(alwaysRun = true) + public void tearDown() + { + if (executor != null) { + executor.shutdownNow(); + executor = null; + } + if (heartbeatService != null) { + 
heartbeatService.shutdownNow(); + heartbeatService = null; + } + if (vacuumExecutorService != null) { + vacuumExecutorService.shutdownNow(); + vacuumExecutorService = null; + } + } + + protected abstract Path getBasePath(); + + protected void setup(String host, int port, String databaseName, Function hdfsConfigurationProvider, boolean s3SelectPushdownEnabled) + { + database = databaseName; + table = new SchemaTableName(database, "presto_test_external_fs"); + + String random = UUID.randomUUID().toString().toLowerCase(ENGLISH).replace("-", ""); + temporaryCreateTable = new SchemaTableName(database, "tmp_presto_test_create_" + random); + + config = new HiveConfig().setS3SelectPushdownEnabled(s3SelectPushdownEnabled); + + String proxy = System.getProperty("hive.metastore.thrift.client.socks-proxy"); + if (proxy != null) { + config.setMetastoreSocksProxy(HostAndPort.fromString(proxy)); + } + + MetastoreLocator metastoreLocator = new TestingMetastoreLocator(config, host, port); + ExecutorService executor = newCachedThreadPool(daemonThreadsNamed("hive-%s")); + ExecutorService executorRefresh = newCachedThreadPool(daemonThreadsNamed("hive-refresh-%s")); + HivePartitionManager hivePartitionManager = new HivePartitionManager(TYPE_MANAGER, config); + + HdfsConfiguration hdfsConfiguration = hdfsConfigurationProvider.apply(config); + + hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, config, new NoHdfsAuthentication()); + metastoreClient = new TestingHiveMetastore( + new BridgingHiveMetastore(new ThriftHiveMetastore(metastoreLocator, new ThriftHiveMetastoreConfig())), + executor, + executorRefresh, + config, + getBasePath(), + hdfsEnvironment); + locationService = new HiveLocationService(hdfsEnvironment); + JsonCodec partitionUpdateCodec = JsonCodec.jsonCodec(PartitionUpdate.class); + metadataFactory = new HiveMetadataFactory( + config, + metastoreClient, + hdfsEnvironment, + hivePartitionManager, + newDirectExecutorService(), + vacuumExecutorService, + heartbeatService, + vacuumExecutorService, + TYPE_MANAGER, + locationService, + partitionUpdateCodec, + new HiveTypeTranslator(), + new NodeVersion("test_version"), + SqlStandardAccessControlMetadata::new); + transactionManager = new HiveTransactionManager(); + splitManager = new HiveSplitManager( + transactionHandle -> ((HiveMetadata) transactionManager.get(transactionHandle)).getMetastore(), + hivePartitionManager, + new NamenodeStats(), + hdfsEnvironment, + new CachingDirectoryLister(new HiveConfig()), + new BoundedExecutor(executor, config.getMaxSplitIteratorThreads()), + new HiveCoercionPolicy(TYPE_MANAGER), + new CounterStat(), + config.getMaxOutstandingSplits(), + config.getMaxOutstandingSplitsSize(), + config.getMinPartitionBatchSize(), + config.getMaxPartitionBatchSize(), + config.getMaxInitialSplits(), + config.getSplitLoaderConcurrency(), + config.getMaxSplitsPerSecond(), + config.getRecursiveDirWalkerEnabled(), null, config); + pageSinkProvider = new HivePageSinkProvider( + getDefaultHiveFileWriterFactories(config), + hdfsEnvironment, + PAGE_SORTER, + metastoreClient, + new GroupByHashPageIndexerFactory(new JoinCompiler(createTestMetadataManager())), + TYPE_MANAGER, + config, + locationService, + partitionUpdateCodec, + new TestingNodeManager("fake-environment"), + new HiveEventClient(), + new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig()), + new HiveWriterStats(), + getDefaultOrcFileWriterFactory(config)); + pageSourceProvider = new HivePageSourceProvider(config, hdfsEnvironment, 
getDefaultHiveRecordCursorProvider(config), getDefaultHiveDataStreamFactories(config), TYPE_MANAGER, getNoOpIndexCache(), getDefaultHiveSelectiveFactories(config)); + } + + protected ConnectorSession newSession() + { + return new TestingConnectorSession(new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + } + + protected Transaction newTransaction() + { + return new HiveTransaction(transactionManager, metadataFactory.get()); + } + + @Test + public void testGetRecords() + throws Exception + { + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + + ConnectorTableHandle table = getTableHandle(metadata, this.table); + List columnHandles = ImmutableList.copyOf(metadata.getColumnHandles(session, table).values()); + Map columnIndex = indexColumns(columnHandles); + + ConnectorSplitSource splitSource = splitManager.getSplits(transaction.getTransactionHandle(), session, table, UNGROUPED_SCHEDULING); + + List splits = getAllSplits(splitSource); + assertEquals(splits.size(), 1); + + long sum = 0; + for (ConnectorSplit split : splits) { + try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, table, columnHandles)) { + MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles)); + + for (MaterializedRow row : result) { + sum += (Long) row.getField(columnIndex.get("t_bigint")); + } + } + } + // The test table is made up of multiple S3 objects with same data and different compression codec + // formats: uncompressed | .gz | .lz4 | .bz2 + assertEquals(sum, 78300 * 4); + } + } + + @Test + public void testGetFileStatus() + throws Exception + { + Path basePath = getBasePath(); + Path tablePath = new Path(basePath, "presto_test_external_fs"); + Path filePath = new Path(tablePath, "test1.csv"); + FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath); + + assertTrue(fs.getFileStatus(basePath).isDirectory()); + assertTrue(fs.getFileStatus(tablePath).isDirectory()); + assertFalse(fs.getFileStatus(filePath).isDirectory()); + assertFalse(fs.exists(new Path(basePath, "foo"))); + } + + @Test + public void testRename() + throws Exception + { + Path basePath = new Path(getBasePath(), UUID.randomUUID().toString()); + FileSystem fs = hdfsEnvironment.getFileSystem(TESTING_CONTEXT, basePath); + assertFalse(fs.exists(basePath)); + + // create file foo.txt + Path path = new Path(basePath, "foo.txt"); + assertTrue(fs.createNewFile(path)); + assertTrue(fs.exists(path)); + + // rename foo.txt to bar.txt when bar does not exist + Path newPath = new Path(basePath, "bar.txt"); + assertFalse(fs.exists(newPath)); + assertTrue(fs.rename(path, newPath)); + assertFalse(fs.exists(path)); + assertTrue(fs.exists(newPath)); + + // rename foo.txt to foo.txt when foo.txt does not exist + assertFalse(fs.rename(path, path)); + + // create file foo.txt and rename to existing bar.txt + assertTrue(fs.createNewFile(path)); + assertFalse(fs.rename(path, newPath)); + + // rename foo.txt to foo.txt when foo.txt exists + assertFalse(fs.rename(path, path)); + + // delete foo.txt + assertTrue(fs.delete(path, false)); + assertFalse(fs.exists(path)); + + // create directory source with file + Path source = new Path(basePath, "source"); + assertTrue(fs.createNewFile(new Path(source, "test.txt"))); + + // rename source to non-existing target + Path target = new 
Path(basePath, "target"); + assertFalse(fs.exists(target)); + assertTrue(fs.rename(source, target)); + assertFalse(fs.exists(source)); + assertTrue(fs.exists(target)); + + // create directory source with file + assertTrue(fs.createNewFile(new Path(source, "test.txt"))); + + // rename source to existing target + assertTrue(fs.rename(source, target)); + assertFalse(fs.exists(source)); + target = new Path(target, "source"); + assertTrue(fs.exists(target)); + assertTrue(fs.exists(new Path(target, "test.txt"))); + + // delete target + target = new Path(basePath, "target"); + assertTrue(fs.exists(target)); + assertTrue(fs.delete(target, true)); + assertFalse(fs.exists(target)); + + // cleanup + fs.delete(basePath, true); + } + + @Test + public void testTableCreation() + throws Exception + { + for (HiveStorageFormat storageFormat : HiveStorageFormat.values()) { + if (storageFormat == HiveStorageFormat.CSV) { + // CSV supports only unbounded VARCHAR type + continue; + } + createTable(temporaryCreateTable, storageFormat); + dropTable(temporaryCreateTable); + } + } + + private void createTable(SchemaTableName tableName, HiveStorageFormat storageFormat) + throws Exception + { + List columns = ImmutableList.builder() + .add(new ColumnMetadata("id", BIGINT)) + .build(); + + MaterializedResult data = MaterializedResult.resultBuilder(newSession(), BIGINT) + .row(1L) + .row(3L) + .row(2L) + .build(); + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + + // begin creating the table + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, createTableProperties(storageFormat)); + ConnectorOutputTableHandle outputHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty()); + + // write the records + ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, outputHandle); + sink.appendPage(data.toPage()); + Collection fragments = getFutureValue(sink.finish()); + + // commit the table + metadata.finishCreateTable(session, outputHandle, fragments, ImmutableList.of()); + + transaction.commit(); + + // Hack to work around the metastore not being configured for S3 or other FS. + // The metastore tries to validate the location when creating the + // table, which fails without explicit configuration for file system. + // We work around that by using a dummy location when creating the + // table and update it here to the correct location. 
+ metastoreClient.updateTableLocation( + database, + tableName.getTableName(), + locationService.getTableWriteInfo(((HiveOutputTableHandle) outputHandle).getLocationHandle(), false).getTargetPath().toString()); + } + + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + ConnectorSession session = newSession(); + + // load the new table + ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName); + List columnHandles = filterNonHiddenColumnHandles(metadata.getColumnHandles(session, tableHandle).values()); + + // verify the metadata + ConnectorTableMetadata tableMetadata = metadata.getTableMetadata(session, getTableHandle(metadata, tableName)); + assertEquals(filterNonHiddenColumnMetadata(tableMetadata.getColumns()), columns); + + // verify the data + ConnectorSplitSource splitSource = splitManager.getSplits(transaction.getTransactionHandle(), session, tableHandle, UNGROUPED_SCHEDULING); + ConnectorSplit split = getOnlyElement(getAllSplits(splitSource)); + + try (ConnectorPageSource pageSource = pageSourceProvider.createPageSource(transaction.getTransactionHandle(), session, split, tableHandle, columnHandles)) { + MaterializedResult result = materializeSourceDataStream(session, pageSource, getTypes(columnHandles)); + assertEqualsIgnoreOrder(result.getMaterializedRows(), data.getMaterializedRows()); + } + } + } + + private void dropTable(SchemaTableName table) + { + try (Transaction transaction = newTransaction()) { + transaction.getMetastore(table.getSchemaName()).dropTable(newSession(), table.getSchemaName(), table.getTableName()); + transaction.commit(); + } + } + + private ConnectorTableHandle getTableHandle(ConnectorMetadata metadata, SchemaTableName tableName) + { + ConnectorTableHandle handle = metadata.getTableHandle(newSession(), tableName); + checkArgument(handle != null, "table not found: %s", tableName); + return handle; + } + + private static ImmutableMap indexColumns(List columnHandles) + { + ImmutableMap.Builder index = ImmutableMap.builder(); + int i = 0; + for (ColumnHandle columnHandle : columnHandles) { + HiveColumnHandle hiveColumnHandle = (HiveColumnHandle) columnHandle; + index.put(hiveColumnHandle.getName(), i); + i++; + } + return index.build(); + } + + private static class TestingHiveMetastore + extends CachingHiveMetastore + { + private final Path basePath; + private final HdfsEnvironment hdfsEnvironment; + + public TestingHiveMetastore(HiveMetastore delegate, ExecutorService executor, Executor executorRefresh, HiveConfig hiveConfig, Path basePath, HdfsEnvironment hdfsEnvironment) + { + super(delegate, executor, executorRefresh, hiveConfig, new TestingNodeManager("fake-environment")); + this.basePath = basePath; + this.hdfsEnvironment = hdfsEnvironment; + } + + @Override + public Optional getDatabase(String databaseName) + { + return super.getDatabase(databaseName) + .map(database -> Database.builder(database) + .setLocation(Optional.of(basePath.toString())) + .build()); + } + + @Override + public void createTable(HiveIdentity identity, Table table, PrincipalPrivileges privileges) + { + // hack to work around the metastore not being configured for S3 or other FS + Table.Builder tableBuilder = Table.builder(table); + tableBuilder.getStorageBuilder().setLocation("/"); + super.createTable(identity, tableBuilder.build(), privileges); + } + + @Override + public void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + try { + Optional
table = getTable(identity, databaseName, tableName); + if (!table.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + + // hack to work around the metastore not being configured for S3 or other FS + List locations = listAllDataPaths(identity, databaseName, tableName); + + Table.Builder tableBuilder = Table.builder(table.get()); + tableBuilder.getStorageBuilder().setLocation("/"); + + // drop table + replaceTable(identity, databaseName, tableName, tableBuilder.build(), new PrincipalPrivileges(ImmutableMultimap.of(), ImmutableMultimap.of())); + delegate.dropTable(identity, databaseName, tableName, false); + + // drop data + if (deleteData) { + for (String location : locations) { + Path path = new Path(location); + hdfsEnvironment.getFileSystem(TESTING_CONTEXT, path).delete(path, true); + } + } + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + finally { + invalidateTable(databaseName, tableName); + } + } + + public void updateTableLocation(String databaseName, String tableName, String location) + { + HiveIdentity identity = new HiveIdentity(TESTING_CONTEXT.getIdentity()); + Optional
table = getTable(identity, databaseName, tableName); + if (!table.isPresent()) { + throw new TableNotFoundException(new SchemaTableName(databaseName, tableName)); + } + + Table.Builder tableBuilder = Table.builder(table.get()); + tableBuilder.getStorageBuilder().setLocation(location); + + // NOTE: this clears the permissions + replaceTable(identity, databaseName, tableName, tableBuilder.build(), new PrincipalPrivileges(ImmutableMultimap.of(), ImmutableMultimap.of())); + } + + private List listAllDataPaths(HiveIdentity identity, String schemaName, String tableName) + { + ImmutableList.Builder locations = ImmutableList.builder(); + Table table = getTable(identity, schemaName, tableName).get(); + if (table.getStorage().getLocation() != null) { + // For partitioned table, there should be nothing directly under this directory. + // But including this location in the set makes the directory content assert more + // extensive, which is desirable. + locations.add(table.getStorage().getLocation()); + } + + Optional> partitionNames = getPartitionNames(identity, schemaName, tableName); + if (partitionNames.isPresent()) { + getPartitionsByNames(identity, schemaName, tableName, partitionNames.get()).values().stream() + .map(Optional::get) + .map(partition -> partition.getStorage().getLocation()) + .filter(location -> !location.startsWith(table.getStorage().getLocation())) + .forEach(locations::add); + } + + return locations.build(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveLocal.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveLocal.java new file mode 100644 index 00000000..114e235e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/AbstractTestHiveLocal.java @@ -0,0 +1,123 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.security.PrincipalType; +import org.testng.SkipException; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; + +import java.io.File; +import java.io.IOException; + +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.prestosql.testing.TestingConnectorSession.SESSION; +import static java.nio.file.Files.createTempDirectory; +import static java.util.Objects.requireNonNull; + +public abstract class AbstractTestHiveLocal + extends AbstractTestHive +{ + private static final String DEFAULT_TEST_DB_NAME = "test"; + private static final HiveIdentity HIVE_IDENTITY = new HiveIdentity(SESSION); + + private File tempDir; + private String testDbName; + + protected AbstractTestHiveLocal() + { + this(DEFAULT_TEST_DB_NAME); + } + + protected AbstractTestHiveLocal(String testDbName) + { + this.testDbName = requireNonNull(testDbName, "testDbName is null"); + } + + protected abstract HiveMetastore createMetastore(File tempDir); + + @BeforeClass + public void initialize() throws IOException + { + tempDir = createTempDirectory(getClass().getName()).toFile(); + + HiveMetastore metastore = createMetastore(tempDir); + + metastore.createDatabase(HIVE_IDENTITY, + Database.builder() + .setDatabaseName(testDbName) + .setOwnerName("public") + .setOwnerType(PrincipalType.ROLE) + .build()); + + HiveConfig hiveConfig = new HiveConfig() + .setParquetTimeZone("America/Los_Angeles") + .setRcfileTimeZone("America/Los_Angeles"); + + setup(testDbName, hiveConfig, metastore); + } + + @AfterClass(alwaysRun = true) + public void cleanup() + throws IOException + { + try { + getMetastoreClient().dropDatabase(HIVE_IDENTITY, testDbName); + } + finally { + deleteRecursively(tempDir.toPath(), ALLOW_INSECURE); + } + } + + @Override + protected ConnectorTableHandle getTableHandle(ConnectorMetadata metadata, SchemaTableName tableName) + { + if (tableName.getTableName().startsWith(TEMPORARY_TABLE_PREFIX)) { + return super.getTableHandle(metadata, tableName); + } + throw new SkipException("tests using existing tables are not supported"); + } + + @Override + public void testGetAllTableNames() + { + } + + @Override + public void testGetAllTableColumns() + { + } + + @Override + public void testGetAllTableColumnsInSchema() + { + } + + @Override + public void testGetTableNames() + { + } + + @Override + public void testGetTableSchemaOffline() + { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveBenchmarkQueryRunner.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveBenchmarkQueryRunner.java new file mode 100644 index 00000000..e92bca31 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveBenchmarkQueryRunner.java @@ -0,0 +1,96 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.Session; +import io.prestosql.benchmark.BenchmarkSuite; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.tpch.TpchConnectorFactory; +import io.prestosql.spi.security.PrincipalType; +import io.prestosql.testing.LocalQueryRunner; + +import java.io.File; +import java.io.IOException; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.prestosql.plugin.hive.metastore.file.FileHiveMetastore.createTestingFileHiveMetastore; +import static io.prestosql.testing.TestingConnectorSession.SESSION; +import static io.prestosql.testing.TestingSession.testSessionBuilder; +import static java.nio.file.Files.createTempDirectory; +import static java.util.Objects.requireNonNull; + +public final class HiveBenchmarkQueryRunner +{ + private HiveBenchmarkQueryRunner() + { + } + + public static void main(String[] args) + throws IOException + { + String outputDirectory = requireNonNull(System.getProperty("outputDirectory"), "Must specify -DoutputDirectory=..."); + File tempDir = createTempDirectory("HiveBenchmarkQueryRunner").toFile(); + try (LocalQueryRunner localQueryRunner = createLocalQueryRunner(tempDir)) { + new BenchmarkSuite(localQueryRunner, outputDirectory).runAllBenchmarks(); + } + finally { + deleteRecursively(tempDir.toPath(), ALLOW_INSECURE); + } + } + + public static LocalQueryRunner createLocalQueryRunner(File tempDir) + { + Session session = testSessionBuilder() + .setCatalog("hive") + .setSchema("tpch") + .build(); + + LocalQueryRunner localQueryRunner = new LocalQueryRunner(session); + + // add tpch + localQueryRunner.createCatalog("tpch", new TpchConnectorFactory(1), ImmutableMap.of()); + + // add hive + File hiveDir = new File(tempDir, "hive_data"); + HiveMetastore metastore = createTestingFileHiveMetastore(hiveDir); + HiveIdentity identity = new HiveIdentity(SESSION); + metastore.createDatabase(identity, + Database.builder() + .setDatabaseName("tpch") + .setOwnerName("public") + .setOwnerType(PrincipalType.ROLE) + .build()); + + HiveConnectorFactory hiveConnectorFactory = new HiveConnectorFactory( + "hive", + HiveBenchmarkQueryRunner.class.getClassLoader(), + Optional.of(metastore)); + + Map hiveCatalogConfig = ImmutableMap.builder() + .put("hive.max-split-size", "10GB") + .build(); + + localQueryRunner.createCatalog("hive", hiveConnectorFactory, hiveCatalogConfig); + + localQueryRunner.execute("CREATE TABLE orders AS SELECT * FROM tpch.sf1.orders"); + localQueryRunner.execute("CREATE TABLE lineitem AS SELECT * FROM tpch.sf1.lineitem"); + return localQueryRunner; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveQueryRunner.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveQueryRunner.java new file mode 100644 index 00000000..fcae4779 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveQueryRunner.java @@ -0,0 +1,350 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.log.Logger; +import io.airlift.log.Logging; +import io.airlift.tpch.TpchTable; +import io.hetu.core.cube.startree.StarTreePlugin; +import io.hetu.core.metastore.HetuMetastorePlugin; +import io.prestosql.Session; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.file.FileHiveMetastore; +import io.prestosql.plugin.tpch.TpchPlugin; +import io.prestosql.spi.connector.QualifiedObjectName; +import io.prestosql.spi.security.Identity; +import io.prestosql.spi.security.PrincipalType; +import io.prestosql.spi.security.SelectedRole; +import io.prestosql.testing.QueryRunner; +import io.prestosql.tests.DistributedQueryRunner; +import org.intellij.lang.annotations.Language; +import org.joda.time.DateTimeZone; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +import static io.airlift.log.Level.WARN; +import static io.airlift.units.Duration.nanosSince; +import static io.prestosql.plugin.tpch.TpchMetadata.TINY_SCHEMA_NAME; +import static io.prestosql.spi.security.SelectedRole.Type.ROLE; +import static io.prestosql.testing.TestingConnectorSession.SESSION; +import static io.prestosql.testing.TestingSession.testSessionBuilder; +import static io.prestosql.tests.QueryAssertions.copyTpchTables; +import static java.lang.String.format; +import static java.nio.file.Files.createDirectories; +import static java.util.Locale.ENGLISH; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.testng.Assert.assertEquals; + +public final class HiveQueryRunner +{ + private static final Logger log = Logger.get(HiveQueryRunner.class); + + private HiveQueryRunner() + { + } + + public static final String HIVE_CATALOG = "hive"; + public static final String HIVE_BUCKETED_CATALOG = "hive_bucketed"; + public static final String HIVE_AUTO_VACUUM_CATALOG = "hive_auto_vacuum"; + public static final String TPCH_SCHEMA = "tpch"; + private static final String TPCH_BUCKETED_SCHEMA = "tpch_bucketed"; + private static final DateTimeZone TIME_ZONE = DateTimeZone.forID("America/Bahia_Banderas"); + + public static DistributedQueryRunner createQueryRunner(TpchTable... 
tables) + throws Exception + { + return createQueryRunner(ImmutableList.copyOf(tables)); + } + + public static DistributedQueryRunner createQueryRunner(Iterable> tables) + throws Exception + { + return createQueryRunner(tables, ImmutableMap.of(), Optional.empty()); + } + + public static DistributedQueryRunner createQueryRunnerWithStateStore(Iterable> tables) + throws Exception + { + return createQueryRunner(tables, ImmutableMap.of(), "sql-standard", ImmutableMap.of(), Optional.empty(), true, ""); + } + + public static DistributedQueryRunner createQueryRunner(Iterable> tables, Map extraProperties, Optional baseDataDir) + throws Exception + { + return createQueryRunner(tables, extraProperties, "sql-standard", ImmutableMap.of(), baseDataDir, false, ""); + } + + public static DistributedQueryRunner createQueryRunner(Iterable> tables, Map extraProperties, String security, Map extraHiveProperties, Optional baseDataDir, boolean hasStateStore) + throws Exception + { + return createQueryRunner(tables, extraProperties, security, extraHiveProperties, baseDataDir, hasStateStore, ""); + } + + public static DistributedQueryRunner createQueryRunner(Iterable> tables, Map extraProperties, String security, Map extraHiveProperties, Optional baseDataDir, boolean hasStateStore, String jdbcUrl) + throws Exception + { + assertEquals(DateTimeZone.getDefault(), TIME_ZONE, "Timezone not configured correctly. Add -Duser.timezone=America/Bahia_Banderas to your JVM arguments"); + setupLogging(); + + DistributedQueryRunner queryRunner = null; + + if (hasStateStore) { + queryRunner = DistributedQueryRunner + .builder(createSession(Optional.of(new SelectedRole(ROLE, Optional.of("admin"))))) + .setNodeCount(4) + .setExtraProperties(extraProperties) + .setBaseDataDir(baseDataDir) + .buildWithStateStore(); + } + else { + Map configProperties = new HashMap<>(); + configProperties.put("auto-vacuum.enabled", "true"); + configProperties.put("optimizer.cte-reuse-enabled", "true"); + configProperties.put("auto-vacuum.scan.interval", "15s"); + configProperties.put("hetu.split-cache-map.enabled", "true"); + + queryRunner = DistributedQueryRunner + .builder(createSession(Optional.of(new SelectedRole(ROLE, Optional.of("admin"))))) + .setNodeCount(4) + .setCoordinatorProperties(configProperties) + .setExtraProperties(extraProperties) + .setBaseDataDir(baseDataDir) + .build(); + } + + try { + if (jdbcUrl != null && !jdbcUrl.isEmpty()) { + File directory = new File(""); + String courseFile = directory.getCanonicalPath(); + System.setProperty("config", courseFile + "/etc/"); + String configDir = System.getProperty("config"); + String hetumetastoreConfig = configDir + "hetu-metastore.properties"; + File file = new File(configDir); + if (!file.exists()) { + file.mkdirs(); + } + File file2 = new File(configDir, "hetu-metastore.properties"); + if (!file2.exists()) { + try { + file2.createNewFile(); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(hetumetastoreConfig))) { + bufferedWriter.write("hetu.metastore.db.url = " + jdbcUrl); + bufferedWriter.write("\n"); + bufferedWriter.write("hetu.metastore.type = jdbc\n"); + bufferedWriter.write("hetu.metastore.db.user = user\n"); + bufferedWriter.write("hetu.metastore.db.password = testpass\n"); + bufferedWriter.write("hetu.metastore.cache.ttl = 0s"); + } + queryRunner.installPlugin(new HetuMetastorePlugin()); + queryRunner.getCoordinator().loadMetastore(); + queryRunner.installPlugin(new StarTreePlugin()); + } 
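+            // Register the TPCH source catalog and a file-backed Hive metastore rooted in the
+            // query runner's base data directory, then create the three hive test catalogs from it.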
+ + queryRunner.installPlugin(new TpchPlugin()); + queryRunner.createCatalog("tpch", "tpch"); + + File baseDir = queryRunner.getCoordinator().getBaseDataDir().resolve("hive_data").toFile(); + + HiveConfig hiveConfig = new HiveConfig(); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hiveConfig), ImmutableSet.of()); + HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hiveConfig, new NoHdfsAuthentication()); + + FileHiveMetastore metastore = new FileHiveMetastore(hdfsEnvironment, baseDir.toURI().toString(), "test"); + queryRunner.installPlugin(new HivePlugin(HIVE_CATALOG, Optional.of(metastore))); + + Map hiveProperties = ImmutableMap.builder() + .putAll(extraHiveProperties) + .put("hive.rcfile.time-zone", TIME_ZONE.getID()) + .put("hive.parquet.time-zone", TIME_ZONE.getID()) + .put("hive.security", security) + .put("hive.max-partitions-per-scan", "1000") + .put("hive.assume-canonical-partition-keys", "true") + .build(); + Map hiveBucketedProperties = ImmutableMap.builder() + .putAll(hiveProperties) + .put("hive.max-initial-split-size", "10kB") // so that each bucket has multiple splits + .put("hive.max-split-size", "10kB") // so that each bucket has multiple splits + .put("hive.storage-format", "TEXTFILE") // so that there's no minimum split size for the file + .put("hive.compression-codec", "NONE") // so that the file is splittable + .build(); + Map hiveAutoVacuumProperties = ImmutableMap.builder() + .putAll(hiveProperties) + .put("hive.auto-vacuum-enabled", "true") + .put("hive.vacuum-collector-interval", "15s") + .build(); + queryRunner.createCatalog(HIVE_AUTO_VACUUM_CATALOG, HIVE_CATALOG, hiveAutoVacuumProperties); + queryRunner.createCatalog(HIVE_CATALOG, HIVE_CATALOG, hiveProperties); + queryRunner.createCatalog(HIVE_BUCKETED_CATALOG, HIVE_CATALOG, hiveBucketedProperties); + + HiveIdentity identity = new HiveIdentity(SESSION); + if (!metastore.getDatabase(TPCH_SCHEMA).isPresent()) { + metastore.createDatabase(identity, createDatabaseMetastoreObject(TPCH_SCHEMA)); + copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, createSession(Optional.empty()), tables); + } + + if (!metastore.getDatabase(TPCH_BUCKETED_SCHEMA).isPresent()) { + metastore.createDatabase(identity, createDatabaseMetastoreObject(TPCH_BUCKETED_SCHEMA)); + copyTpchTablesBucketed(queryRunner, "tpch", TINY_SCHEMA_NAME, createBucketedSession(Optional.empty()), tables); + } + + return queryRunner; + } + catch (Exception e) { + queryRunner.close(); + throw e; + } + } + + private static void setupLogging() + { + Logging logging = Logging.initialize(); + logging.setLevel("org.apache.parquet.hadoop", WARN); + } + + private static Database createDatabaseMetastoreObject(String name) + { + return Database.builder() + .setDatabaseName(name) + .setOwnerName("public") + .setOwnerType(PrincipalType.ROLE) + .build(); + } + + public static Session createSession(Optional role) + { + return testSessionBuilder() + .setIdentity(new Identity( + "hive", + Optional.empty(), + role.map(selectedRole -> ImmutableMap.of("hive", selectedRole)) + .orElse(ImmutableMap.of()))) + .setCatalog(HIVE_CATALOG) + .setSchema(TPCH_SCHEMA) + .build(); + } + + public static Session createBucketedSession(Optional role) + { + return testSessionBuilder() + .setIdentity(new Identity( + "hive", + Optional.empty(), + role.map(selectedRole -> ImmutableMap.of("hive", selectedRole)) + .orElse(ImmutableMap.of()))) + .setCatalog(HIVE_BUCKETED_CATALOG) + .setSchema(TPCH_BUCKETED_SCHEMA) + .build(); + 
} + + public static Session createAutoVacuumSession(Optional role) + { + return testSessionBuilder() + .setIdentity(new Identity( + "openLooKeng", + Optional.empty(), + role.map(selectedRole -> ImmutableMap.of("hive", selectedRole)) + .orElse(ImmutableMap.of()))) + .setCatalog(HIVE_AUTO_VACUUM_CATALOG) + .setSchema(TPCH_SCHEMA) + .build(); + } + + public static void copyTpchTablesBucketed( + QueryRunner queryRunner, + String sourceCatalog, + String sourceSchema, + Session session, + Iterable> tables) + { + log.info("Loading data from %s.%s...", sourceCatalog, sourceSchema); + long startTime = System.nanoTime(); + for (TpchTable table : tables) { + copyTableBucketed(queryRunner, new QualifiedObjectName(sourceCatalog, sourceSchema, table.getTableName().toLowerCase(ENGLISH)), session); + } + log.info("Loading from %s.%s complete in %s", sourceCatalog, sourceSchema, nanosSince(startTime).toString(SECONDS)); + } + + private static void copyTableBucketed(QueryRunner queryRunner, QualifiedObjectName table, Session session) + { + long start = System.nanoTime(); + log.info("Running import for %s", table.getObjectName()); + @Language("SQL") String sql; + switch (table.getObjectName()) { + case "part": + case "partsupp": + case "supplier": + case "nation": + case "region": + sql = format("CREATE TABLE %s AS SELECT * FROM %s", table.getObjectName(), table); + break; + case "lineitem": + sql = format("CREATE TABLE %s WITH (bucketed_by=array['orderkey'], bucket_count=11) AS SELECT * FROM %s", table.getObjectName(), table); + break; + case "customer": + sql = format("CREATE TABLE %s WITH (bucketed_by=array['custkey'], bucket_count=11) AS SELECT * FROM %s", table.getObjectName(), table); + break; + case "orders": + sql = format("CREATE TABLE %s WITH (bucketed_by=array['custkey'], bucket_count=11) AS SELECT * FROM %s", table.getObjectName(), table); + break; + default: + throw new UnsupportedOperationException(); + } + long rows = (Long) queryRunner.execute(session, sql).getMaterializedRows().get(0).getField(0); + log.info("Imported %s rows for %s in %s", rows, table.getObjectName(), nanosSince(start).convertToMostSuccinctTimeUnit()); + } + + public static void main(String[] args) + throws Exception + { + // You need to add "--user admin" to your CLI and execute "SET ROLE admin" for queries to work + Logging.initialize(); + + Optional baseDataDir = Optional.empty(); + if (args.length > 0) { + if (args.length != 1) { + System.err.println("usage: HiveQueryRunner [baseDataDir]"); + System.exit(1); + } + + Path path = Paths.get(args[0]); + createDirectories(path); + baseDataDir = Optional.of(path); + } + + DistributedQueryRunner queryRunner = createQueryRunner(TpchTable.getTables(), ImmutableMap.of("http-server.http.port", "8080"), baseDataDir); + Thread.sleep(10); + Logger log = Logger.get(DistributedQueryRunner.class); + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveTestUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveTestUtils.java new file mode 100644 index 00000000..ff22b919 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/HiveTestUtils.java @@ -0,0 +1,272 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import io.airlift.slice.Slice; +import io.prestosql.PagesIndexPageSorter; +import io.prestosql.metadata.Metadata; +import io.prestosql.operator.PagesIndex; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.gcs.GoogleGcsConfigurationInitializer; +import io.prestosql.plugin.hive.gcs.HiveGcsConfig; +import io.prestosql.plugin.hive.orc.OrcPageSourceFactory; +import io.prestosql.plugin.hive.orc.OrcSelectivePageSourceFactory; +import io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory; +import io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory; +import io.prestosql.plugin.hive.s3.HiveS3Config; +import io.prestosql.plugin.hive.s3.PrestoS3ConfigurationInitializer; +import io.prestosql.plugin.hive.util.IndexCache; +import io.prestosql.plugin.hive.util.IndexCacheLoader; +import io.prestosql.spi.PageSorter; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.dynamicfilter.BloomFilterDynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.function.FunctionHandle; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.MapType; +import io.prestosql.spi.type.NamedTypeSignature; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.StandardTypes; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.TypeSignatureParameter; +import io.prestosql.spi.util.BloomFilter; +import io.prestosql.testing.NoOpIndexClient; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.type.InternalTypeManager; +import org.apache.hadoop.hive.common.type.Timestamp; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.lang.invoke.MethodHandle; +import java.math.BigDecimal; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.function.Supplier; + +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.spi.function.OperatorType.IS_DISTINCT_FROM; +import static io.prestosql.spi.type.Decimals.encodeScaledValue; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static io.prestosql.sql.analyzer.TypeSignatureProvider.fromTypes; +import static java.util.stream.Collectors.toList; + +public final class HiveTestUtils +{ + private HiveTestUtils() + { + } + + public static final ConnectorSession SESSION = new 
TestingConnectorSession( + new HiveSessionProperties(new HiveConfig().setOrcLazyReadSmallRanges(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + public static final Metadata METADATA = createTestMetadataManager(); + public static final TypeManager TYPE_MANAGER = new InternalTypeManager(METADATA.getFunctionAndTypeManager()); + + public static final HdfsEnvironment HDFS_ENVIRONMENT = createTestHdfsEnvironment(new HiveConfig()); + + public static final PageSorter PAGE_SORTER = new PagesIndexPageSorter(new PagesIndex.TestingFactory(false)); + + public static Set getDefaultHiveDataStreamFactories(HiveConfig hiveConfig) + { + FileFormatDataSourceStats stats = new FileFormatDataSourceStats(); + HdfsEnvironment testHdfsEnvironment = createTestHdfsEnvironment(hiveConfig); + return ImmutableSet.builder() + .add(new RcFilePageSourceFactory(TYPE_MANAGER, testHdfsEnvironment, stats, hiveConfig)) + .add(new OrcPageSourceFactory(TYPE_MANAGER, hiveConfig, testHdfsEnvironment, stats, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled()))) + .add(new ParquetPageSourceFactory(TYPE_MANAGER, testHdfsEnvironment, stats, new HiveConfig())) + .build(); + } + + public static HiveRecordCursorProvider createGenericHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment) + { + return new GenericHiveRecordCursorProvider(hdfsEnvironment); + } + + public static Set getDefaultHiveSelectiveFactories(HiveConfig hiveConfig) + { + FileFormatDataSourceStats stats = new FileFormatDataSourceStats(); + HdfsEnvironment testHdfsEnvironment = createTestHdfsEnvironment(hiveConfig); + return ImmutableSet.builder() + .add(new OrcSelectivePageSourceFactory(TYPE_MANAGER, hiveConfig, testHdfsEnvironment, stats, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled()))) + .build(); + } + + public static IndexCache getNoOpIndexCache() + { + return new IndexCache(new IndexCacheLoader(null), new NoOpIndexClient()) + { + @Override + public List getIndices(String catalog, String table, HiveSplit hiveSplit, TupleDomain effectivePredicate, List partitions) + { + return ImmutableList.of(); + } + }; + } + + public static Set getDefaultHiveRecordCursorProvider(HiveConfig hiveConfig) + { 
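+        // Returns a single GenericHiveRecordCursorProvider backed by a test HDFS environment
+        // built from the supplied HiveConfig.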
+ HdfsEnvironment testHdfsEnvironment = createTestHdfsEnvironment(hiveConfig); + return ImmutableSet.builder() + .add(new GenericHiveRecordCursorProvider(testHdfsEnvironment)) + .build(); + } + + public static Set getDefaultHiveFileWriterFactories(HiveConfig hiveConfig) + { + HdfsEnvironment testHdfsEnvironment = createTestHdfsEnvironment(hiveConfig); + return ImmutableSet.builder() + .add(new RcFileFileWriterFactory(testHdfsEnvironment, TYPE_MANAGER, new NodeVersion("test_version"), hiveConfig, new FileFormatDataSourceStats())) + .add(getDefaultOrcFileWriterFactory(hiveConfig)) + .build(); + } + + public static OrcFileWriterFactory getDefaultOrcFileWriterFactory(HiveConfig hiveConfig) + { + HdfsEnvironment testHdfsEnvironment = createTestHdfsEnvironment(hiveConfig); + return new OrcFileWriterFactory( + testHdfsEnvironment, + TYPE_MANAGER, + new NodeVersion("test_version"), + hiveConfig, + new FileFormatDataSourceStats(), + new OrcFileWriterConfig()); + } + + public static List getTypes(List columnHandles) + { + ImmutableList.Builder types = ImmutableList.builder(); + for (ColumnHandle columnHandle : columnHandles) { + types.add(METADATA.getType(((HiveColumnHandle) columnHandle).getTypeSignature())); + } + return types.build(); + } + + public static HdfsEnvironment createTestHdfsEnvironment(HiveConfig config) + { + HdfsConfiguration hdfsConfig = new HiveHdfsConfiguration( + new HdfsConfigurationInitializer( + config, + ImmutableSet.of( + new PrestoS3ConfigurationInitializer(new HiveS3Config()), + new GoogleGcsConfigurationInitializer(new HiveGcsConfig()))), + ImmutableSet.of()); + return new HdfsEnvironment(hdfsConfig, config, new NoHdfsAuthentication()); + } + + public static MapType mapType(Type keyType, Type valueType) + { + return (MapType) METADATA.getFunctionAndTypeManager().getParameterizedType(StandardTypes.MAP, ImmutableList.of( + TypeSignatureParameter.of(keyType.getTypeSignature()), + TypeSignatureParameter.of(valueType.getTypeSignature()))); + } + + public static ArrayType arrayType(Type elementType) + { + return (ArrayType) METADATA.getFunctionAndTypeManager().getParameterizedType( + StandardTypes.ARRAY, + ImmutableList.of(TypeSignatureParameter.of(elementType.getTypeSignature()))); + } + + public static RowType rowType(List elementTypeSignatures) + { + return (RowType) METADATA.getFunctionAndTypeManager().getParameterizedType( + StandardTypes.ROW, + ImmutableList.copyOf(elementTypeSignatures.stream() + .map(TypeSignatureParameter::of) + .collect(toList()))); + } + + public static Long shortDecimal(String value) + { + return new BigDecimal(value).unscaledValue().longValueExact(); + } + + public static Slice longDecimal(String value) + { + return encodeScaledValue(new BigDecimal(value)); + } + + public static MethodHandle distinctFromOperator(Type type) + { + FunctionHandle operatorHandle = METADATA.getFunctionAndTypeManager().resolveOperatorFunctionHandle(IS_DISTINCT_FROM, fromTypes(type, type)); + return METADATA.getFunctionAndTypeManager().getBuiltInScalarFunctionImplementation(operatorHandle).getMethodHandle(); + } + + public static boolean isDistinctFrom(MethodHandle handle, Block left, Block right) + { + try { + return (boolean) handle.invokeExact(left, left == null, right, right == null); + } + catch (Throwable t) { + throw new AssertionError(t); + } + } + + public static Supplier>> createTestDynamicFilterSupplier(String filterKey, List filterValues) + { + Supplier>> dynamicFilterSupplier = () -> { + Set dynamicFilters = new HashSet<>(); + ColumnHandle columnHandle = new 
HiveColumnHandle(filterKey, HIVE_LONG, parseTypeSignature(StandardTypes.BIGINT), 0, PARTITION_KEY, Optional.empty()); + BloomFilter filter = new BloomFilter(1024 * 1024, 0.01); + filterValues.stream().forEach(value -> filter.add(value)); + + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { + filter.writeTo(out); + dynamicFilters.add(new BloomFilterDynamicFilter("filter", columnHandle, out.toByteArray(), DynamicFilter.Type.GLOBAL)); + } + catch (IOException e) { + } + + return ImmutableList.of(dynamicFilters); + }; + + return dynamicFilterSupplier; + } + + public static Timestamp hiveTimestamp(LocalDateTime local) + { + return Timestamp.ofEpochSecond(local.toEpochSecond(ZoneOffset.UTC), local.getNano()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestBackgroundHiveSplitLoader.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestBackgroundHiveSplitLoader.java new file mode 100644 index 00000000..f4fa4e77 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestBackgroundHiveSplitLoader.java @@ -0,0 +1,1119 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.io.MoreFiles; +import io.airlift.stats.CounterStat; +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.HiveColumnHandle.ColumnType; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorVacuumTableHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.resourcegroups.QueryType; +import io.prestosql.spi.type.TestingTypeManager; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.testing.TestingConnectorSession; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.permission.FsPermission; +import 
org.apache.hadoop.hive.common.ValidReaderWriteIdList; +import org.apache.hadoop.hive.common.ValidWriteIdList; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.util.Progressable; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.UUID; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.slice.Slices.utf8Slice; +import static io.airlift.units.DataSize.Unit.GIGABYTE; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V1; +import static io.prestosql.plugin.hive.HiveColumnHandle.pathColumnHandle; +import static io.prestosql.plugin.hive.HiveTestUtils.createTestDynamicFilterSupplier; +import static io.prestosql.plugin.hive.HiveTestUtils.createTestHdfsEnvironment; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.HiveUtil.getRegularColumnHandles; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED; +import static io.prestosql.spi.predicate.TupleDomain.withColumnDomains; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertThrows; +import static org.testng.Assert.assertTrue; + +public class TestBackgroundHiveSplitLoader +{ + private static final int BUCKET_COUNT = 2; + + private static final String SAMPLE_PATH = "hdfs://VOL1:9000/db_name/table_name/000000_0"; + private static final String SAMPLE_PATH_FILTERED = "hdfs://VOL1:9000/db_name/table_name/000000_1"; + + private static final Path RETURNED_PATH = new Path(SAMPLE_PATH); + private static final Path FILTERED_PATH = new Path(SAMPLE_PATH_FILTERED); + + private static final ExecutorService EXECUTOR = newCachedThreadPool(daemonThreadsNamed("test-%s")); + + private static final TupleDomain RETURNED_PATH_DOMAIN = withColumnDomains( + ImmutableMap.of( + pathColumnHandle(), + Domain.singleValue(VARCHAR, utf8Slice(RETURNED_PATH.toString())))); + + private static final List TEST_FILES = ImmutableList.of( + locatedFileStatus(RETURNED_PATH), + locatedFileStatus(FILTERED_PATH)); + + 
private static final List PARTITION_COLUMNS = ImmutableList.of( + new Column("partitionColumn", HIVE_INT, Optional.empty())); + private static final List BUCKET_COLUMN_HANDLES = ImmutableList.of( + new HiveColumnHandle("col1", HIVE_INT, INTEGER.getTypeSignature(), 0, ColumnType.REGULAR, Optional.empty())); + + private static final Optional BUCKET_PROPERTY = Optional.of( + new HiveBucketProperty(ImmutableList.of("col1"), BUCKETING_V1, BUCKET_COUNT, ImmutableList.of())); + + private static final Table SIMPLE_TABLE = table(ImmutableList.of(), Optional.empty(), ImmutableMap.of()); + private static final Table PARTITIONED_TABLE = table(PARTITION_COLUMNS, BUCKET_PROPERTY, ImmutableMap.of()); + + private static final Column TABLE_COLUMN = new Column( + "column", + HiveType.HIVE_INT, + Optional.of("comment")); + private static final Storage TABLE_STORAGE = new Storage( + StorageFormat.create("serde", "input", "output"), + "location", + Optional.empty(), + true, + ImmutableMap.of("param", "value2")); + + @Test + public void testNoPathFilter() + throws Exception + { + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + TEST_FILES, + TupleDomain.none()); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + + assertEquals(drain(hiveSplitSource).size(), 2); + } + + @Test + public void testPathFilter() + throws Exception + { + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + TEST_FILES, + RETURNED_PATH_DOMAIN); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + List paths = drain(hiveSplitSource); + assertEquals(paths.size(), 1); + assertEquals(paths.get(0), RETURNED_PATH.toString()); + } + + @Test + public void testPathFilterOneBucketMatchPartitionedTable() + throws Exception + { + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + TEST_FILES, + RETURNED_PATH_DOMAIN, + Optional.of(new HiveBucketing.HiveBucketFilter(ImmutableSet.of(0, 1))), + PARTITIONED_TABLE, + Optional.of(new HiveBucketHandle(BUCKET_COLUMN_HANDLES, BUCKETING_V1, BUCKET_COUNT, BUCKET_COUNT))); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + List paths = drain(hiveSplitSource); + assertEquals(paths.size(), 1); + assertEquals(paths.get(0), RETURNED_PATH.toString()); + } + + @Test + public void testPathFilterBucketedPartitionedTable() + throws Exception + { + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + TEST_FILES, + RETURNED_PATH_DOMAIN, + Optional.empty(), + PARTITIONED_TABLE, + Optional.of( + new HiveBucketHandle( + getRegularColumnHandles(PARTITIONED_TABLE), + BUCKETING_V1, + BUCKET_COUNT, + BUCKET_COUNT))); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + List paths = drain(hiveSplitSource); + assertEquals(paths.size(), 1); + assertEquals(paths.get(0), RETURNED_PATH.toString()); + } + + @Test + public void testEmptyFileWithNoBlocks() + throws Exception + { + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + ImmutableList.of(locatedFileStatusWithNoBlocks(RETURNED_PATH)), + TupleDomain.none()); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + + 
List splits = drainSplits(hiveSplitSource); + assertEquals(splits.size(), 1); + assertEquals(splits.get(0).getPath(), RETURNED_PATH.toString()); + assertEquals(splits.get(0).getLength(), 0); + } + + @Test + public void testNoHangIfPartitionIsOffline() + throws Exception + { + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoaderOfflinePartitions(); + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + + assertThrows(RuntimeException.class, () -> drain(hiveSplitSource)); + assertThrows(RuntimeException.class, () -> hiveSplitSource.isFinished()); + } + + @Test + public void testCachedDirectoryLister() + throws Exception + { + CachingDirectoryLister cachingDirectoryLister = new CachingDirectoryLister(new Duration(5, TimeUnit.MINUTES), 1000, ImmutableList.of("test_dbname.test_table")); + assertEquals(cachingDirectoryLister.getRequestCount(), 0); + + int totalCount = 1000; + CountDownLatch firstVisit = new CountDownLatch(1); + List>> futures = new ArrayList<>(); + + futures.add(EXECUTOR.submit(() -> { + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader(TEST_FILES, cachingDirectoryLister); + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + try { + return drainSplits(hiveSplitSource); + } + finally { + firstVisit.countDown(); + } + })); + + for (int i = 0; i < totalCount - 1; i++) { + futures.add(EXECUTOR.submit(() -> { + firstVisit.await(); + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader(TEST_FILES, cachingDirectoryLister); + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + return drainSplits(hiveSplitSource); + })); + } + + for (Future> future : futures) { + assertEquals(future.get().size(), TEST_FILES.size()); + } + assertEquals(cachingDirectoryLister.getRequestCount(), totalCount); + assertEquals(cachingDirectoryLister.getHitCount(), totalCount - 1); + assertEquals(cachingDirectoryLister.getMissCount(), 1); + } + + @Test + public void testGetBucketNumber() + { + assertEquals(HiveUtil.getBucketNumber("0234_0"), OptionalInt.of(234)); + assertEquals(HiveUtil.getBucketNumber("000234_0"), OptionalInt.of(234)); + assertEquals(HiveUtil.getBucketNumber("0234_99"), OptionalInt.of(234)); + assertEquals(HiveUtil.getBucketNumber("0234_0.txt"), OptionalInt.of(234)); + assertEquals(HiveUtil.getBucketNumber("0234_0_copy_1"), OptionalInt.of(234)); + assertEquals(HiveUtil.getBucketNumber("20190526_072952_00009_fn7s5_bucket-00234"), OptionalInt.of(234)); + assertEquals(HiveUtil.getBucketNumber("20190526_072952_00009_fn7s5_bucket-00234.txt"), OptionalInt.of(234)); + assertEquals(HiveUtil.getBucketNumber("20190526_235847_87654_fn7s5_bucket-56789"), OptionalInt.of(56789)); + + assertEquals(HiveUtil.getBucketNumber("234_99"), OptionalInt.empty()); + assertEquals(HiveUtil.getBucketNumber("0234.txt"), OptionalInt.empty()); + assertEquals(HiveUtil.getBucketNumber("0234.txt"), OptionalInt.empty()); + } + + @Test(dataProvider = "testPropagateExceptionDataProvider", timeOut = 60_000) + public void testPropagateException(boolean error, int threads) + { + AtomicBoolean iteratorUsedAfterException = new AtomicBoolean(); + AtomicBoolean isFirstTime = new AtomicBoolean(true); + + BackgroundHiveSplitLoader backgroundHiveSplitLoader = new BackgroundHiveSplitLoader( + SIMPLE_TABLE, + () -> new 
Iterator() + { + private boolean threw; + + @Override + public boolean hasNext() + { + iteratorUsedAfterException.compareAndSet(false, threw); + return !threw; + } + + @Override + public HivePartitionMetadata next() + { + // isFirstTime variable is used to skip throwing exception from next method called in BackgroundHiveSplitLoader constructor + if (!isFirstTime.compareAndSet(true, false)) { + iteratorUsedAfterException.compareAndSet(false, threw); + threw = true; + if (error) { + throw new Error("loading error occurred"); + } + throw new RuntimeException("loading error occurred"); + } + return new HivePartitionMetadata( + new HivePartition(new SchemaTableName("testSchema", "table_name")), + Optional.empty(), + ImmutableMap.of()); + } + }, + TupleDomain.all(), + BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(Optional.empty(), Optional.empty()), + HiveTestUtils.SESSION, + new TestingHdfsEnvironment(TEST_FILES), + new NamenodeStats(), + new CachingDirectoryLister(new HiveConfig()), + EXECUTOR, + threads, + false, + Optional.empty(), + null, + Optional.empty(), + Collections.emptyMap(), null); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + + assertThatThrownBy(() -> drain(hiveSplitSource)) + .hasMessageEndingWith("loading error occurred"); + + assertThatThrownBy(hiveSplitSource::isFinished) + .hasMessageEndingWith("loading error occurred"); + + if (threads == 1) { + assertFalse(iteratorUsedAfterException.get()); + } + } + + @DataProvider + public Object[][] testPropagateExceptionDataProvider() + { + return new Object[][] { + {false, 1}, + {true, 1}, + {false, 2}, + {true, 2}, + {false, 4}, + {true, 4}, + }; + } + + @Test + public void testSplitsGenerationWithAbortedTransactions() + throws Exception + { + ImmutableMap tableParams = ImmutableMap.of( + "transactional", "true", + "transactional_properties", "insert_only"); + testAbortedTransactions(tableParams, Optional.empty(), ImmutableMap.of()); + } + + @Test + public void testSplitsGenerationWithAbortedTransactionsForIUDTable() + throws Exception + { + ImmutableMap tableParams = ImmutableMap.of( + "transactional", "true"); + testAbortedTransactions(tableParams, Optional.empty(), ImmutableMap.of()); + } + + @Test + public void testSplitsGenerationWithAbortedTransactionsForVacuum() + throws Exception + { + ImmutableMap tableParams = ImmutableMap.of( + "transactional", "true"); + testAbortedTransactions(tableParams, Optional.of(QueryType.VACUUM), ImmutableMap.of("FULL", false)); + } + + private void testAbortedTransactions(ImmutableMap tableParameters, + Optional queryType, ImmutableMap queryInfo) + throws Exception + { + java.nio.file.Path tablePath = Files.createTempDirectory(UUID.randomUUID().toString()); + Table table = table( + tablePath.toString(), + ImmutableList.of(), + Optional.empty(), + tableParameters); + + List filePaths = ImmutableList.of( + tablePath + "/delta_0000001_0000001_0000/_orc_acid_version", + tablePath + "/delta_0000001_0000001_0000/bucket_00000", + tablePath + "/delta_0000002_0000002_0000/_orc_acid_version", + tablePath + "/delta_0000002_0000002_0000/bucket_00000", + tablePath + "/delta_0000003_0000003_0000/_orc_acid_version", + tablePath + "/delta_0000003_0000003_0000/bucket_00000"); + + try { + for (String path : filePaths) { + File file = new File(path); + assertTrue(file.getParentFile().exists() || file.getParentFile().mkdirs(), "Failed creating directory " + file.getParentFile()); + createOrcAcidFile(file); + } 
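+            // The paths above follow the Hive ACID delta directory layout delta_<minWriteId>_<maxWriteId>_<statementId>,
+            // each delta holding an _orc_acid_version marker and a bucket_00000 data file.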
+ + // ValidWriteIdList is of format <currentTxn>$<schema>.<table>:<highWatermark>:<minOpenWriteId>::<abortedTxns>
:::: + // This writeId list has high watermark transaction=3 and aborted transaction=2 + String validWriteIdsList = format("4$%s.%s:3:9223372036854775807::2", table.getDatabaseName(), table.getTableName()); + + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + createTestHdfsEnvironment(new HiveConfig()), + TupleDomain.none(), + Optional.empty(), + table, + Optional.empty(), + Optional.of(new ValidReaderWriteIdList(validWriteIdsList)), + queryType, + queryInfo); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + List splits = drain(hiveSplitSource); + assertTrue(splits.stream().anyMatch(p -> p.contains(filePaths.get(1))), format("%s not found in splits %s", filePaths.get(1), splits)); + assertTrue(splits.stream().anyMatch(p -> p.contains(filePaths.get(5))), format("%s not found in splits %s", filePaths.get(5), splits)); + assertFalse(splits.stream().anyMatch(p -> p.contains(filePaths.get(3))), format("Aborted txn %s found in splits %s", filePaths.get(3), splits)); + } + finally { + Files.walk(tablePath).sorted(Comparator.reverseOrder()).map(java.nio.file.Path::toFile).forEach(File::delete); + } + } + + private static void createOrcAcidFile(File file) + throws IOException + { + if (file.getName().equals("_orc_acid_version")) { + Files.write(file.toPath(), "2".getBytes(UTF_8)); + return; + } + checkState(file.createNewFile(), "Failed to create file %s", file); + } + + @Test + public void testFullAcidTableWithOriginalFilesFails() + throws Exception + { + java.nio.file.Path tablePath = Files.createTempDirectory("TestBackgroundHiveSplitLoader"); + Table table = table( + tablePath.toString(), + ImmutableList.of(), + Optional.empty(), + ImmutableMap.of("transactional", "true")); + + String originalFile = tablePath + "/000000_1"; + List filePaths = ImmutableList.of( + tablePath + "/delta_0000002_0000002_0000/_orc_acid_version", + tablePath + "/delta_0000002_0000002_0000/bucket_00000"); + + try { + for (String path : filePaths) { + File file = new File(path); + assertTrue(file.getParentFile().exists() || file.getParentFile().mkdirs(), "Failed creating directory " + file.getParentFile()); + createOrcAcidFile(file); + } + Files.write(Paths.get(originalFile), "test".getBytes(UTF_8)); + + // ValidWriteIdsList is of format $.
:::: + // This writeId list has high watermark transaction=3 + ValidReaderWriteIdList validWriteIdsList = new ValidReaderWriteIdList(format("4$%s.%s:3:9223372036854775807::", table.getDatabaseName(), table.getTableName())); + + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + createTestHdfsEnvironment(new HiveConfig()), + TupleDomain.all(), + Optional.empty(), + table, + Optional.empty(), + Optional.of(validWriteIdsList)); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + drain(hiveSplitSource); + assertThatThrownBy(() -> drain(hiveSplitSource)) + .isInstanceOfSatisfying(PrestoException.class, e -> { + assertEquals(NOT_SUPPORTED.toErrorCode(), e.getErrorCode()); + }); + } + catch (PrestoException e) { + assertEquals(NOT_SUPPORTED.toErrorCode(), e.getErrorCode(), "Unexpected exception " + e); + } + finally { + MoreFiles.deleteRecursively(tablePath, ALLOW_INSECURE); + } + } + + @Test + public void testFullAcidTableVacuumWithOpenTxns() + throws Exception + { + java.nio.file.Path tablePath = Files.createTempDirectory("TestBackgroundHiveSplitLoader"); + Table table = table( + tablePath.toString(), + ImmutableList.of(), + Optional.empty(), + ImmutableMap.of("transactional", "true")); + + List filePaths = ImmutableList.of( + tablePath + "/delta_0000001_0000001_0000/_orc_acid_version", + tablePath + "/delta_0000001_0000001_0000/bucket_00000", + tablePath + "/delta_0000002_0000002_0000/_orc_acid_version", + tablePath + "/delta_0000002_0000002_0000/bucket_00000", + tablePath + "/delta_0000003_0000003_0000/_orc_acid_version", + tablePath + "/delta_0000003_0000003_0000/bucket_00000"); + + try { + for (String path : filePaths) { + File file = new File(path); + assertTrue(file.getParentFile().exists() || file.getParentFile().mkdirs(), "Failed creating directory " + file.getParentFile()); + createOrcAcidFile(file); + } + + // ValidWriteIdsList is of format $.
:::: + // This writeId list has high watermark transaction=3 + ValidReaderWriteIdList validWriteIdsList = new ValidReaderWriteIdList(format("4$%s.%s:1:2::", table.getDatabaseName(), table.getTableName())); + + ImmutableMap queryInfo = ImmutableMap.of( + "FULL", false, + "vacuumHandle", new ConnectorVacuumTableHandle() {}); + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + createTestHdfsEnvironment(new HiveConfig()), + TupleDomain.all(), + Optional.empty(), + table, + Optional.empty(), + Optional.of(validWriteIdsList), + Optional.of(QueryType.VACUUM), + queryInfo); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + List splits = drainSplits(hiveSplitSource); + assertEquals(1, splits.size()); + } + finally { + MoreFiles.deleteRecursively(tablePath, ALLOW_INSECURE); + } + } + + @Test + public void testHive2VersionedFullAcidTableFails() + throws Exception + { + java.nio.file.Path tablePath = Files.createTempDirectory("TestBackgroundHiveSplitLoader"); + Table table = table( + tablePath.toString(), + ImmutableList.of(), + Optional.empty(), + ImmutableMap.of("transactional", "true")); + + List filePaths = ImmutableList.of( + tablePath + "/000000_1", // _orc_acid_version does not exist so it's assumed to be "ORC ACID version 0" + tablePath + "/delta_0000002_0000002_0000/bucket_00000"); + + for (String path : filePaths) { + File file = new File(path); + assertTrue(file.getParentFile().exists() || file.getParentFile().mkdirs(), "Failed creating directory " + file.getParentFile()); + createOrcAcidFile(file); + } + + // ValidWriteIdsList is of format $.
:::: + // This writeId list has high watermark transaction=3 + ValidReaderWriteIdList validWriteIdsList = new ValidReaderWriteIdList(format("4$%s.%s:3:9223372036854775807::", table.getDatabaseName(), table.getTableName())); + + BackgroundHiveSplitLoader backgroundHiveSplitLoader = backgroundHiveSplitLoader( + createTestHdfsEnvironment(new HiveConfig()), + TupleDomain.all(), + Optional.empty(), + table, + Optional.empty(), + Optional.of(validWriteIdsList)); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + assertThatThrownBy(() -> drain(hiveSplitSource)) + .isInstanceOfSatisfying(PrestoException.class, e -> assertEquals(NOT_SUPPORTED.toErrorCode(), e.getErrorCode())) + .hasMessage("Hive transactional tables are supported with Hive 3.0 and only after a major compaction has been run"); + + MoreFiles.deleteRecursively(tablePath, ALLOW_INSECURE); + } + + @Test + public void testPartitionedTableWithDynamicFilter() + throws Exception + { + TypeManager typeManager = new TestingTypeManager(); + List hivePartitionMetadatas = + ImmutableList.of( + new HivePartitionMetadata( + new HivePartition(new SchemaTableName("testSchema", "table_name")), + Optional.of(new Partition( + "testSchema", + "table_name", + ImmutableList.of("1"), + TABLE_STORAGE, + ImmutableList.of(TABLE_COLUMN), + ImmutableMap.of("param", "value"))), + ImmutableMap.of())); + + ConnectorSession connectorSession = new TestingConnectorSession( + new HiveSessionProperties( + new HiveConfig() + .setMaxSplitSize(new DataSize(1.0, GIGABYTE)) + .setDynamicFilterPartitionFilteringEnabled(true), + new OrcFileWriterConfig(), + new ParquetFileWriterConfig()).getSessionProperties()); + + BackgroundHiveSplitLoader backgroundHiveSplitLoader = new BackgroundHiveSplitLoader( + PARTITIONED_TABLE, + hivePartitionMetadatas, + TupleDomain.all(), + BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(Optional.empty(), Optional.empty()), + connectorSession, + new TestingHdfsEnvironment(TEST_FILES), + new NamenodeStats(), + new CachingDirectoryLister(new HiveConfig()), + directExecutor(), + 2, + false, + Optional.empty(), + createTestDynamicFilterSupplier("partitionColumn", ImmutableList.of(0L, 2L, 3L)), + Optional.empty(), + ImmutableMap.of(), + typeManager); + + HiveSplitSource hiveSplitSource = hiveSplitSource(backgroundHiveSplitLoader); + backgroundHiveSplitLoader.start(hiveSplitSource); + + List splits = drainSplits(hiveSplitSource); + assertEquals(splits.size(), 0, "Splits should be filtered"); + } + + private static List drain(HiveSplitSource source) + throws Exception + { + return drainSplits(source).stream() + .map(HiveSplit::getPath) + .collect(toImmutableList()); + } + + private static List drainSplits(HiveSplitSource source) + throws Exception + { + ImmutableList.Builder splits = ImmutableList.builder(); + while (!source.isFinished()) { + source.getNextBatch(NOT_PARTITIONED, 100).get() + .getSplits().stream() + .map(HiveSplitWrapper::getOnlyHiveSplit) + .forEach(splits::add); + } + return splits.build(); + } + + private static BackgroundHiveSplitLoader backgroundHiveSplitLoader( + List files, + TupleDomain tupleDomain) + { + return backgroundHiveSplitLoader( + files, + tupleDomain, + Optional.empty(), + SIMPLE_TABLE, + Optional.empty()); + } + + private static BackgroundHiveSplitLoader backgroundHiveSplitLoader( + List files, + TupleDomain compactEffectivePredicate, + Optional hiveBucketFilter, + Table table, + Optional bucketHandle) + { + return 
backgroundHiveSplitLoader( + files, + compactEffectivePredicate, + hiveBucketFilter, + table, + bucketHandle, + Optional.empty()); + } + + private static BackgroundHiveSplitLoader backgroundHiveSplitLoader( + List files, + TupleDomain compactEffectivePredicate, + Optional hiveBucketFilter, + Table table, + Optional bucketHandle, + Optional validWriteIds) + { + return backgroundHiveSplitLoader( + new TestingHdfsEnvironment(files), + compactEffectivePredicate, + hiveBucketFilter, + table, + bucketHandle, + validWriteIds); + } + + private static BackgroundHiveSplitLoader backgroundHiveSplitLoader( + HdfsEnvironment hdfsEnvironment, + TupleDomain compactEffectivePredicate, + Optional hiveBucketFilter, + Table table, + Optional bucketHandle, + Optional validWriteIds) + { + return backgroundHiveSplitLoader(hdfsEnvironment, compactEffectivePredicate, hiveBucketFilter, table, bucketHandle, validWriteIds, Optional.empty(), Collections.emptyMap()); + } + + private static BackgroundHiveSplitLoader backgroundHiveSplitLoader( + HdfsEnvironment hdfsEnvironment, + TupleDomain compactEffectivePredicate, + Optional hiveBucketFilter, + Table table, + Optional bucketHandle, + Optional validWriteIds, + Optional queryType, Map queryInfo) + { + List hivePartitionMetadatas = + ImmutableList.of( + new HivePartitionMetadata( + new HivePartition(new SchemaTableName("testSchema", "table_name")), + Optional.empty(), + ImmutableMap.of())); + + ConnectorSession connectorSession = new TestingConnectorSession( + new HiveSessionProperties(new HiveConfig().setMaxSplitSize(new DataSize(1.0, GIGABYTE)), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + return new BackgroundHiveSplitLoader( + table, + hivePartitionMetadatas, + compactEffectivePredicate, + BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(bucketHandle, hiveBucketFilter), + connectorSession, + hdfsEnvironment, + new NamenodeStats(), + new CachingDirectoryLister(new HiveConfig()), + EXECUTOR, + 2, + false, + validWriteIds, + null, + queryType, + queryInfo, null); + } + + private static BackgroundHiveSplitLoader backgroundHiveSplitLoader(List files, DirectoryLister directoryLister) + { + List hivePartitionMetadatas = ImmutableList.of( + new HivePartitionMetadata( + new HivePartition(new SchemaTableName("testSchema", "table_name")), + Optional.empty(), + ImmutableMap.of())); + + ConnectorSession connectorSession = new TestingConnectorSession( + new HiveSessionProperties(new HiveConfig().setMaxSplitSize(new DataSize(1.0, GIGABYTE)), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + return new BackgroundHiveSplitLoader( + SIMPLE_TABLE, + hivePartitionMetadatas, + TupleDomain.none(), + Optional.empty(), + connectorSession, + new TestingHdfsEnvironment(files), + new NamenodeStats(), + directoryLister, + EXECUTOR, + 2, + false, + Optional.empty(), + null, + Optional.empty(), + Collections.emptyMap(), null); + } + + private static BackgroundHiveSplitLoader backgroundHiveSplitLoaderOfflinePartitions() + { + ConnectorSession connectorSession = new TestingConnectorSession( + new HiveSessionProperties(new HiveConfig().setMaxSplitSize(new DataSize(1.0, GIGABYTE)), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + return new BackgroundHiveSplitLoader( + SIMPLE_TABLE, + createPartitionMetadataWithOfflinePartitions(), + TupleDomain.all(), + BackgroundHiveSplitLoader.BucketSplitInfo.createBucketSplitInfo(Optional.empty(), Optional.empty()), + 
connectorSession, + new TestingHdfsEnvironment(TEST_FILES), + new NamenodeStats(), + new CachingDirectoryLister(new HiveConfig()), + directExecutor(), + 2, + false, + Optional.empty(), + null, + Optional.empty(), + Collections.emptyMap(), null); + } + + private static Iterable createPartitionMetadataWithOfflinePartitions() + throws RuntimeException + { + return () -> new AbstractIterator() + { + // This iterator is crafted to return a valid partition for the first calls to + // hasNext() and next(), and then it should throw for the second call to hasNext() + private int position = -1; + + @Override + protected HivePartitionMetadata computeNext() + { + position++; + switch (position) { + case 0: + return new HivePartitionMetadata( + new HivePartition(new SchemaTableName("testSchema", "table_name")), + Optional.empty(), + ImmutableMap.of()); + case 1: + throw new RuntimeException("OFFLINE"); + default: + return endOfData(); + } + } + }; + } + + private static HiveSplitSource hiveSplitSource(HiveSplitLoader hiveSplitLoader) + { + return HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + SIMPLE_TABLE.getDatabaseName(), + SIMPLE_TABLE.getTableName(), + 1, + 1, + new DataSize(32, MEGABYTE), + Integer.MAX_VALUE, + hiveSplitLoader, + EXECUTOR, + new CounterStat(), + null, + null, null, new HiveConfig(), + HiveStorageFormat.ORC); + } + + private static Table table( + List partitionColumns, + Optional bucketProperty, + ImmutableMap tableParameters) + { + return table(partitionColumns, + bucketProperty, + tableParameters, + StorageFormat.create( + "com.facebook.hive.orc.OrcSerde", + "org.apache.hadoop.hive.ql.io.RCFileInputFormat", + "org.apache.hadoop.hive.ql.io.RCFileInputFormat")); + } + + private static Table table( + String location, + List partitionColumns, + Optional bucketProperty, + ImmutableMap tableParameters) + { + return table(location, + partitionColumns, + bucketProperty, + tableParameters, + StorageFormat.create( + "com.facebook.hive.orc.OrcSerde", + "org.apache.hadoop.hive.ql.io.RCFileInputFormat", + "org.apache.hadoop.hive.ql.io.RCFileInputFormat")); + } + + private static Table table( + List partitionColumns, + Optional bucketProperty, + Map tableParameters, + StorageFormat storageFormat) + { + return table("hdfs://VOL1:9000/db_name/table_name", + partitionColumns, + bucketProperty, + tableParameters, + storageFormat); + } + + private static Table table( + String location, + List partitionColumns, + Optional bucketProperty, + Map tableParameters, + StorageFormat storageFormat) + { + Table.Builder tableBuilder = Table.builder(); + tableBuilder.getStorageBuilder() + .setStorageFormat( + StorageFormat.create( + "com.facebook.hive.orc.OrcSerde", + "org.apache.hadoop.hive.ql.io.RCFileInputFormat", + "org.apache.hadoop.hive.ql.io.RCFileInputFormat")) + .setLocation(location) + .setSkewed(false) + .setBucketProperty(bucketProperty); + + return tableBuilder + .setDatabaseName("test_dbname") + .setOwner("testOwner") + .setTableName("test_table") + .setTableType(TableType.MANAGED_TABLE.toString()) + .setDataColumns(ImmutableList.of(new Column("col1", HIVE_STRING, Optional.empty()))) + .setParameters(tableParameters) + .setPartitionColumns(partitionColumns) + .build(); + } + + private static LocatedFileStatus locatedFileStatus(Path path) + { + return new LocatedFileStatus( + 0L, + false, + 0, + 0L, + 0L, + 0L, + null, + null, + null, + null, + path, + new BlockLocation[] {new BlockLocation()}); + } + + private static LocatedFileStatus locatedFileStatusWithNoBlocks(Path path) + { + return 
new LocatedFileStatus( + 0L, + false, + 0, + 0L, + 0L, + 0L, + null, + null, + null, + null, + path, + new BlockLocation[] {}); + } + + public static class TestingHdfsEnvironment + extends HdfsEnvironment + { + private final List files; + + public TestingHdfsEnvironment(List files) + { + super( + new HiveHdfsConfiguration(new HdfsConfigurationInitializer(new HiveConfig()), ImmutableSet.of()), + new HiveConfig(), + new NoHdfsAuthentication()); + this.files = ImmutableList.copyOf(files); + } + + @Override + public FileSystem getFileSystem(String user, Path path, Configuration configuration) + { + return new TestingHdfsFileSystem(files); + } + } + + private static class TestingHdfsFileSystem + extends FileSystem + { + private final List files; + + public TestingHdfsFileSystem(List files) + { + this.files = ImmutableList.copyOf(files); + } + + @Override + public boolean delete(Path f, boolean recursive) + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean rename(Path src, Path dst) + { + throw new UnsupportedOperationException(); + } + + @Override + public void setWorkingDirectory(Path dir) + { + throw new UnsupportedOperationException(); + } + + @Override + public FileStatus[] listStatus(Path f) + { + throw new UnsupportedOperationException(); + } + + @Override + public RemoteIterator listLocatedStatus(Path f) + { + return new RemoteIterator() + { + private final Iterator iterator = files.iterator(); + + @Override + public boolean hasNext() + throws IOException + { + return iterator.hasNext(); + } + + @Override + public LocatedFileStatus next() + throws IOException + { + return iterator.next(); + } + }; + } + + @Override + public FSDataOutputStream create( + Path f, + FsPermission permission, + boolean overwrite, + int bufferSize, + short replication, + long blockSize, + Progressable progress) + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean mkdirs(Path f, FsPermission permission) + { + throw new UnsupportedOperationException(); + } + + @Override + public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) + { + throw new UnsupportedOperationException(); + } + + @Override + public FSDataInputStream open(Path f, int bufferSize) + { + throw new UnsupportedOperationException(); + } + + @Override + public FileStatus getFileStatus(Path f) + { + throw new UnsupportedOperationException(); + } + + @Override + public Path getWorkingDirectory() + { + throw new UnsupportedOperationException(); + } + + @Override + public URI getUri() + { + throw new UnsupportedOperationException(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestColumnTypeCacheable.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestColumnTypeCacheable.java new file mode 100644 index 00000000..5d124faf --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestColumnTypeCacheable.java @@ -0,0 +1,779 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.slice.Slices; +import io.airlift.stats.CounterStat; +import io.airlift.units.DataSize; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorSplitSource; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.testing.TestingConnectorSession; +import org.testng.annotations.Test; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.Executors; + +import static io.airlift.concurrent.MoreFutures.getFutureValue; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.CharType.createCharType; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DecimalType.createDecimalType; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static org.testng.Assert.assertEquals; + +public class TestColumnTypeCacheable +{ + @Test + public void testDecimalCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", createDecimalType(4, 2)); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(createDecimalType(4, 2), BigDecimal.valueOf(10.88d).unscaledValue().longValue()))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(createDecimalType(4, 2), BigDecimal.valueOf(20.56d).unscaledValue().longValue())))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "10.88", 2, "22.22", 3, "20.56", 4) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 
10); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 6); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "22.22"))).count(), 0); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "10.88"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "20.56"))).count(), 4); + } + + @Test + public void testBooleanCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", BOOLEAN); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(BOOLEAN, true))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(BOOLEAN, false)))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "false", 2, "true", 3) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 6); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 5); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "true"))).count(), 3); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "false"))).count(), 2); + } + + @Test + public void testTinyintCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", TINYINT); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(TINYINT, 100L))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(TINYINT, 101L)))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), 
+ new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "102", 2, "100", 3, "101", 2) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 8); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 5); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "100"))).count(), 3); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "101"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "102"))).count(), 0); + } + + @Test + public void testSmallintCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", SMALLINT); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(SMALLINT, 20001L))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(SMALLINT, 20002L)))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "20000", 2, "20001", 3, "20002", 2) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 8); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 5); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "20001"))).count(), 3); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "20002"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "20000"))).count(), 0); + } + 
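+    // The remaining *Cacheable tests repeat the same pattern: build a HiveSplitSource with two cache
+    // predicates on the pt_d partition column, enqueue splits for several partition values (plus
+    // __HIVE_DEFAULT_PARTITION__), and assert that only splits whose pt_d value matches one of the
+    // predicates are reported as cacheable.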
+ @Test + public void testIntegerCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", INTEGER); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(INTEGER, 8001L))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(INTEGER, 8002L)))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "8001", 2, "8002", 3, "8000", 2) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 8); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 5); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "8002"))).count(), 3); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "8001"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "8000"))).count(), 0); + } + + @Test + public void testBigintCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", BIGINT); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(BIGINT, 20200522L))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(BIGINT, 20200521L)))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "20200520", 2, "20200521", 3, "20200522", 2) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = 
getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 8); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 5); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "20200521"))).count(), 3); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "20200522"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "20200520"))).count(), 0); + } + + @Test + public void testRealTypeCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", REAL); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(REAL, (long) Float.floatToRawIntBits(1.0f)))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(REAL, (long) Float.floatToRawIntBits(1000.10f))))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "1.0", 2, "2", 3, "1000.10", 4) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 10); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 6); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "1.0"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "2"))).count(), 0); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "1000.10"))).count(), 4); + } + + @Test + public void testDoubleTypeCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", DOUBLE); + Set> cachePredicates = ImmutableSet.of( + 
TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(DOUBLE, 1.0d))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(DOUBLE, 1000.10d)))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "1.0", 2, "2", 3, "1000.10", 4) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 10); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 6); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "1.0"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "2"))).count(), 0); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "1000.10"))).count(), 4); + } + + @Test + public void testDateCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", DATE); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(DATE, HiveUtil.parseHiveDate("1995-10-09")))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(DATE, HiveUtil.parseHiveDate("1995-11-14"))))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "1995-10-09", 2, "2020-07-22", 3, "1995-11-14", 4) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 10); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 6); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", 
"1995-10-09"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "2020-07-22"))).count(), 0); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "1995-11-14"))).count(), 4); + } + + @Test + public void testTimestampCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", TIMESTAMP); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(TIMESTAMP, HiveUtil.parseHiveTimestamp("1995-10-09 00:00:00")))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(TIMESTAMP, HiveUtil.parseHiveTimestamp("1995-11-14 00:00:00"))))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "1995-10-09 00:00:00", 2, "2020-07-22 00:00:00", 3, "1995-11-14 00:00:00", 4) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 10); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 6); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "1995-10-09 00:00:00"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "2020-07-22 00:00:00"))).count(), 0); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "1995-11-14 00:00:00"))).count(), 4); + } + + @Test + public void testVarcharCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", VARCHAR); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(VARCHAR, Slices.utf8Slice("abc")))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(VARCHAR, Slices.utf8Slice("xyz"))))); + HiveSplitSource hiveSplitSource = 
HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "abc", 2, "def", 3, "xyz", 4) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 10); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 6); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "abc"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "def"))).count(), 0); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "xyz"))).count(), 4); + } + + @Test + public void testCharCacheable() + { + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ColumnMetadata ptdMetadata = new ColumnMetadata("pt_d", createCharType(3)); + Set> cachePredicates = ImmutableSet.of( + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(createCharType(3), Slices.utf8Slice("abc")))), + TupleDomain.withColumnDomains(ImmutableMap.of(ptdMetadata, Domain.singleValue(createCharType(3), Slices.utf8Slice("xyz"))))); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + cachePredicates, null, + new HiveConfig(), + HiveStorageFormat.ORC); + int[] idPrefix = new int[] {1}; + ImmutableMap + .of("__HIVE_DEFAULT_PARTITION__", 1, "abc", 2, "def", 3, "xyz", 4) + .forEach((ptdValue, splitCount) -> { + for (int i = 1; i <= splitCount; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(idPrefix[0] * 10 + i, ImmutableList.of(new HivePartitionKey("pt_d", ptdValue)), "pt_d=" + ptdValue)); + } + idPrefix[0] = idPrefix[0] + 1; + }); + List splits = getSplits(hiveSplitSource, 10); + assertEquals(splits.size(), 10); + assertEquals(splits.stream().filter(ConnectorSplit::isCacheable).count(), 6); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "abc"))).count(), 2); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", 
"def"))).count(), 0); + assertEquals(splits.stream() + .filter(ConnectorSplit::isCacheable) + .map(HiveSplitWrapper::getOnlyHiveSplit) + .filter(hiveSplit -> hiveSplit + .getPartitionKeys() + .contains(new HivePartitionKey("pt_d", "xyz"))).count(), 4); + } + + private static class TestingHiveSplitLoader + implements HiveSplitLoader + { + @Override + public void start(HiveSplitSource splitSource) + { + } + + @Override + public void stop() + { + } + } + + private static class TestPartitionSplit + extends InternalHiveSplit + { + private TestPartitionSplit(int id, List partitionKeys, String partitionName) + { + this(id, partitionKeys, partitionName, OptionalInt.empty()); + } + + private TestPartitionSplit(int id, List partitionKeys, String partitionName, OptionalInt bucketNumber) + { + super( + partitionName, + "path", + 0, + 100, + 100, + 0, + properties("id", String.valueOf(id)), + partitionKeys, + ImmutableList.of(new InternalHiveBlock(0, 100, ImmutableList.of())), + bucketNumber, + true, + false, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + Optional.empty(), + ImmutableMap.of()); + } + + private static Properties properties(String key, String value) + { + Properties properties = new Properties(); + properties.put(key, value); + return properties; + } + } + + private static List getSplits(ConnectorSplitSource source, int maxSize) + { + return getSplits(source, OptionalInt.empty(), maxSize); + } + + private static List getSplits(ConnectorSplitSource source, OptionalInt bucketNumber, int maxSize) + { + if (bucketNumber.isPresent()) { + return getFutureValue(source.getNextBatch(new HivePartitionHandle(bucketNumber.getAsInt()), maxSize)).getSplits(); + } + else { + return getFutureValue(source.getNextBatch(NOT_PARTITIONED, maxSize)).getSplits(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestFileSystemCache.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestFileSystemCache.java new file mode 100644 index 00000000..e6b223fb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestFileSystemCache.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.authentication.ImpersonatingHdfsAuthentication; +import io.prestosql.plugin.hive.authentication.SimpleHadoopAuthentication; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.testng.annotations.Test; + +import java.io.IOException; + +import static org.testng.Assert.assertNotSame; +import static org.testng.Assert.assertSame; + +public class TestFileSystemCache +{ + @Test + public void testFileSystemCache() + throws IOException + { + ImpersonatingHdfsAuthentication auth = new ImpersonatingHdfsAuthentication(new SimpleHadoopAuthentication()); + HdfsEnvironment environment = + new HdfsEnvironment( + new HiveHdfsConfiguration(new HdfsConfigurationInitializer(new HiveConfig()), ImmutableSet.of()), + new HiveConfig(), + auth); + FileSystem fs1 = getFileSystem(environment, "user"); + FileSystem fs2 = getFileSystem(environment, "user"); + assertSame(fs1, fs2); + + FileSystem fs3 = getFileSystem(environment, "other_user"); + assertNotSame(fs1, fs3); + + FileSystem fs4 = getFileSystem(environment, "other_user"); + assertSame(fs3, fs4); + } + + private FileSystem getFileSystem(HdfsEnvironment environment, String user) + throws IOException + { + return environment.getFileSystem(user, new Path("/"), new Configuration(false)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBooleanParser.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBooleanParser.java new file mode 100644 index 00000000..5e27a7c4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBooleanParser.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import org.testng.annotations.Test; + +import static io.prestosql.plugin.hive.HiveBooleanParser.parseHiveBoolean; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +public class TestHiveBooleanParser +{ + @Test + public void testParse() + { + assertTrue(parseBoolean("true")); + assertTrue(parseBoolean("TRUE")); + assertTrue(parseBoolean("tRuE")); + + assertFalse(parseBoolean("false")); + assertFalse(parseBoolean("FALSE")); + assertFalse(parseBoolean("fAlSe")); + + assertNull(parseBoolean("true ")); + assertNull(parseBoolean(" true")); + assertNull(parseBoolean("false ")); + assertNull(parseBoolean(" false")); + assertNull(parseBoolean("t")); + assertNull(parseBoolean("f")); + assertNull(parseBoolean("")); + assertNull(parseBoolean("blah")); + } + + private static Boolean parseBoolean(String s) + { + return parseHiveBoolean(s.getBytes(US_ASCII), 0, s.length()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBucketing.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBucketing.java new file mode 100644 index 00000000..65e6a77e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveBucketing.java @@ -0,0 +1,337 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import io.airlift.slice.Slices; +import io.prestosql.plugin.hive.HiveBucketing.BucketingVersion; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.type.StandardTypes; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Map; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V1; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V2; +import static io.prestosql.plugin.hive.HiveBucketing.getBucketHashCode; +import static io.prestosql.spi.type.TypeUtils.writeNativeValue; +import static java.lang.Double.longBitsToDouble; +import static java.lang.Float.intBitsToFloat; +import static java.util.Arrays.asList; +import static java.util.Map.Entry; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.timestampTypeInfo; +import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; +import static org.testng.Assert.assertEquals; + +public class TestHiveBucketing +{ + @Test + public void testHashingCompare() + { + assertBucketEquals("boolean", null, 0, 0); + assertBucketEquals("boolean", true, 1, 1); + assertBucketEquals("boolean", false, 0, 0); + + assertBucketEquals("tinyint", null, 0, 0); + assertBucketEquals("tinyint", (byte) 5, 5, 5); + assertBucketEquals("tinyint", Byte.MIN_VALUE, -128, -128); + assertBucketEquals("tinyint", Byte.MAX_VALUE, 127, 127); + + assertBucketEquals("smallint", null, 0, 0); + assertBucketEquals("smallint", (short) 300, 300, 2107031704); + assertBucketEquals("smallint", Short.MIN_VALUE, -32768, 1342976838); + assertBucketEquals("smallint", Short.MAX_VALUE, 32767, -684075052); + + assertBucketEquals("int", null, 0, 0); + assertBucketEquals("int", 300_000, 300000, -678663480); + assertBucketEquals("int", Integer.MIN_VALUE, -2147483648, 1194881028); + assertBucketEquals("int", Integer.MAX_VALUE, 2147483647, 1133859967); + + assertBucketEquals("bigint", null, 0, 0); + assertBucketEquals("bigint", 300_000_000_000L, -647710651, -888935297); + assertBucketEquals("bigint", Long.MIN_VALUE, -2147483648, 1728983947); + assertBucketEquals("bigint", Long.MAX_VALUE, -2147483648, -536577852); + + assertBucketEquals("float", null, 0, 0); + assertBucketEquals("float", 12.34F, 1095069860, -381747602); + assertBucketEquals("float", -Float.MAX_VALUE, -8388609, 470252243); + assertBucketEquals("float", Float.MIN_VALUE, 1, 1206721797); + assertBucketEquals("float", Float.POSITIVE_INFINITY, 2139095040, -292175804); + assertBucketEquals("float", Float.NEGATIVE_INFINITY, -8388608, -1433270801); + assertBucketEquals("float", Float.NaN, 2143289344, 
-480354314); + assertBucketEquals("float", intBitsToFloat(0xffc00000), 2143289344, -480354314); // also a NaN + assertBucketEquals("float", intBitsToFloat(0x7fc00000), 2143289344, -480354314); // also a NaN + assertBucketEquals("float", intBitsToFloat(0x7fc01234), 2143289344, -480354314); // also a NaN + assertBucketEquals("float", intBitsToFloat(0xffc01234), 2143289344, -480354314); // also a NaN + + assertBucketEquals("double", null, 0, 0); + assertBucketEquals("double", 12.34, 986311098, -2070733568); + assertBucketEquals("double", -Double.MAX_VALUE, 1048576, 14392725); + assertBucketEquals("double", Double.MIN_VALUE, 1, -8838199); + assertBucketEquals("double", Double.POSITIVE_INFINITY, 2146435072, 1614292060); + assertBucketEquals("double", Double.NEGATIVE_INFINITY, -1048576, 141388605); + assertBucketEquals("double", Double.NaN, 2146959360, 1138026565); + assertBucketEquals("double", longBitsToDouble(0xfff8000000000000L), 2146959360, 1138026565); // also a NaN + assertBucketEquals("double", longBitsToDouble(0x7ff8123412341234L), 2146959360, 1138026565); // also a NaN + assertBucketEquals("double", longBitsToDouble(0xfff8123412341234L), 2146959360, 1138026565); // also a NaN + + assertBucketEquals("varchar(15)", null, 0, 0); + assertBucketEquals("varchar(15)", "", 1, -965378730); + assertBucketEquals("varchar(15)", "test string", -189841218, -138301454); + assertBucketEquals("varchar(15)", "\u5f3a\u5927\u7684Hetu\u5f15\u64ce", 481023052, 1436831192); // 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0) + assertBucketEquals("varchar(15)", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -457487557, -697348811); // 4 code points: 20FFC - 20FFF. 4-byte UTF-8 sequences in Supplementary Plane 2 + assertBucketEquals("string", null, 0, 0); + assertBucketEquals("string", "", 0, -965378730); + assertBucketEquals("string", "test string", -318923937, -138301454); + assertBucketEquals("string", "\u5f3a\u5927\u7684Hetu\u5f15\u64ce", 889847277, 1436831192); // 3-byte UTF-8 sequences (in Basic Plane, i.e. Plane 0) + assertBucketEquals("string", "\uD843\uDFFC\uD843\uDFFD\uD843\uDFFE\uD843\uDFFF", -1810797254, -697348811); // 4 code points: 20FFC - 20FFF. 
4-byte UTF-8 sequences in Supplementary Plane 2 + + assertBucketEquals("char(6)", null, 0, 0); + assertBucketEquals("char(6)", "", 1, -965378730); + assertBucketEquals("char(6)", "test_1", 10333957, 1284522943); + + assertBucketEquals("date", null, 0, 0); + assertBucketEquals("date", Date.valueOf("1970-01-01"), 0, 1362653161); + assertBucketEquals("date", Date.valueOf("2015-11-19"), 16758, 8542395); + assertBucketEquals("date", Date.valueOf("1950-11-19"), -6983, -431619185); + + for (BucketingVersion version : BucketingVersion.values()) { + List typeInfos = ImmutableList.of(timestampTypeInfo); + + assertThatThrownBy(() -> getBucketHashCode(version, typeInfos, new Object[]{0})) + .hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP"); + TimestampType timestampType = TimestampType.TIMESTAMP; + BlockBuilder builder = timestampType.createBlockBuilder(null, 1); + timestampType.writeLong(builder, 0); + Page page = new Page(builder.build()); + + assertThatThrownBy(() -> getBucketHashCode(version, typeInfos, page, 0)) + .hasMessage("Computation of Hive bucket hashCode is not supported for Hive primitive category: TIMESTAMP"); + } + + assertBucketEquals("array", null, 0, 0); + assertBucketEquals("array", ImmutableList.of(), 0, 0); + assertBucketEquals("array", ImmutableList.of((short) 5, (short) 8, (short) 13), 5066, -905011156); + assertBucketEquals("array", ImmutableList.of("test1", "test2", "test3", "test4"), 957612994, 1305539282); + assertBucketEquals("array>", ImmutableList.of(ImmutableList.of(10L, 20L), ImmutableList.of(-10L, -20L), asList((Object) null)), 326368, 611324477); + + assertBucketEquals("map", null, 0, 0); + assertBucketEquals("map", ImmutableMap.of(), 0, 0); + assertBucketEquals("map", ImmutableMap.of("key", 123L, "key2", 123456789L, "key3", -123456L), 127880789, -1910999650); + + assertBucketEquals("map,map>", ImmutableMap.of(ImmutableList.of(12.3, 45.7), ImmutableMap.of(123, "test99")), -34001111, -1565874874); + + // multiple bucketing columns + assertBucketEquals( + ImmutableList.of("float", "array", "map"), + ImmutableList.of(12.34F, ImmutableList.of((short) 5, (short) 8, (short) 13), ImmutableMap.of("key", 123L)), + 95411006, + 932898434); + assertBucketEquals( + ImmutableList.of("double", "array", "boolean", "map", "tinyint"), + asList(null, ImmutableList.of((short) 5, (short) 8, (short) 13), null, ImmutableMap.of("key", 123L), null), + 154207826, + -1120812524); + } + + private static void assertBucketEquals(String hiveTypeString, Object hiveValue, int expectedHashCodeV1, int expectedHashCodeV2) + { + assertBucketEquals(hiveTypeString, hiveValue, BUCKETING_V1, expectedHashCodeV1); + assertBucketEquals(hiveTypeString, hiveValue, BUCKETING_V2, expectedHashCodeV2); + } + + private static void assertBucketEquals(String hiveTypeString, Object hiveValue, BucketingVersion bucketingVersion, int expectedHashCode) + { + // Use asList to allow nulls + assertBucketEquals(ImmutableList.of(hiveTypeString), asList(hiveValue), bucketingVersion, expectedHashCode); + } + + private static void assertBucketEquals(List hiveTypeStrings, List hiveValues, int expectedHashCodeV1, int expectedHashCodeV2) + { + assertBucketEquals(hiveTypeStrings, hiveValues, BUCKETING_V1, expectedHashCodeV1); + assertBucketEquals(hiveTypeStrings, hiveValues, BUCKETING_V2, expectedHashCodeV2); + } + + private static void assertBucketEquals(List hiveTypeStrings, List hiveValues, BucketingVersion bucketingVersion, int expectedHashCode) + { + List hiveTypes = 
hiveTypeStrings.stream() + .map(HiveType::valueOf) + .collect(toImmutableList()); + List hiveTypeInfos = hiveTypes.stream() + .map(HiveType::getTypeInfo) + .collect(toImmutableList()); + + assertEquals(computePresto(bucketingVersion, hiveTypeStrings, hiveValues, hiveTypes, hiveTypeInfos), expectedHashCode); + assertEquals(computeHive(bucketingVersion, hiveTypeStrings, hiveValues, hiveTypeInfos), expectedHashCode); + + for (int bucketCount : new int[] {1, 2, 500, 997}) { + int actual = HiveBucketing.getBucketNumber(expectedHashCode, bucketCount); + int expected = ObjectInspectorUtils.getBucketNumber(expectedHashCode, bucketCount); + assertEquals(actual, expected, "bucketCount " + bucketCount); + } + } + + private static int computeHive(BucketingVersion bucketingVersion, List hiveTypeStrings, List hiveValues, List hiveTypeInfos) + { + ImmutableList.Builder> columnBindingsBuilder = ImmutableList.builder(); + for (int i = 0; i < hiveTypeStrings.size(); i++) { + Object javaValue = hiveValues.get(i); + + columnBindingsBuilder.add(Maps.immutableEntry( + TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(hiveTypeInfos.get(i)), + javaValue)); + } + return getHiveBucketHashCode(bucketingVersion, columnBindingsBuilder.build()); + } + + private static int computePresto(BucketingVersion bucketingVersion, List hiveTypeStrings, List hiveValues, List hiveTypes, List hiveTypeInfos) + { + ImmutableList.Builder blockListBuilder = ImmutableList.builder(); + Object[] nativeContainerValues = new Object[hiveValues.size()]; + for (int i = 0; i < hiveTypeStrings.size(); i++) { + Object hiveValue = hiveValues.get(i); + Type type = hiveTypes.get(i).getType(HiveTestUtils.TYPE_MANAGER); + + BlockBuilder blockBuilder = type.createBlockBuilder(null, 3); + // prepend 2 nulls to make sure position is respected when HiveBucketing function + blockBuilder.appendNull(); + blockBuilder.appendNull(); + appendToBlockBuilder(type, hiveValue, blockBuilder); + Block block = blockBuilder.build(); + blockListBuilder.add(block); + + nativeContainerValues[i] = toNativeContainerValue(type, hiveValue); + } + ImmutableList blockList = blockListBuilder.build(); + int result1 = getBucketHashCode(bucketingVersion, hiveTypeInfos, new Page(blockList.toArray(new Block[blockList.size()])), 2); + int result2 = getBucketHashCode(bucketingVersion, hiveTypeInfos, nativeContainerValues); + assertEquals(result1, result2, "overloads of getBucketHashCode produced different result"); + return result1; + } + + public static int getHiveBucketHashCode(BucketingVersion bucketingVersion, List> columnBindings) + { + ObjectInspector[] objectInspectors = new ObjectInspector[columnBindings.size()]; + Object[] objects = new Object[columnBindings.size()]; + + int i = 0; + for (Entry entry : columnBindings) { + objectInspectors[i] = entry.getKey(); + if (entry.getValue() != null && entry.getKey() instanceof JavaHiveVarcharObjectInspector) { + JavaHiveVarcharObjectInspector varcharObjectInspector = (JavaHiveVarcharObjectInspector) entry.getKey(); + objects[i] = new HiveVarchar(((String) entry.getValue()), varcharObjectInspector.getMaxLength()); + } + else { + objects[i] = entry.getValue(); + } + i++; + } + + switch (bucketingVersion) { + case BUCKETING_V1: + @SuppressWarnings("deprecation") + int hashCodeOld = ObjectInspectorUtils.getBucketHashCodeOld(objects, objectInspectors); + return hashCodeOld; + case BUCKETING_V2: + return ObjectInspectorUtils.getBucketHashCode(objects, objectInspectors); + default: + throw new IllegalArgumentException("Unsupported 
bucketing version: " + bucketingVersion); + } + } + + private static Object toNativeContainerValue(Type type, Object hiveValue) + { + String typeBase = type.getTypeSignature().getBase(); + if (hiveValue == null) { + return null; + } + switch (typeBase) { + case StandardTypes.ARRAY: { + BlockBuilder blockBuilder = type.createBlockBuilder(null, 1); + BlockBuilder subBlockBuilder = blockBuilder.beginBlockEntry(); + for (Object subElement : (Iterable) hiveValue) { + appendToBlockBuilder(type.getTypeParameters().get(0), subElement, subBlockBuilder); + } + blockBuilder.closeEntry(); + return type.getObject(blockBuilder, 0); + } + case StandardTypes.ROW: { + BlockBuilder blockBuilder = type.createBlockBuilder(null, 1); + BlockBuilder subBlockBuilder = blockBuilder.beginBlockEntry(); + int field = 0; + for (Object subElement : (Iterable) hiveValue) { + appendToBlockBuilder(type.getTypeParameters().get(field), subElement, subBlockBuilder); + field++; + } + blockBuilder.closeEntry(); + return type.getObject(blockBuilder, 0); + } + case StandardTypes.MAP: { + BlockBuilder blockBuilder = type.createBlockBuilder(null, 1); + BlockBuilder subBlockBuilder = blockBuilder.beginBlockEntry(); + for (Entry entry : ((Map) hiveValue).entrySet()) { + appendToBlockBuilder(type.getTypeParameters().get(0), entry.getKey(), subBlockBuilder); + appendToBlockBuilder(type.getTypeParameters().get(1), entry.getValue(), subBlockBuilder); + } + blockBuilder.closeEntry(); + return type.getObject(blockBuilder, 0); + } + case StandardTypes.BOOLEAN: + return hiveValue; + case StandardTypes.TINYINT: + return (long) (byte) hiveValue; + case StandardTypes.SMALLINT: + return (long) (short) hiveValue; + case StandardTypes.INTEGER: + return (long) (int) hiveValue; + case StandardTypes.BIGINT: + return hiveValue; + case StandardTypes.REAL: + return (long) Float.floatToRawIntBits((float) hiveValue); + case StandardTypes.DOUBLE: + return hiveValue; + case StandardTypes.VARCHAR: + return Slices.utf8Slice(hiveValue.toString()); + case StandardTypes.CHAR: + return Slices.utf8Slice(hiveValue.toString()); + case StandardTypes.DATE: + long daysSinceEpochInLocalZone = ((Date) hiveValue).toEpochDay(); + assertEquals(daysSinceEpochInLocalZone, DateWritableV2.dateToDays((Date) hiveValue)); + return daysSinceEpochInLocalZone; + default: + throw new IllegalArgumentException("Unsupported bucketing type: " + type); + } + } + + private static void appendToBlockBuilder(Type type, Object hiveValue, BlockBuilder blockBuilder) + { + writeNativeValue(type, blockBuilder, toNativeContainerValue(type, hiveValue)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveColumnHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveColumnHandle.java new file mode 100644 index 00000000..d1dc66c3 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveColumnHandle.java @@ -0,0 +1,72 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive;
+
+import io.airlift.json.JsonCodec;
+import io.prestosql.spi.type.StandardTypes;
+import org.testng.annotations.Test;
+
+import java.util.Optional;
+
+import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY;
+import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR;
+import static io.prestosql.spi.type.TypeSignature.parseTypeSignature;
+import static org.testng.Assert.assertEquals;
+
+public class TestHiveColumnHandle
+{
+    private final JsonCodec<HiveColumnHandle> codec = JsonCodec.jsonCodec(HiveColumnHandle.class);
+
+    @Test
+    public void testHiddenColumn()
+    {
+        HiveColumnHandle hiddenColumn = HiveColumnHandle.pathColumnHandle();
+        testRoundTrip(hiddenColumn);
+    }
+
+    @Test
+    public void testRegularColumn()
+    {
+        HiveColumnHandle expectedRegularColumn = new HiveColumnHandle("name", HiveType.HIVE_FLOAT, parseTypeSignature(StandardTypes.DOUBLE), 88, REGULAR, Optional.empty());
+        testRoundTrip(expectedRegularColumn);
+    }
+
+    @Test
+    public void testPartitionKeyColumn()
+    {
+        HiveColumnHandle expectedPartitionColumn = new HiveColumnHandle("name", HiveType.HIVE_FLOAT, parseTypeSignature(StandardTypes.DOUBLE), 88, PARTITION_KEY, Optional.empty());
+        testRoundTrip(expectedPartitionColumn);
+    }
+
+    @Test
+    public void testRequiredColumn()
+    {
+        HiveColumnHandle expectedRegularColumn = new HiveColumnHandle("name", HiveType.HIVE_FLOAT, parseTypeSignature(StandardTypes.DOUBLE), 88, REGULAR, Optional.empty(), false);
+        testRoundTrip(expectedRegularColumn);
+        HiveColumnHandle expectedPartitionColumn = new HiveColumnHandle("name", HiveType.HIVE_FLOAT, parseTypeSignature(StandardTypes.DOUBLE), 88, PARTITION_KEY, Optional.empty(), true);
+        testRoundTrip(expectedPartitionColumn);
+    }
+
+    private void testRoundTrip(HiveColumnHandle expected)
+    {
+        String json = codec.toJson(expected);
+        HiveColumnHandle actual = codec.fromJson(json);
+
+        assertEquals(actual.getName(), expected.getName());
+        assertEquals(actual.getHiveType(), expected.getHiveType());
+        assertEquals(actual.getHiveColumnIndex(), expected.getHiveColumnIndex());
+        assertEquals(actual.isPartitionKey(), expected.isPartitionKey());
+        assertEquals(actual.isRequired(), expected.isRequired());
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConfig.java
new file mode 100644
index 00000000..dedb2ae6
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConfig.java
@@ -0,0 +1,441 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.net.HostAndPort; +import io.airlift.configuration.testing.ConfigAssertions; +import io.airlift.units.DataSize; +import io.airlift.units.DataSize.Unit; +import io.airlift.units.Duration; +import io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode; +import io.prestosql.plugin.hive.s3.S3FileSystemType; +import org.testng.annotations.Test; + +import java.util.Map; +import java.util.TimeZone; +import java.util.concurrent.TimeUnit; + +import static io.airlift.units.DataSize.Unit.GIGABYTE; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.plugin.hive.HiveConfig.MIN_OFFLOAD_FACTOR; +import static io.prestosql.plugin.hive.HiveConfig.MIN_OFFLOAD_ROW_NUM; +import static io.prestosql.plugin.hive.TestHiveUtil.nonDefaultTimeZone; + +public class TestHiveConfig +{ + @Test + public void testDefaults() + { + ConfigAssertions.assertRecordedDefaults(ConfigAssertions.recordDefaults(HiveConfig.class) + .setMaxSplitSize(new DataSize(64, Unit.MEGABYTE)) + .setMaxPartitionsPerScan(100_000) + .setMaxOutstandingSplits(1_000) + .setMaxOutstandingSplitsSize(new DataSize(256, Unit.MEGABYTE)) + .setMaxSplitIteratorThreads(1_000) + .setAllowCorruptWritesForTesting(false) + .setMetastoreCacheTtl(new Duration(0, TimeUnit.SECONDS)) + .setMetastoreRefreshInterval(new Duration(1, TimeUnit.SECONDS)) + .setMetastoreDBCacheTtl(new Duration(0, TimeUnit.SECONDS)) + .setMetastoreDBRefreshInterval(new Duration(1, TimeUnit.SECONDS)) + .setMetastoreCacheMaximumSize(10000) + .setPerTransactionMetastoreCacheMaximumSize(1000) + .setMaxMetastoreRefreshThreads(100) + .setMetastoreSocksProxy(null) + .setMetastoreTimeout(new Duration(10, TimeUnit.SECONDS)) + .setMinPartitionBatchSize(10) + .setMaxPartitionBatchSize(100) + .setMaxInitialSplits(200) + .setMaxInitialSplitSize(new DataSize(32, Unit.MEGABYTE)) + .setSplitLoaderConcurrency(4) + .setMaxSplitsPerSecond(null) + .setDomainCompactionThreshold(100) + .setWriterSortBufferSize(new DataSize(64, Unit.MEGABYTE)) + .setForceLocalScheduling(false) + .setMaxConcurrentFileRenames(20) + .setRecursiveDirWalkerEnabled(false) + .setDfsTimeout(new Duration(60, TimeUnit.SECONDS)) + .setIpcPingInterval(new Duration(10, TimeUnit.SECONDS)) + .setDfsConnectTimeout(new Duration(500, TimeUnit.MILLISECONDS)) + .setDfsKeyProviderCacheTtl(new Duration(30, TimeUnit.MINUTES)) + .setDfsConnectMaxRetries(5) + .setVerifyChecksum(true) + .setDomainSocketPath(null) + .setS3FileSystemType(S3FileSystemType.PRESTO) + .setResourceConfigFiles("") + .setHiveStorageFormat(HiveStorageFormat.ORC) + .setHiveCompressionCodec(HiveCompressionCodec.GZIP) + .setRespectTableFormat(true) + .setImmutablePartitions(false) + .setCreateEmptyBucketFiles(false) + .setSortedWritingEnabled(true) + .setMaxPartitionsPerWriter(100) + .setMaxOpenSortFiles(50) + .setWriteValidationThreads(16) + .setTextMaxLineLength(new DataSize(100, Unit.MEGABYTE)) + .setOrcLegacyTimeZone(TimeZone.getDefault().getID()) + .setParquetTimeZone(TimeZone.getDefault().getID()) + .setUseParquetColumnNames(false) + .setFailOnCorruptedParquetStatistics(true) + .setParquetMaxReadBlockSize(new DataSize(16, Unit.MEGABYTE)) + .setUseOrcColumnNames(false) + .setAssumeCanonicalPartitionKeys(false) + .setOrcBloomFiltersEnabled(false) + .setOrcDefaultBloomFilterFpp(0.05) + .setOrcMaxMergeDistance(new DataSize(1, Unit.MEGABYTE)) + .setOrcMaxBufferSize(new DataSize(8, 
Unit.MEGABYTE)) + .setOrcStreamBufferSize(new DataSize(8, Unit.MEGABYTE)) + .setOrcTinyStripeThreshold(new DataSize(1, Unit.BYTE)) + .setOrcMaxReadBlockSize(new DataSize(16, Unit.MEGABYTE)) + .setOrcFileTailCacheEnabled(false).setOrcFileTailCacheTtl(new Duration(4, TimeUnit.HOURS)).setOrcFileTailCacheLimit(50_000) + .setOrcStripeFooterCacheEnabled(false).setOrcStripeFooterCacheTtl(new Duration(4, TimeUnit.HOURS)).setOrcStripeFooterCacheLimit(250_000) + .setOrcRowIndexCacheEnabled(false).setOrcRowIndexCacheTtl(new Duration(4, TimeUnit.HOURS)).setOrcRowIndexCacheLimit(250_000) + .setOrcBloomFiltersCacheEnabled(false).setOrcBloomFiltersCacheTtl(new Duration(4, TimeUnit.HOURS)).setOrcBloomFiltersCacheLimit(250_000) + .setOrcRowDataCacheEnabled(false).setOrcRowDataCacheTtl(new Duration(4, TimeUnit.HOURS)).setOrcRowDataCacheMaximumWeight(new DataSize(20, GIGABYTE)) + .setOrcLazyReadSmallRanges(true) + .setRcfileTimeZone(TimeZone.getDefault().getID()) + .setRcfileWriterValidate(false) + .setOrcWriteLegacyVersion(false) + .setOrcWriterValidationPercentage(0.0) + .setOrcWriterValidationMode(OrcWriteValidationMode.BOTH) + .setHiveMetastoreAuthenticationType(HiveConfig.HiveMetastoreAuthenticationType.NONE) + .setHdfsAuthenticationType(HiveConfig.HdfsAuthenticationType.NONE) + .setHdfsImpersonationEnabled(false) + .setSkipDeletionForAlter(false) + .setSkipTargetCleanupOnRollback(false) + .setBucketExecutionEnabled(true) + .setFileSystemMaxCacheSize(1000) + .setTableStatisticsEnabled(true) + .setOptimizeMismatchedBucketCount(false) + .setWritesToNonManagedTablesEnabled(false) + .setCreatesOfNonManagedTablesEnabled(true) + .setHdfsWireEncryptionEnabled(false) + .setPartitionStatisticsSampleSize(100) + .setIgnoreCorruptedStatistics(false) + .setRecordingPath(null) + .setRecordingDuration(new Duration(10, TimeUnit.MINUTES)) + .setReplay(false) + .setCollectColumnStatisticsOnWrite(true) + .setS3SelectPushdownEnabled(false) + .setS3SelectPushdownMaxConnections(500) + .setTemporaryStagingDirectoryEnabled(true) + .setTemporaryStagingDirectoryPath("/tmp/presto-${USER}") + .setFileStatusCacheExpireAfterWrite(new Duration(24, TimeUnit.HOURS)) + .setFileStatusCacheMaxSize(1000 * 1000) + .setFileStatusCacheTables("") + .setHiveTransactionHeartbeatInterval(null) + .setHiveTransactionHeartbeatThreads(5) + .setTableCreatesWithLocationAllowed(true) + .setTlsEnabled(false) + .setDynamicFilterPartitionFilteringEnabled(true) + .setDynamicFilteringRowFilteringThreshold(2000) + .setOrcCacheStatsMetricCollectionEnabled(false) + .setVacuumCleanupRecheckInterval(new Duration(5, TimeUnit.MINUTES)) + .setVacuumServiceThreads(2) + .setMetastoreClientServiceThreads(4) + .setVacuumDeltaNumThreshold(10) + .setAutoVacuumEnabled(false) + .setVacuumDeltaPercentThreshold(0.1) + .setOrcPredicatePushdownEnabled(false) + .setVacuumCollectorInterval(new Duration(5, TimeUnit.MINUTES)) + .setMaxSplitsToGroup(1) + .setWorkerMetaStoreCacheEnabled(false) + .setAggregatorOffloadEnabled(true) + .setFilterOffloadEnabled(true) + .setMinAggregatorOffloadFactor(MIN_OFFLOAD_FACTOR) + .setMinFilterOffloadFactor(MIN_OFFLOAD_FACTOR) + .setMinOffloadRowNumber(MIN_OFFLOAD_ROW_NUM) + .setOmniDataEnabled(false) + .setOmniDataSslPkiDir("") + .setOmniDataSslEnabled(false) + .setOmniDataSslClientCertFilePath("") + .setOmniDataSslCrlFilePath("") + .setOmniDataSslPrivateKeyFilePath("") + .setOmniDataSslTrustCertFilePath("") + .setMetastoreWriteBatchSize(8)); + } + + @Test + public void testExplicitPropertyMappings() + { + Map properties = new 
ImmutableMap.Builder() + .put("hive.max-split-size", "256MB") + .put("hive.max-partitions-per-scan", "123") + .put("hive.max-outstanding-splits", "10") + .put("hive.max-outstanding-splits-size", "32MB") + .put("hive.max-split-iterator-threads", "10") + .put("hive.allow-corrupt-writes-for-testing", "true") + .put("hive.metastore-cache-ttl", "2h") + .put("hive.metastore-refresh-interval", "30m") + .put("hive.metastore-db-cache-ttl", "2h") + .put("hive.metastore-db-refresh-interval", "30m") + .put("hive.metastore-cache-maximum-size", "5000") + .put("hive.per-transaction-metastore-cache-maximum-size", "500") + .put("hive.metastore-refresh-max-threads", "2500") + .put("hive.metastore.thrift.client.socks-proxy", "localhost:1080") + .put("hive.metastore-timeout", "20s") + .put("hive.metastore.partition-batch-size.min", "1") + .put("hive.metastore.partition-batch-size.max", "1000") + .put("hive.dfs.ipc-ping-interval", "34s") + .put("hive.dfs-timeout", "33s") + .put("hive.dfs.connect.timeout", "20s") + .put("hive.dfs.key-provider.cache-ttl", "42s") + .put("hive.dfs.connect.max-retries", "10") + .put("hive.dfs.verify-checksum", "false") + .put("hive.dfs.domain-socket-path", "/foo") + .put("hive.dynamic-filter-partition-filtering", "false") + .put("hive.dynamic-filtering-row-filtering-threshold", "10000") + .put("hive.s3-file-system-type", "EMRFS") + .put("hive.config.resources", "/foo.xml,/bar.xml") + .put("hive.max-initial-splits", "10") + .put("hive.max-initial-split-size", "16MB") + .put("hive.split-loader-concurrency", "1") + .put("hive.max-splits-per-second", "1") + .put("hive.domain-compaction-threshold", "42") + .put("hive.writer-sort-buffer-size", "13MB") + .put("hive.recursive-directories", "true") + .put("hive.storage-format", "SEQUENCEFILE") + .put("hive.compression-codec", "NONE") + .put("hive.respect-table-format", "false") + .put("hive.immutable-partitions", "true") + .put("hive.create-empty-bucket-files", "true") + .put("hive.max-partitions-per-writers", "222") + .put("hive.max-open-sort-files", "333") + .put("hive.write-validation-threads", "11") + .put("hive.force-local-scheduling", "true") + .put("hive.max-concurrent-file-renames", "100") + .put("hive.assume-canonical-partition-keys", "true") + .put("hive.text.max-line-length", "13MB") + .put("hive.orc.time-zone", nonDefaultTimeZone().getID()) + .put("hive.parquet.time-zone", nonDefaultTimeZone().getID()) + .put("hive.parquet.use-column-names", "true") + .put("hive.parquet.fail-on-corrupted-statistics", "false") + .put("hive.parquet.max-read-block-size", "66kB") + .put("hive.orc.use-column-names", "true") + .put("hive.orc.bloom-filters.enabled", "true") + .put("hive.orc.default-bloom-filter-fpp", "0.96") + .put("hive.orc.max-merge-distance", "22kB") + .put("hive.orc.max-buffer-size", "44kB") + .put("hive.orc.stream-buffer-size", "55kB") + .put("hive.orc.tiny-stripe-threshold", "61kB") + .put("hive.orc.max-read-block-size", "66kB") + .put("hive.orc.file-tail.cache.enabled", "true") + .put("hive.orc.file-tail.cache.ttl", "1h") + .put("hive.orc.file-tail.cache.limit", "100") + .put("hive.orc.stripe-footer.cache.enabled", "true") + .put("hive.orc.stripe-footer.cache.ttl", "1h") + .put("hive.orc.stripe-footer.cache.limit", "100") + .put("hive.orc.row-index.cache.enabled", "true") + .put("hive.orc.row-index.cache.ttl", "1h") + .put("hive.orc.row-index.cache.limit", "100") + .put("hive.orc.bloom-filters.cache.enabled", "true") + .put("hive.orc.bloom-filters.cache.ttl", "1h") + .put("hive.orc.bloom-filters.cache.limit", "100") + 
.put("hive.orc.row-data.block.cache.enabled", "true") + .put("hive.orc.row-data.block.cache.ttl", "1h") + .put("hive.orc.row-data.block.cache.max.weight", "1MB") + .put("hive.orc.lazy-read-small-ranges", "false") + .put("hive.rcfile.time-zone", nonDefaultTimeZone().getID()) + .put("hive.rcfile.writer.validate", "true") + .put("hive.orc.writer.use-legacy-version-number", "true") + .put("hive.orc.writer.validation-percentage", "0.16") + .put("hive.orc.writer.validation-mode", "DETAILED") + .put("hive.metastore.authentication.type", "KERBEROS") + .put("hive.hdfs.authentication.type", "KERBEROS") + .put("hive.hdfs.impersonation.enabled", "true") + .put("hive.skip-deletion-for-alter", "true") + .put("hive.skip-target-cleanup-on-rollback", "true") + .put("hive.bucket-execution", "false") + .put("hive.sorted-writing", "false") + .put("hive.fs.cache.max-size", "1010") + .put("hive.table-statistics-enabled", "false") + .put("hive.optimize-mismatched-bucket-count", "true") + .put("hive.non-managed-table-writes-enabled", "true") + .put("hive.non-managed-table-creates-enabled", "false") + .put("hive.hdfs.wire-encryption.enabled", "true") + .put("hive.partition-statistics-sample-size", "1234") + .put("hive.ignore-corrupted-statistics", "true") + .put("hive.metastore-recording-path", "/foo/bar") + .put("hive.metastore-recording-duration", "42s") + .put("hive.replay-metastore-recording", "true") + .put("hive.collect-column-statistics-on-write", "false") + .put("hive.s3select-pushdown.enabled", "true") + .put("hive.s3select-pushdown.max-connections", "1234") + .put("hive.temporary-staging-directory-enabled", "false") + .put("hive.temporary-staging-directory-path", "updated") + .put("hive.file-status-cache-tables", "foo.bar1, foo.bar2") + .put("hive.file-status-cache-size", "1000") + .put("hive.file-status-cache-expire-time", "30m") + .put("hive.transaction-heartbeat-interval", "10s") + .put("hive.transaction-heartbeat-threads", "10") + .put("hive.metastore.thrift.client.ssl.enabled", "true") + .put("hive.table-creates-with-location-allowed", "false") + .put("hive.orc-cache-stats-metric-collection.enabled", "true") + .put("hive.vacuum-cleanup-recheck-interval", "10m") + .put("hive.vacuum-service-threads", "5") + .put("hive.metastore-client-service-threads", "5") + .put("hive.vacuum-delta-num-threshold", "5") + .put("hive.vacuum-delta-percent-threshold", "0.6") + .put("hive.auto-vacuum-enabled", "true") + .put("hive.orc-predicate-pushdown-enabled", "true") + .put("hive.vacuum-collector-interval", "5s") + .put("hive.max-splits-to-group", "20") + .put("hive.worker-metastore-cache-enabled", "true") + .put("hive.metastore-write-batch-size", "64") + .put("hive.aggregator-offload-enabled", "false") + .put("hive.filter-offload-enabled", "false") + .put("hive.min-aggregator-offload-factor", "0.2") + .put("hive.min-filter-offload-factor", "0.3") + .put("hive.min-offload-row-number", "100") + .put("hive.omnidata-enabled", "true") + .put("omni-data.ssl.enabled", "true") + .put("omni-data.ssl.pki.dir", "./") + .put("omni-data.ssl.client.cert.file.path", "./") + .put("omni-data.ssl.crl.file.path", "./") + .put("omni-data.ssl.private.key.file.path", "./") + .put("omni-data.ssl.trust.cert.file.path", "./") + .build(); + + HiveConfig expected = new HiveConfig() + .setMaxSplitSize(new DataSize(256, Unit.MEGABYTE)) + .setMaxPartitionsPerScan(123) + .setMaxOutstandingSplits(10) + .setMaxOutstandingSplitsSize(new DataSize(32, Unit.MEGABYTE)) + .setMaxSplitIteratorThreads(10) + .setAllowCorruptWritesForTesting(true) + 
.setMetastoreCacheTtl(new Duration(2, TimeUnit.HOURS)) + .setMetastoreRefreshInterval(new Duration(30, TimeUnit.MINUTES)) + .setMetastoreDBCacheTtl(new Duration(2, TimeUnit.HOURS)) + .setMetastoreDBRefreshInterval(new Duration(30, TimeUnit.MINUTES)) + .setMetastoreCacheMaximumSize(5000) + .setPerTransactionMetastoreCacheMaximumSize(500) + .setMaxMetastoreRefreshThreads(2500) + .setMetastoreSocksProxy(HostAndPort.fromParts("localhost", 1080)) + .setMetastoreTimeout(new Duration(20, TimeUnit.SECONDS)) + .setMinPartitionBatchSize(1) + .setMaxPartitionBatchSize(1000) + .setMaxInitialSplits(10) + .setMaxInitialSplitSize(new DataSize(16, Unit.MEGABYTE)) + .setSplitLoaderConcurrency(1) + .setMaxSplitsPerSecond(1) + .setDomainCompactionThreshold(42) + .setWriterSortBufferSize(new DataSize(13, Unit.MEGABYTE)) + .setForceLocalScheduling(true) + .setMaxConcurrentFileRenames(100) + .setRecursiveDirWalkerEnabled(true) + .setIpcPingInterval(new Duration(34, TimeUnit.SECONDS)) + .setDfsTimeout(new Duration(33, TimeUnit.SECONDS)) + .setDfsConnectTimeout(new Duration(20, TimeUnit.SECONDS)) + .setDfsKeyProviderCacheTtl(new Duration(42, TimeUnit.SECONDS)) + .setDfsConnectMaxRetries(10) + .setVerifyChecksum(false) + .setResourceConfigFiles(ImmutableList.of("/foo.xml", "/bar.xml")) + .setHiveStorageFormat(HiveStorageFormat.SEQUENCEFILE) + .setHiveCompressionCodec(HiveCompressionCodec.NONE) + .setRespectTableFormat(false) + .setImmutablePartitions(true) + .setCreateEmptyBucketFiles(true) + .setMaxPartitionsPerWriter(222) + .setMaxOpenSortFiles(333) + .setWriteValidationThreads(11) + .setDomainSocketPath("/foo") + .setS3FileSystemType(S3FileSystemType.EMRFS) + .setTextMaxLineLength(new DataSize(13, Unit.MEGABYTE)) + .setOrcLegacyTimeZone(nonDefaultTimeZone().getID()) + .setParquetTimeZone(nonDefaultTimeZone().getID()) + .setUseParquetColumnNames(true) + .setFailOnCorruptedParquetStatistics(false) + .setParquetMaxReadBlockSize(new DataSize(66, Unit.KILOBYTE)) + .setUseOrcColumnNames(true) + .setAssumeCanonicalPartitionKeys(true) + .setRcfileTimeZone(nonDefaultTimeZone().getID()) + .setOrcBloomFiltersEnabled(true) + .setOrcDefaultBloomFilterFpp(0.96) + .setOrcMaxMergeDistance(new DataSize(22, Unit.KILOBYTE)) + .setOrcMaxBufferSize(new DataSize(44, Unit.KILOBYTE)) + .setOrcStreamBufferSize(new DataSize(55, Unit.KILOBYTE)) + .setOrcTinyStripeThreshold(new DataSize(61, Unit.KILOBYTE)) + .setOrcMaxReadBlockSize(new DataSize(66, Unit.KILOBYTE)) + .setOrcFileTailCacheEnabled(true).setOrcFileTailCacheTtl(new Duration(1, TimeUnit.HOURS)).setOrcFileTailCacheLimit(100) + .setOrcStripeFooterCacheEnabled(true).setOrcStripeFooterCacheTtl(new Duration(1, TimeUnit.HOURS)).setOrcStripeFooterCacheLimit(100) + .setOrcRowIndexCacheEnabled(true).setOrcRowIndexCacheTtl(new Duration(1, TimeUnit.HOURS)).setOrcRowIndexCacheLimit(100) + .setOrcBloomFiltersCacheEnabled(true).setOrcBloomFiltersCacheTtl(new Duration(1, TimeUnit.HOURS)).setOrcBloomFiltersCacheLimit(100) + .setOrcRowDataCacheEnabled(true).setOrcRowDataCacheTtl(new Duration(1, TimeUnit.HOURS)).setOrcRowDataCacheMaximumWeight(new DataSize(1, MEGABYTE)) + .setOrcLazyReadSmallRanges(false) + .setRcfileTimeZone(nonDefaultTimeZone().getID()) + .setRcfileWriterValidate(true) + .setOrcWriteLegacyVersion(true) + .setOrcWriterValidationPercentage(0.16) + .setOrcWriterValidationMode(OrcWriteValidationMode.DETAILED) + .setHiveMetastoreAuthenticationType(HiveConfig.HiveMetastoreAuthenticationType.KERBEROS) + .setHdfsAuthenticationType(HiveConfig.HdfsAuthenticationType.KERBEROS) + 
.setHdfsImpersonationEnabled(true) + .setSkipDeletionForAlter(true) + .setSkipTargetCleanupOnRollback(true) + .setBucketExecutionEnabled(false) + .setSortedWritingEnabled(false) + .setFileSystemMaxCacheSize(1010) + .setTableStatisticsEnabled(false) + .setOptimizeMismatchedBucketCount(true) + .setWritesToNonManagedTablesEnabled(true) + .setCreatesOfNonManagedTablesEnabled(false) + .setHdfsWireEncryptionEnabled(true) + .setPartitionStatisticsSampleSize(1234) + .setIgnoreCorruptedStatistics(true) + .setRecordingPath("/foo/bar") + .setRecordingDuration(new Duration(42, TimeUnit.SECONDS)) + .setReplay(true) + .setCollectColumnStatisticsOnWrite(false) + .setS3SelectPushdownEnabled(true) + .setS3SelectPushdownMaxConnections(1234) + .setTemporaryStagingDirectoryEnabled(false) + .setTemporaryStagingDirectoryPath("updated") + .setFileStatusCacheTables("foo.bar1,foo.bar2") + .setFileStatusCacheMaxSize(1000) + .setFileStatusCacheExpireAfterWrite(new Duration(30, TimeUnit.MINUTES)) + .setHiveTransactionHeartbeatInterval(new Duration(10, TimeUnit.SECONDS)) + .setHiveTransactionHeartbeatThreads(10) + .setTableCreatesWithLocationAllowed(false) + .setTlsEnabled(true) + .setDynamicFilterPartitionFilteringEnabled(false) + .setDynamicFilteringRowFilteringThreshold(10000) + .setOrcCacheStatsMetricCollectionEnabled(true) + .setVacuumCleanupRecheckInterval(new Duration(10, TimeUnit.MINUTES)) + .setVacuumServiceThreads(5) + .setMetastoreClientServiceThreads(5) + .setVacuumDeltaNumThreshold(5) + .setAutoVacuumEnabled(true) + .setVacuumDeltaPercentThreshold(0.6) + .setOrcPredicatePushdownEnabled(true) + .setVacuumCollectorInterval(new Duration(5, TimeUnit.SECONDS)) + .setMaxSplitsToGroup(20) + .setWorkerMetaStoreCacheEnabled(true) + .setMetastoreWriteBatchSize(64) + .setAggregatorOffloadEnabled(false) + .setFilterOffloadEnabled(false) + .setOmniDataEnabled(true) + .setMinFilterOffloadFactor(0.3) + .setMinAggregatorOffloadFactor(0.2) + .setMinOffloadRowNumber(100) + .setOmniDataSslEnabled(true) + .setOmniDataSslPkiDir("./") + .setOmniDataSslClientCertFilePath("./") + .setOmniDataSslCrlFilePath("./") + .setOmniDataSslTrustCertFilePath("./") + .setOmniDataSslPrivateKeyFilePath("./"); + + ConfigAssertions.assertFullMapping(properties, expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConnectorFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConnectorFactory.java new file mode 100644 index 00000000..64a31fe2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveConnectorFactory.java @@ -0,0 +1,78 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.connector.Connector; +import io.prestosql.spi.connector.ConnectorPageSourceProvider; +import io.prestosql.spi.connector.ConnectorTransactionHandle; +import io.prestosql.spi.connector.classloader.ClassLoaderSafeConnectorMetadata; +import io.prestosql.spi.connector.classloader.ClassLoaderSafeConnectorSplitManager; +import io.prestosql.testing.TestingConnectorContext; +import org.testng.annotations.Test; + +import java.util.Map; +import java.util.Optional; + +import static io.airlift.testing.Assertions.assertContains; +import static io.airlift.testing.Assertions.assertInstanceOf; +import static io.prestosql.spi.transaction.IsolationLevel.READ_UNCOMMITTED; +import static org.testng.Assert.fail; + +public class TestHiveConnectorFactory +{ + @Test + public void testGetClient() + { + assertCreateConnector("thrift://localhost:1234"); + assertCreateConnector("thrift://localhost:1234,thrift://192.0.2.3:5678"); + + assertCreateConnectorFails("abc", "metastoreUri scheme is missing: abc"); + assertCreateConnectorFails("thrift://:8090", "metastoreUri host is missing: thrift://:8090"); + assertCreateConnectorFails("thrift://localhost", "metastoreUri port is missing: thrift://localhost"); + assertCreateConnectorFails("abc::", "metastoreUri scheme must be thrift: abc::"); + assertCreateConnectorFails("", "metastoreUris must specify at least one URI"); + assertCreateConnectorFails("thrift://localhost:1234,thrift://test-1", "metastoreUri port is missing: thrift://test-1"); + } + + private static void assertCreateConnector(String metastoreUri) + { + HiveConnectorFactory connectorFactory = new HiveConnectorFactory( + "hive-test", + HiveConnector.class.getClassLoader(), + Optional.empty()); + + Map config = ImmutableMap.builder() + .put("hive.metastore.uri", metastoreUri) + .build(); + + Connector connector = connectorFactory.create("hive-test", config, new TestingConnectorContext()); + ConnectorTransactionHandle transaction = connector.beginTransaction(READ_UNCOMMITTED, true); + assertInstanceOf(connector.getMetadata(transaction), ClassLoaderSafeConnectorMetadata.class); + assertInstanceOf(connector.getSplitManager(), ClassLoaderSafeConnectorSplitManager.class); + assertInstanceOf(connector.getPageSourceProvider(), ConnectorPageSourceProvider.class); + connector.commit(transaction); + } + + private static void assertCreateConnectorFails(String metastoreUri, String exceptionString) + { + try { + assertCreateConnector(metastoreUri); + fail("expected connector creation to fail:" + metastoreUri); + } + catch (RuntimeException e) { + assertContains(e.getMessage(), exceptionString); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDecimalParser.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDecimalParser.java new file mode 100644 index 00000000..907b71f2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDecimalParser.java @@ -0,0 +1,59 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive;
+
+import io.prestosql.spi.type.DecimalType;
+import org.testng.annotations.Test;
+
+import java.math.BigDecimal;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.testng.Assert.assertEquals;
+
+public class TestHiveDecimalParser
+{
+    @Test
+    public void testParseDecimal()
+    {
+        checkParseDecimal("3", 2, 1, new BigDecimal("3.0"));
+        checkParseDecimal("3.1", 2, 1, new BigDecimal("3.1"));
+
+        // rounding
+        checkParseDecimal("3.11", 2, 1, new BigDecimal("3.1"));
+        checkParseDecimal("3.16", 2, 1, new BigDecimal("3.2"));
+
+        // rounding of half (odd and even)
+        checkParseDecimal("3.15", 2, 1, new BigDecimal("3.2"));
+        checkParseDecimal("3.25", 2, 1, new BigDecimal("3.3"));
+
+        // negative
+        checkParseDecimal("-3", 2, 1, new BigDecimal("-3.0"));
+        checkParseDecimal("-3.1", 2, 1, new BigDecimal("-3.1"));
+
+        // negative rounding
+        checkParseDecimal("-3.11", 2, 1, new BigDecimal("-3.1"));
+        checkParseDecimal("-3.16", 2, 1, new BigDecimal("-3.2"));
+
+        // negative rounding of half (odd and even)
+        checkParseDecimal("-3.15", 2, 1, new BigDecimal("-3.2"));
+        checkParseDecimal("-3.25", 2, 1, new BigDecimal("-3.3"));
+    }
+
+    private void checkParseDecimal(String input, int precision, int scale, BigDecimal expected)
+    {
+        byte[] bytes = input.getBytes(US_ASCII);
+        BigDecimal parsed = HiveDecimalParser.parseHiveDecimal(bytes, 0, bytes.length, DecimalType.createDecimalType(precision, scale));
+        assertEquals(parsed, expected);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedAggregations.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedAggregations.java
new file mode 100644
index 00000000..b269534d
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedAggregations.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.tests.AbstractTestAggregations; + +import static io.airlift.tpch.TpchTable.getTables; +import static io.prestosql.plugin.hive.HiveQueryRunner.createQueryRunner; + +public class TestHiveDistributedAggregations + extends AbstractTestAggregations +{ + protected boolean supportsPushdown() + { + return true; + } + + public TestHiveDistributedAggregations() + { + super(() -> createQueryRunner(getTables())); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueries.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueries.java new file mode 100644 index 00000000..c6445a08 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueries.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.tests.AbstractTestJoinQueries; + +import static io.airlift.tpch.TpchTable.getTables; +import static io.prestosql.plugin.hive.HiveQueryRunner.createQueryRunner; + +public class TestHiveDistributedJoinQueries + extends AbstractTestJoinQueries +{ + public TestHiveDistributedJoinQueries() + { + super(() -> createQueryRunner(getTables())); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueriesWithDynamicFiltering.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueriesWithDynamicFiltering.java new file mode 100644 index 00000000..bc78c610 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedJoinQueriesWithDynamicFiltering.java @@ -0,0 +1,242 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.MoreCollectors; +import io.prestosql.Session; +import io.prestosql.operator.OperatorStats; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.QueryId; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.FixedPageSource; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilterFactory; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.plan.FilterNode; +import io.prestosql.spi.plan.PlanNodeId; +import io.prestosql.spi.plan.ProjectNode; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.type.StandardTypes; +import io.prestosql.spi.util.BloomFilter; +import io.prestosql.sql.analyzer.FeaturesConfig; +import io.prestosql.sql.planner.Plan; +import io.prestosql.sql.planner.optimizations.PlanNodeSearcher; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.tests.AbstractTestQueryFramework; +import io.prestosql.tests.DistributedQueryRunner; +import io.prestosql.tests.ResultWithQueryId; +import org.testng.annotations.Test; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.function.Supplier; + +import static io.airlift.testing.Assertions.assertGreaterThan; +import static io.airlift.testing.Assertions.assertLessThanOrEqual; +import static io.airlift.tpch.TpchTable.getTables; +import static io.prestosql.SystemSessionProperties.DYNAMIC_FILTERING_WAIT_TIME; +import static io.prestosql.SystemSessionProperties.ENABLE_DYNAMIC_FILTERING; +import static io.prestosql.SystemSessionProperties.JOIN_DISTRIBUTION_TYPE; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveQueryRunner.createQueryRunnerWithStateStore; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.plugin.hive.HiveTestUtils.createTestHdfsEnvironment; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveDataStreamFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveRecordCursorProvider; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveSelectiveFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getNoOpIndexCache; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +public class TestHiveDistributedJoinQueriesWithDynamicFiltering + extends AbstractTestQueryFramework +{ + public TestHiveDistributedJoinQueriesWithDynamicFiltering() + { + super(() -> createQueryRunnerWithStateStore(getTables())); + } + + @Override + protected Session getSession() + { + return Session.builder(super.getSession()) + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "true") + .setSystemProperty(DYNAMIC_FILTERING_WAIT_TIME, "2000ms") + .build(); + } + + @Test + public 
void testJoinWithEmptyBuildSide() + { + Session session = Session.builder(getSession()) + .setSystemProperty(JOIN_DISTRIBUTION_TYPE, FeaturesConfig.JoinDistributionType.BROADCAST.name()) + .build(); + DistributedQueryRunner runner = (DistributedQueryRunner) getQueryRunner(); + ResultWithQueryId<MaterializedResult> result = runner.executeWithQueryId( + session, + "SELECT * FROM lineitem JOIN orders ON lineitem.orderkey = orders.orderkey AND orders.totalprice = 123.4567"); + assertEquals(result.getResult().getRowCount(), 0); + + OperatorStats probeStats = searchScanFilterAndProjectOperatorStats(result.getQueryId(), "tpch:lineitem"); + // Probe-side is not scanned at all, due to dynamic filtering: + assertEquals(probeStats.getInputPositions(), 0L); + } + + @Test + public void testIsPartitionFiltered() + throws IOException + { + Properties schema = new Properties(); + + ImmutableList<HivePartitionKey> partitionKeys = ImmutableList.of(new HivePartitionKey("p1", "100"), new HivePartitionKey("p2", "101"), new HivePartitionKey("p3", "__HIVE_DEFAULT_PARTITION__")); + + HiveSplitWrapper split = HiveSplitWrapper.wrap(new HiveSplit("db", "table", "partitionId", "path", 0, 50, 50, 0, schema, partitionKeys, ImmutableList.of(), OptionalInt.empty(), false, ImmutableMap.of(), Optional.empty(), false, Optional.empty(), Optional.empty(), false, ImmutableMap.of())); + + List<Long> filterValues = ImmutableList.of(1L, 50L, 100L); + + HiveColumnHandle testColumnHandle = new HiveColumnHandle("p1", HIVE_INT, parseTypeSignature(StandardTypes.INTEGER), 0, PARTITION_KEY, Optional.empty()); + Supplier<List<Map<ColumnHandle, DynamicFilter>>> dynamicFilter = createDynamicFilterSupplier(filterValues, testColumnHandle, "filter1"); + Optional<DynamicFilterSupplier> dynamicFilterSupplier = Optional.of(new DynamicFilterSupplier(dynamicFilter, System.currentTimeMillis(), 10000)); + + HiveColumnHandle testColumnHandle2 = new HiveColumnHandle("p2", HIVE_INT, parseTypeSignature(StandardTypes.INTEGER), 0, PARTITION_KEY, Optional.empty()); + Supplier<List<Map<ColumnHandle, DynamicFilter>>> dynamicFilter2 = createDynamicFilterSupplier(filterValues, testColumnHandle2, "filter2"); + Optional<DynamicFilterSupplier> dynamicFilterSupplier2 = Optional.of(new DynamicFilterSupplier(dynamicFilter2, System.currentTimeMillis(), 10000)); + + HiveColumnHandle testColumnHandle3 = new HiveColumnHandle("p3", HIVE_INT, parseTypeSignature(StandardTypes.INTEGER), 0, PARTITION_KEY, Optional.empty()); + Supplier<List<Map<ColumnHandle, DynamicFilter>>> dynamicFilter3 = createDynamicFilterSupplier(filterValues, testColumnHandle3, "filter3"); + Optional<DynamicFilterSupplier> dynamicFilterSupplier3 = Optional.of(new DynamicFilterSupplier(dynamicFilter3, System.currentTimeMillis(), 10000)); + + HiveColumnHandle testColumnHandle4 = new HiveColumnHandle("p4", HIVE_INT, parseTypeSignature(StandardTypes.INTEGER), 0, PARTITION_KEY, Optional.empty()); + Supplier<List<Map<ColumnHandle, DynamicFilter>>> dynamicFilter4 = createDynamicFilterSupplier(filterValues, testColumnHandle4, "filter3"); + Optional<DynamicFilterSupplier> dynamicFilterSupplier4 = Optional.of(new DynamicFilterSupplier(dynamicFilter4, System.currentTimeMillis(), 0)); + + HiveConfig config = new HiveConfig(); + HivePageSourceProvider provider = new HivePageSourceProvider(config, createTestHdfsEnvironment(config), getDefaultHiveRecordCursorProvider(config), getDefaultHiveDataStreamFactories(config), TYPE_MANAGER, getNoOpIndexCache(), getDefaultHiveSelectiveFactories(config)); + + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + ConnectorTableHandle table = new HiveTableHandle("db", "table", ImmutableMap.of(), ImmutableList.of(), Optional.empty()); + 
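        // The four probes below cover a partition value contained in the dynamic filter (p1 = 100), a value excluded by it (p2 = 101), the default partition (p3), and a column that is not one of the split's partition keys (p4); only the excluded case should be pruned down to an empty FixedPageSource. +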
HiveTransactionHandle transaction = new HiveTransactionHandle(); + + try { + ConnectorPageSource result = provider.createPageSource(transaction, session, split, table, ImmutableList.of(testColumnHandle), dynamicFilterSupplier); + assertFalse(result instanceof FixedPageSource); + } + catch (Exception e) { + assertTrue(e instanceof PrestoException); + } + + try { + ConnectorPageSource result = provider.createPageSource(transaction, session, split, table, ImmutableList.of(testColumnHandle2), dynamicFilterSupplier2); + assertTrue(result instanceof FixedPageSource); + } + catch (Exception e) { + fail("A FixedPageSource object should have been created"); + } + + try { + ConnectorPageSource result = provider.createPageSource(transaction, session, split, table, ImmutableList.of(testColumnHandle3), dynamicFilterSupplier3); + assertFalse(result instanceof FixedPageSource); + } + catch (Exception e) { + assertTrue(e instanceof PrestoException); + } + + try { + ConnectorPageSource result = provider.createPageSource(transaction, session, split, table, ImmutableList.of(testColumnHandle4), dynamicFilterSupplier4); + assertFalse(result instanceof FixedPageSource); + } + catch (Exception e) { + assertTrue(e instanceof PrestoException); + } + } + + @Test + public void testJoinWithSelectiveBuildSide() + { + Session session = Session.builder(getSession()) + .setSystemProperty(JOIN_DISTRIBUTION_TYPE, FeaturesConfig.JoinDistributionType.BROADCAST.name()) + .build(); + DistributedQueryRunner runner = (DistributedQueryRunner) getQueryRunner(); + ResultWithQueryId<MaterializedResult> result = runner.executeWithQueryId( + session, + "SELECT * FROM lineitem JOIN orders ON lineitem.orderkey = orders.orderkey AND orders.custkey = 1"); + assertGreaterThan(result.getResult().getRowCount(), 0); + + OperatorStats probeStats = searchScanFilterAndProjectOperatorStats(result.getQueryId(), "tpch:lineitem"); + // Probe side may be partially scanned, depending on the drivers' scheduling: + assertLessThanOrEqual(probeStats.getInputPositions(), countRows("lineitem")); + } + + private OperatorStats searchScanFilterAndProjectOperatorStats(QueryId queryId, String tableName) + { + DistributedQueryRunner runner = (DistributedQueryRunner) getQueryRunner(); + Plan plan = runner.getQueryPlan(queryId); + PlanNodeId nodeId = PlanNodeSearcher.searchFrom(plan.getRoot()) + .where(node -> { + if (!(node instanceof ProjectNode)) { + return false; + } + ProjectNode projectNode = (ProjectNode) node; + FilterNode filterNode = (FilterNode) projectNode.getSource(); + TableScanNode tableScanNode = (TableScanNode) filterNode.getSource(); + return tableName.equals(tableScanNode.getTable().getConnectorHandle().toString()); + }) + .findOnlyElement() + .getId(); + return runner.getCoordinator() + .getQueryManager() + .getFullQueryInfo(queryId) + .getQueryStats() + .getOperatorSummaries() + .stream() + .filter(summary -> nodeId.equals(summary.getPlanNodeId())) + .collect(MoreCollectors.onlyElement()); + } + + private Long countRows(String tableName) + { + MaterializedResult result = getQueryRunner().execute("SELECT COUNT() FROM " + tableName); + return (Long) result.getOnlyValue(); + } + + private Supplier<List<Map<ColumnHandle, DynamicFilter>>> createDynamicFilterSupplier(List<Long> values, ColumnHandle columnHandle, String filterId) + throws IOException + { + BloomFilter filter = new BloomFilter(values.size(), 0.01); + for (Long value : values) { + filter.add(value); + } + ByteArrayOutputStream out = new ByteArrayOutputStream(); + filter.writeTo(out); + + DynamicFilter dynamicFilter = 
DynamicFilterFactory.create(filterId, columnHandle, out.toByteArray(), DynamicFilter.Type.GLOBAL); + + Map dynamicFilterMap = ImmutableMap.of(columnHandle, dynamicFilter); + return () -> ImmutableList.of(dynamicFilterMap); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedOrderByQueries.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedOrderByQueries.java new file mode 100644 index 00000000..0a17ab84 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedOrderByQueries.java @@ -0,0 +1,27 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.tests.AbstractTestOrderByQueries; + +import static io.airlift.tpch.TpchTable.getTables; + +public class TestHiveDistributedOrderByQueries + extends AbstractTestOrderByQueries +{ + public TestHiveDistributedOrderByQueries() + { + super(() -> HiveQueryRunner.createQueryRunner(getTables())); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedQueries.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedQueries.java new file mode 100644 index 00000000..51c63295 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedQueries.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.testing.MaterializedResult; +import io.prestosql.tests.AbstractTestDistributedQueries; +import org.testng.annotations.Test; + +import static com.google.common.collect.Iterables.getOnlyElement; +import static io.airlift.tpch.TpchTable.getTables; +import static io.prestosql.sql.tree.ExplainType.Type.LOGICAL; +import static org.testng.Assert.assertEquals; + +public class TestHiveDistributedQueries + extends AbstractTestDistributedQueries +{ + public TestHiveDistributedQueries() + { + super(() -> HiveQueryRunner.createQueryRunner(getTables())); + } + + @Override + @Test + public void testDelete() + { + // Hive connector currently does not support row-by-row delete + } + + protected boolean supportsPushdown() + { + return true; + } + + @Test + public void testExplainOfCreateTableAs() + { + String query = "CREATE TABLE copy_orders AS SELECT * FROM orders"; + MaterializedResult result = computeActual("EXPLAIN " + query); + assertEquals(getOnlyElement(result.getOnlyColumnAsSet()), getExplainPlan(query, LOGICAL)); + } + + // Hive specific tests should normally go in TestHiveIntegrationSmokeTest +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedStarTreeQueries.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedStarTreeQueries.java new file mode 100644 index 00000000..22e57f3c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedStarTreeQueries.java @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; +import io.airlift.testing.mysql.TestingMySqlServer; +import io.airlift.tpch.TpchTable; +import io.prestosql.tests.AbstractTestStarTreeQueries; +import io.prestosql.tests.DistributedQueryRunner; +import org.testng.annotations.AfterClass; +import org.testng.annotations.Test; + +import java.util.Optional; +import java.util.Random; + +import static io.airlift.tpch.TpchTable.getTables; + +@Test +public class TestHiveDistributedStarTreeQueries + extends AbstractTestStarTreeQueries +{ + private final TestingMySqlServer mysqlServer; + + public TestHiveDistributedStarTreeQueries() + throws Exception + { + this(createTestingMySqlServer()); + } + + public TestHiveDistributedStarTreeQueries(TestingMySqlServer mysqlServer) + { + super(() -> createQueryRunnerWithMetaStore(getTables(), mysqlServer)); + this.mysqlServer = mysqlServer; + } + + private static TestingMySqlServer createTestingMySqlServer() + throws Exception + { + return new TestingMySqlServer("user", "testpass", "cube_meta_store_" + new Random().nextInt(10000000)); + } + + public static DistributedQueryRunner createQueryRunnerWithMetaStore(Iterable<TpchTable<?>> tables, TestingMySqlServer mySqlServer) + throws Exception + { + return HiveQueryRunner.createQueryRunner(tables, ImmutableMap.of(), "sql-standard", ImmutableMap.of(), Optional.empty(), false, mySqlServer.getJdbcUrl(Iterables.getOnlyElement(mySqlServer.getDatabases()))); + } + + @AfterClass(alwaysRun = true) + public final void destroy() + { + mysqlServer.close(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedWindowQueries.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedWindowQueries.java new file mode 100644 index 00000000..7d025a7d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveDistributedWindowQueries.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.tests.AbstractTestWindowQueries; + +import static io.airlift.tpch.TpchTable.getTables; +import static io.prestosql.plugin.hive.HiveQueryRunner.createQueryRunner; + +public class TestHiveDistributedWindowQueries + extends AbstractTestWindowQueries +{ + public TestHiveDistributedWindowQueries() + { + super(() -> createQueryRunner(getTables())); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileBasedSecurity.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileBasedSecurity.java new file mode 100644 index 00000000..31defd6d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileBasedSecurity.java @@ -0,0 +1,70 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.Session; +import io.prestosql.spi.security.Identity; +import io.prestosql.testing.QueryRunner; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.Optional; + +import static io.airlift.tpch.TpchTable.NATION; +import static io.prestosql.testing.TestingSession.testSessionBuilder; + +public class TestHiveFileBasedSecurity +{ + private QueryRunner queryRunner; + + @BeforeClass + public void setUp() + throws Exception + { + String path = this.getClass().getResource("security.json").getPath(); + queryRunner = HiveQueryRunner.createQueryRunner(ImmutableList.of(NATION), ImmutableMap.of(), "file", ImmutableMap.of("security.config-file", path), Optional.empty(), false); + } + + @AfterClass(alwaysRun = true) + public void tearDown() + { + queryRunner.close(); + queryRunner = null; + } + + @Test + public void testAdminCanRead() + { + Session admin = getSession("hive"); + queryRunner.execute(admin, "SELECT * FROM nation"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = ".*Access Denied: Cannot select from table tpch.nation.*") + public void testNonAdminCannotRead() + { + Session bob = getSession("bob"); + queryRunner.execute(bob, "SELECT * FROM nation"); + } + + private Session getSession(String user) + { + return testSessionBuilder() + .setCatalog(queryRunner.getDefaultSession().getCatalog().get()) + .setSchema(queryRunner.getDefaultSession().getSchema().get()) + .setIdentity(new Identity(user, Optional.empty())).build(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileFormats.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileFormats.java new file mode 100644 index 00000000..8d706ba2 --- /dev/null +++ 
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileFormats.java @@ -0,0 +1,889 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import io.airlift.compress.lzo.LzoCodec; +import io.airlift.compress.lzo.LzopCodec; +import io.airlift.slice.Slices; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.orc.OrcWriterOptions; +import io.prestosql.plugin.hive.orc.OrcPageSourceFactory; +import io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory; +import io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.connector.RecordPageSource; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.testing.TestingConnectorSession; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.mapred.FileSplit; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.TimeZone; + +import static com.google.common.base.Predicates.not; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.Iterables.filter; +import static com.google.common.collect.Iterables.transform; +import static io.airlift.slice.Slices.utf8Slice; +import static io.prestosql.plugin.hive.HiveStorageFormat.AVRO; +import static io.prestosql.plugin.hive.HiveStorageFormat.CSV; +import static io.prestosql.plugin.hive.HiveStorageFormat.JSON; +import static io.prestosql.plugin.hive.HiveStorageFormat.ORC; +import static io.prestosql.plugin.hive.HiveStorageFormat.PARQUET; +import static io.prestosql.plugin.hive.HiveStorageFormat.RCBINARY; +import static 
io.prestosql.plugin.hive.HiveStorageFormat.RCTEXT; +import static io.prestosql.plugin.hive.HiveStorageFormat.SEQUENCEFILE; +import static io.prestosql.plugin.hive.HiveStorageFormat.TEXTFILE; +import static io.prestosql.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.plugin.hive.HiveTestUtils.createGenericHiveRecordCursorProvider; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +public class TestHiveFileFormats + extends AbstractTestHiveFileFormats +{ + private static final FileFormatDataSourceStats STATS = new FileFormatDataSourceStats(); + private static TestingConnectorSession parquetPageSourceSession = new TestingConnectorSession(new HiveSessionProperties(createParquetHiveConfig(false), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + private static TestingConnectorSession parquetPageSourceSessionUseName = new TestingConnectorSession(new HiveSessionProperties(createParquetHiveConfig(true), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + @DataProvider(name = "rowCount") + public static Object[][] rowCountProvider() + { + return new Object[][] {{0}, {1000}}; + } + + @BeforeClass(alwaysRun = true) + public void setUp() + { + // ensure the expected timezone is configured for this VM + assertEquals(TimeZone.getDefault().getID(), + "America/Bahia_Banderas", + "Timezone not configured correctly. Add -Duser.timezone=America/Bahia_Banderas to your JVM arguments"); + } + + @Test(dataProvider = "rowCount") + public void testTextFile(int rowCount) + throws Exception + { + List testColumns = TEST_COLUMNS.stream() + .filter(column -> !column.getName().equals("t_map_null_key_complex_key_value")) + .collect(toList()); + + assertThatFileFormat(TEXTFILE) + .withColumns(testColumns) + .withRowsCount(rowCount) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test(dataProvider = "rowCount") + public void testSequenceFile(int rowCount) + throws Exception + { + List testColumns = TEST_COLUMNS.stream() + .filter(column -> !column.getName().equals("t_map_null_key_complex_key_value")) + .collect(toList()); + + assertThatFileFormat(SEQUENCEFILE) + .withColumns(testColumns) + .withRowsCount(rowCount) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test(dataProvider = "rowCount") + public void testCsvFile(int rowCount) + throws Exception + { + List testColumns = TEST_COLUMNS.stream() + // CSV table only support Hive string columns. Notice that CSV does not allow to store null, it uses an empty string instead. 
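+                // keep only partition keys and the non-null string columns that CSV can represent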
+ .filter(column -> column.isPartitionKey() || ("string".equals(column.getType()) && !column.getName().contains("_null_"))) + .collect(toImmutableList()); + + assertTrue(testColumns.size() > 5); + + assertThatFileFormat(CSV) + .withColumns(testColumns) + .withRowsCount(rowCount) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test + public void testCsvFileWithNullAndValue() + throws Exception + { + assertThatFileFormat(CSV) + .withColumns(ImmutableList.of( + new TestColumn("t_null_string", javaStringObjectInspector, null, Slices.utf8Slice("")), // null was converted to empty string! + new TestColumn("t_string", javaStringObjectInspector, "test", Slices.utf8Slice("test")))) + .withRowsCount(2) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test(dataProvider = "rowCount") + public void testJson(int rowCount) + throws Exception + { + List testColumns = TEST_COLUMNS.stream() + // binary is not supported + .filter(column -> !column.getName().equals("t_binary")) + // non-string map keys are not supported + .filter(column -> !column.getName().equals("t_map_tinyint")) + .filter(column -> !column.getName().equals("t_map_smallint")) + .filter(column -> !column.getName().equals("t_map_int")) + .filter(column -> !column.getName().equals("t_map_bigint")) + .filter(column -> !column.getName().equals("t_map_float")) + .filter(column -> !column.getName().equals("t_map_double")) + // null map keys are not supported + .filter(TestHiveFileFormats::withoutNullMapKeyTests) + // decimal(38) is broken or not supported + .filter(column -> !column.getName().equals("t_decimal_precision_38")) + .filter(column -> !column.getName().equals("t_map_decimal_precision_38")) + .filter(column -> !column.getName().equals("t_array_decimal_precision_38")) + .collect(toList()); + + assertThatFileFormat(JSON) + .withColumns(testColumns) + .withRowsCount(rowCount) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test(dataProvider = "rowCount") + public void testRCText(int rowCount) + throws Exception + { + List testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> { + // TODO: This is a bug in the RC text reader + // RC file does not support complex type as key of a map + return !testColumn.getName().equals("t_struct_null") + && !testColumn.getName().equals("t_map_null_key_complex_key_value"); + })); + assertThatFileFormat(RCTEXT) + .withColumns(testColumns) + .withRowsCount(rowCount) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test(dataProvider = "rowCount") + public void testRcTextPageSource(int rowCount) + throws Exception + { + assertThatFileFormat(RCTEXT) + .withColumns(TEST_COLUMNS) + .withRowsCount(rowCount) + .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())); + } + + @Test(dataProvider = "rowCount") + public void testRcTextOptimizedWriter(int rowCount) + throws Exception + { + List testColumns = TEST_COLUMNS.stream() + // t_map_null_key_* must be disabled because Presto can not produce maps with null keys so the writer will throw + .filter(TestHiveFileFormats::withoutNullMapKeyTests) + .collect(toImmutableList()); + + assertThatFileFormat(RCTEXT) + .withColumns(testColumns) + .withRowsCount(rowCount) + .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE, STATS)) + 
.isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) + .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())); + } + + @Test(dataProvider = "rowCount") + public void testRcBinaryPageSource(int rowCount) + throws Exception + { + // RCBinary does not support complex type as key of a map and interprets empty VARCHAR as nulls + // Hive binary writers are broken for timestamps + List testColumns = TEST_COLUMNS.stream() + .filter(testColumn -> !testColumn.getName().equals("t_empty_varchar")) + .filter(TestHiveFileFormats::withoutTimestamps) + .collect(toList()); + + assertThatFileFormat(RCBINARY) + .withColumns(testColumns) + .withRowsCount(rowCount) + .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())); + } + + @Test(dataProvider = "rowCount") + public void testRcBinaryOptimizedWriter(int rowCount) + throws Exception + { + List testColumns = TEST_COLUMNS.stream() + // RCBinary interprets empty VARCHAR as nulls + .filter(testColumn -> !testColumn.getName().equals("t_empty_varchar")) + // t_map_null_key_* must be disabled because Presto can not produce maps with null keys so the writer will throw + .filter(TestHiveFileFormats::withoutNullMapKeyTests) + .collect(toList()); + + // Hive cannot read timestamps from old files + List testColumnsNoTimestamps = testColumns.stream() + .filter(TestHiveFileFormats::withoutTimestamps) + .collect(toList()); + + assertThatFileFormat(RCBINARY) + .withColumns(testColumns) + .withRowsCount(rowCount) + .withFileWriterFactory(new RcFileFileWriterFactory(HDFS_ENVIRONMENT, TYPE_MANAGER, new NodeVersion("test"), HIVE_STORAGE_TIME_ZONE, STATS)) + .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())) + .withColumns(testColumnsNoTimestamps) + .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test(dataProvider = "rowCount") + public void testOrc(int rowCount) + throws Exception + { + // Hive binary writers are broken for timestamps + List testColumns = TEST_COLUMNS.stream() + .filter(TestHiveFileFormats::withoutTimestamps) + .collect(toImmutableList()); + + assertThatFileFormat(ORC) + .withColumns(testColumns) + .withRowsCount(rowCount) + .isReadableByPageSource(new OrcPageSourceFactory(TYPE_MANAGER, new HiveConfig().setUseOrcColumnNames(false), HDFS_ENVIRONMENT, STATS, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled()))); + } + + @Test(dataProvider = "rowCount") + public void testOrcOptimizedWriter(int rowCount) + throws Exception + { + TestingConnectorSession session = new TestingConnectorSession( + new HiveSessionProperties( + new HiveConfig() + .setOrcWriterValidationPercentage(100.0), + new OrcFileWriterConfig(), + new ParquetFileWriterConfig()).getSessionProperties()); 
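+        // validate 100% of the ORC files written by this test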
+ + // A Presto page can not contain a map with null keys, so a page based writer can not write null keys + List testColumns = TEST_COLUMNS.stream() + .filter(TestHiveFileFormats::withoutNullMapKeyTests) + .collect(toList()); + + assertThatFileFormat(ORC) + .withColumns(testColumns) + .withRowsCount(rowCount) + .withSession(session) + .withFileWriterFactory(new OrcFileWriterFactory(HDFS_ENVIRONMENT, TYPE_MANAGER, new NodeVersion("test"), false, STATS, new OrcWriterOptions())) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)) + .isReadableByPageSource(new OrcPageSourceFactory(TYPE_MANAGER, new HiveConfig().setUseOrcColumnNames(false), HDFS_ENVIRONMENT, STATS, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled()))); + } + + @Test(dataProvider = "rowCount") + public void testOrcUseColumnNames(int rowCount) + throws Exception + { + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + // Hive binary writers are broken for timestamps + List testColumns = TEST_COLUMNS.stream() + .filter(TestHiveFileFormats::withoutTimestamps) + .collect(toImmutableList()); + + assertThatFileFormat(ORC) + .withWriteColumns(testColumns) + .withRowsCount(rowCount) + .withReadColumns(Lists.reverse(testColumns)) + .withSession(session) + .isReadableByPageSource(new OrcPageSourceFactory(TYPE_MANAGER, new HiveConfig().setUseOrcColumnNames(true), HDFS_ENVIRONMENT, STATS, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled()))); + } + + @Test(dataProvider = "rowCount") + public void testOrcUseColumnNameLowerCaseConversion(int rowCount) + throws Exception + { + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + List testColumnsUpperCase = TEST_COLUMNS.stream() + .map(testColumn -> new TestColumn(testColumn.getName().toUpperCase(Locale.ENGLISH), testColumn.getObjectInspector(), testColumn.getWriteValue(), testColumn.getExpectedValue(), testColumn.isPartitionKey())) + .collect(toList()); + + 
assertThatFileFormat(ORC) + .withWriteColumns(testColumnsUpperCase) + .withRowsCount(rowCount) + .withReadColumns(TEST_COLUMNS) + .withSession(session); + } + + @Test(dataProvider = "rowCount") + public void testAvro(int rowCount) + throws Exception + { + assertThatFileFormat(AVRO) + .withColumns(getTestColumnsSupportedByAvro()) + .withRowsCount(rowCount) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + private static List getTestColumnsSupportedByAvro() + { + // Avro only supports String for Map keys, and doesn't support smallint or tinyint. + return TEST_COLUMNS.stream() + .filter(column -> !column.getName().startsWith("t_map_") || column.getName().equals("t_map_string")) + .filter(column -> !column.getName().endsWith("_smallint")) + .filter(column -> !column.getName().endsWith("_tinyint")) + .collect(toList()); + } + + @Test(dataProvider = "rowCount") + public void testParquetPageSource(int rowCount) + throws Exception + { + List testColumns = getTestColumnsSupportedByParquet(); + assertThatFileFormat(PARQUET) + .withColumns(testColumns) + .withSession(parquetPageSourceSession) + .withRowsCount(rowCount) + .isReadableByPageSource(new ParquetPageSourceFactory(HiveTestUtils.TYPE_MANAGER, HiveTestUtils.HDFS_ENVIRONMENT, STATS, new HiveConfig())); + } + + @Test(dataProvider = "rowCount") + public void testParquetPageSourceSchemaEvolution(int rowCount) + throws Exception + { + List writeColumns = getTestColumnsSupportedByParquet(); + + // test index-based access + List readColumns = writeColumns.stream() + .map(column -> new TestColumn( + column.getName() + "_new", + column.getObjectInspector(), + column.getWriteValue(), + column.getExpectedValue(), + column.isPartitionKey())) + .collect(toList()); + assertThatFileFormat(PARQUET) + .withWriteColumns(writeColumns) + .withReadColumns(readColumns) + .withSession(parquetPageSourceSession) + .withRowsCount(rowCount) + .isReadableByPageSource(new ParquetPageSourceFactory(HiveTestUtils.TYPE_MANAGER, HiveTestUtils.HDFS_ENVIRONMENT, STATS, new HiveConfig())); + + // test name-based access + readColumns = Lists.reverse(writeColumns); + assertThatFileFormat(PARQUET) + .withWriteColumns(writeColumns) + .withReadColumns(readColumns) + .withSession(parquetPageSourceSessionUseName) + .isReadableByPageSource(new ParquetPageSourceFactory(HiveTestUtils.TYPE_MANAGER, HiveTestUtils.HDFS_ENVIRONMENT, STATS, new HiveConfig())); + } + + private static List getTestColumnsSupportedByParquet() + { + // Write of complex hive data to Parquet is broken + // TODO: empty arrays or maps with null keys don't seem to work + // Parquet does not support DATE + return TEST_COLUMNS.stream() + .filter(TestHiveFileFormats::withoutTimestamps) + .filter(TestHiveFileFormats::withoutNullMapKeyTests) + .filter(column -> !column.getName().equals("t_null_array_int")) + .filter(column -> !column.getName().equals("t_array_empty")) + .filter(column -> column.isPartitionKey() || ( + !hasType(column.getObjectInspector(), PrimitiveCategory.DATE)) && + !hasType(column.getObjectInspector(), PrimitiveCategory.SHORT) && + !hasType(column.getObjectInspector(), PrimitiveCategory.BYTE)) + .collect(toList()); + } + + @Test + public void testTruncateVarcharColumn() + throws Exception + { + TestColumn writeColumn = new TestColumn("varchar_column", getPrimitiveJavaObjectInspector(new VarcharTypeInfo(4)), new HiveVarchar("test", 4), utf8Slice("test")); + TestColumn readColumn = new TestColumn("varchar_column", getPrimitiveJavaObjectInspector(new 
VarcharTypeInfo(3)), new HiveVarchar("tes", 3), utf8Slice("tes")); + + assertThatFileFormat(RCTEXT) + .withWriteColumns(ImmutableList.of(writeColumn)) + .withReadColumns(ImmutableList.of(readColumn)) + .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + + assertThatFileFormat(RCBINARY) + .withWriteColumns(ImmutableList.of(writeColumn)) + .withReadColumns(ImmutableList.of(readColumn)) + .isReadableByPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig())) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + + assertThatFileFormat(ORC) + .withWriteColumns(ImmutableList.of(writeColumn)) + .withReadColumns(ImmutableList.of(readColumn)) + .isReadableByPageSource(new OrcPageSourceFactory(TYPE_MANAGER, new HiveConfig().setUseOrcColumnNames(false), HDFS_ENVIRONMENT, STATS, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled()))); + + assertThatFileFormat(PARQUET) + .withWriteColumns(ImmutableList.of(writeColumn)) + .withReadColumns(ImmutableList.of(readColumn)) + .withSession(parquetPageSourceSession) + .isReadableByPageSource(new ParquetPageSourceFactory(HiveTestUtils.TYPE_MANAGER, HiveTestUtils.HDFS_ENVIRONMENT, STATS, new HiveConfig())); + + assertThatFileFormat(AVRO) + .withWriteColumns(ImmutableList.of(writeColumn)) + .withReadColumns(ImmutableList.of(readColumn)) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + + assertThatFileFormat(SEQUENCEFILE) + .withWriteColumns(ImmutableList.of(writeColumn)) + .withReadColumns(ImmutableList.of(readColumn)) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + + assertThatFileFormat(TEXTFILE) + .withWriteColumns(ImmutableList.of(writeColumn)) + .withReadColumns(ImmutableList.of(readColumn)) + .isReadableByRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT)); + } + + @Test + public void testFailForLongVarcharPartitionColumn() + throws Exception + { + TestColumn partitionColumn = new TestColumn("partition_column", getPrimitiveJavaObjectInspector(new VarcharTypeInfo(3)), "test", utf8Slice("tes"), true); + TestColumn varcharColumn = new TestColumn("varchar_column", getPrimitiveJavaObjectInspector(new VarcharTypeInfo(3)), new HiveVarchar("tes", 3), utf8Slice("tes")); + + List columns = ImmutableList.of(partitionColumn, varcharColumn); + + HiveErrorCode expectedErrorCode = HiveErrorCode.HIVE_INVALID_PARTITION_VALUE; + String expectedMessage = "Invalid partition value 'test' for varchar(3) partition key: partition_column"; + + assertThatFileFormat(RCTEXT) + .withColumns(columns) + .isFailingForPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()), expectedErrorCode, 
expectedMessage) + .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); + + assertThatFileFormat(RCBINARY) + .withColumns(columns) + .isFailingForPageSource(new RcFilePageSourceFactory(TYPE_MANAGER, HDFS_ENVIRONMENT, STATS, new HiveConfig()), expectedErrorCode, expectedMessage) + .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); + + assertThatFileFormat(ORC) + .withColumns(columns) + .isFailingForPageSource(new OrcPageSourceFactory(TYPE_MANAGER, new HiveConfig().setUseOrcColumnNames(false), HDFS_ENVIRONMENT, STATS, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled())), expectedErrorCode, expectedMessage); + + assertThatFileFormat(PARQUET) + .withColumns(columns) + .withSession(parquetPageSourceSession) + .isFailingForPageSource(new ParquetPageSourceFactory(HiveTestUtils.TYPE_MANAGER, HiveTestUtils.HDFS_ENVIRONMENT, STATS, new HiveConfig()), expectedErrorCode, expectedMessage); + + assertThatFileFormat(SEQUENCEFILE) + .withColumns(columns) + .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); + + assertThatFileFormat(TEXTFILE) + .withColumns(columns) + .isFailingForRecordCursor(new GenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), expectedErrorCode, expectedMessage); + } + + private void testCursorProvider(HiveRecordCursorProvider cursorProvider, + FileSplit split, + HiveStorageFormat storageFormat, + List testColumns, + ConnectorSession session, + int rowCount) + { + Properties splitProperties = new Properties(); + splitProperties.setProperty(FILE_INPUT_FORMAT, storageFormat.getInputFormat()); + splitProperties.setProperty(SERIALIZATION_LIB, storageFormat.getSerDe()); + splitProperties.setProperty("columns", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getName))); + splitProperties.setProperty("columns.types", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getType))); + + List partitionKeys = testColumns.stream() + .filter(TestColumn::isPartitionKey) + .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue())) + .collect(toList()); + + Configuration configuration = new Configuration(); + configuration.set("io.compression.codecs", LzoCodec.class.getName() + "," + LzopCodec.class.getName()); + Optional pageSource = HivePageSourceProvider.createHivePageSource( + ImmutableSet.of(cursorProvider), + ImmutableSet.of(), + configuration, + session, + split.getPath(), + OptionalInt.empty(), + split.getStart(), + split.getLength(), + split.getLength(), + splitProperties, + TupleDomain.all(), + getColumnHandles(testColumns), + partitionKeys, + TYPE_MANAGER, + ImmutableMap.of(), + Optional.empty(), + false, + 
Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + null, + false, + -1L, + ImmutableMap.of(), + ImmutableList.of(), + Optional.empty(), + new HiveOffloadExpression(), + ImmutableMap.of()); + + RecordCursor cursor = ((RecordPageSource) pageSource.get()).getCursor(); + + checkCursor(cursor, testColumns, rowCount); + } + + private void testPageSourceFactory(HivePageSourceFactory sourceFactory, + FileSplit split, + HiveStorageFormat storageFormat, + List testColumns, + ConnectorSession session, + int rowCount) + throws IOException + { + Properties splitProperties = new Properties(); + splitProperties.setProperty(FILE_INPUT_FORMAT, storageFormat.getInputFormat()); + splitProperties.setProperty(SERIALIZATION_LIB, storageFormat.getSerDe()); + splitProperties.setProperty("columns", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getName))); + splitProperties.setProperty("columns.types", Joiner.on(',').join(transform(filter(testColumns, not(TestColumn::isPartitionKey)), TestColumn::getType))); + + List partitionKeys = testColumns.stream() + .filter(TestColumn::isPartitionKey) + .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue())) + .collect(toList()); + + List columnHandles = getColumnHandles(testColumns); + + Optional pageSource = HivePageSourceProvider.createHivePageSource( + ImmutableSet.of(), + ImmutableSet.of(sourceFactory), + new Configuration(), + session, + split.getPath(), + OptionalInt.empty(), + split.getStart(), + split.getLength(), + split.getLength(), + splitProperties, + TupleDomain.all(), + columnHandles, + partitionKeys, + TYPE_MANAGER, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + null, + false, + -1L, + ImmutableMap.of(), + ImmutableList.of(), + Optional.empty(), + new HiveOffloadExpression(), + ImmutableMap.of()); + + assertTrue(pageSource.isPresent()); + + checkPageSource(pageSource.get(), testColumns, HiveTestUtils.getTypes(columnHandles), rowCount); + } + + public static boolean hasType(ObjectInspector objectInspector, PrimitiveCategory... 
types) + { + if (objectInspector instanceof PrimitiveObjectInspector) { + PrimitiveObjectInspector primitiveInspector = (PrimitiveObjectInspector) objectInspector; + PrimitiveCategory primitiveCategory = primitiveInspector.getPrimitiveCategory(); + for (PrimitiveCategory type : types) { + if (primitiveCategory == type) { + return true; + } + } + return false; + } + if (objectInspector instanceof ListObjectInspector) { + ListObjectInspector listInspector = (ListObjectInspector) objectInspector; + return hasType(listInspector.getListElementObjectInspector(), types); + } + if (objectInspector instanceof MapObjectInspector) { + MapObjectInspector mapInspector = (MapObjectInspector) objectInspector; + return hasType(mapInspector.getMapKeyObjectInspector(), types) || + hasType(mapInspector.getMapValueObjectInspector(), types); + } + if (objectInspector instanceof StructObjectInspector) { + for (StructField field : ((StructObjectInspector) objectInspector).getAllStructFieldRefs()) { + if (hasType(field.getFieldObjectInspector(), types)) { + return true; + } + } + return false; + } + throw new IllegalArgumentException("Unknown object inspector type " + objectInspector); + } + + private static boolean withoutNullMapKeyTests(TestColumn testColumn) + { + String name = testColumn.getName(); + return !name.equals("t_map_null_key") && + !name.equals("t_map_null_key_complex_key_value") && + !name.equals("t_map_null_key_complex_value"); + } + + private FileFormatAssertion assertThatFileFormat(HiveStorageFormat hiveStorageFormat) + { + return new FileFormatAssertion(hiveStorageFormat.name()) + .withStorageFormat(hiveStorageFormat); + } + + private static HiveConfig createParquetHiveConfig(boolean useParquetColumnNames) + { + return new HiveConfig() + .setUseParquetColumnNames(useParquetColumnNames); + } + + private class FileFormatAssertion + { + private final String formatName; + private HiveStorageFormat storageFormat; + private HiveCompressionCodec compressionCodec = HiveCompressionCodec.NONE; + private List writeColumns; + private List readColumns; + private ConnectorSession session = HiveTestUtils.SESSION; + private int rowsCount = 1000; + private HiveFileWriterFactory fileWriterFactory; + + private FileFormatAssertion(String formatName) + { + this.formatName = requireNonNull(formatName, "formatName is null"); + } + + public FileFormatAssertion withStorageFormat(HiveStorageFormat storageFormat) + { + this.storageFormat = requireNonNull(storageFormat, "storageFormat is null"); + return this; + } + + public FileFormatAssertion withCompressionCodec(HiveCompressionCodec compressionCodec) + { + this.compressionCodec = requireNonNull(compressionCodec, "compressionCodec is null"); + return this; + } + + public FileFormatAssertion withFileWriterFactory(HiveFileWriterFactory fileWriterFactory) + { + this.fileWriterFactory = requireNonNull(fileWriterFactory, "fileWriterFactory is null"); + return this; + } + + public FileFormatAssertion withColumns(List inputColumns) + { + withWriteColumns(inputColumns); + withReadColumns(inputColumns); + return this; + } + + public FileFormatAssertion withWriteColumns(List writeColumns) + { + this.writeColumns = requireNonNull(writeColumns, "writeColumns is null"); + return this; + } + + public FileFormatAssertion withReadColumns(List readColumns) + { + this.readColumns = requireNonNull(readColumns, "readColumns is null"); + return this; + } + + public FileFormatAssertion withRowsCount(int rowsCount) + { + this.rowsCount = rowsCount; + return this; + } + + public 
FileFormatAssertion withSession(ConnectorSession session) + { + this.session = requireNonNull(session, "session is null"); + return this; + } + + public FileFormatAssertion isReadableByPageSource(HivePageSourceFactory pageSourceFactory) + throws Exception + { + assertRead(Optional.of(pageSourceFactory), Optional.empty()); + return this; + } + + public FileFormatAssertion isReadableByRecordCursor(HiveRecordCursorProvider cursorProvider) + throws Exception + { + assertRead(Optional.empty(), Optional.of(cursorProvider)); + return this; + } + + public FileFormatAssertion isFailingForPageSource(HivePageSourceFactory pageSourceFactory, HiveErrorCode expectedErrorCode, String expectedMessage) + throws Exception + { + assertFailure(Optional.of(pageSourceFactory), Optional.empty(), expectedErrorCode, expectedMessage); + return this; + } + + public FileFormatAssertion isFailingForRecordCursor(HiveRecordCursorProvider cursorProvider, HiveErrorCode expectedErrorCode, String expectedMessage) + throws Exception + { + assertFailure(Optional.empty(), Optional.of(cursorProvider), expectedErrorCode, expectedMessage); + return this; + } + + private void assertRead(Optional pageSourceFactory, Optional cursorProvider) + throws Exception + { + assertNotNull(storageFormat, "storageFormat must be specified"); + assertNotNull(writeColumns, "writeColumns must be specified"); + assertNotNull(readColumns, "readColumns must be specified"); + assertNotNull(session, "session must be specified"); + assertTrue(rowsCount >= 0, "rowsCount must be greater than zero"); + + String compressionSuffix = compressionCodec.getCodec() + .map(codec -> { + try { + return codec.getConstructor().newInstance().getDefaultExtension(); + } + catch (Exception e) { + throw new RuntimeException(e); + } + }) + .orElse(""); + + File file = File.createTempFile("presto_test", formatName + compressionSuffix); + file.delete(); + try { + FileSplit split; + if (fileWriterFactory != null) { + split = createTestFilePresto(file.getAbsolutePath(), storageFormat, compressionCodec, writeColumns, session, rowsCount, fileWriterFactory); + } + else { + split = createTestFileHive(file.getAbsolutePath(), storageFormat, compressionCodec, writeColumns, rowsCount); + } + if (pageSourceFactory.isPresent()) { + testPageSourceFactory(pageSourceFactory.get(), split, storageFormat, readColumns, session, rowsCount); + } + if (cursorProvider.isPresent()) { + testCursorProvider(cursorProvider.get(), split, storageFormat, readColumns, session, rowsCount); + } + } + finally { + //noinspection ResultOfMethodCallIgnored + file.delete(); + } + } + + private void assertFailure( + Optional pageSourceFactory, + Optional cursorProvider, + HiveErrorCode expectedErrorCode, + String expectedMessage) + throws Exception + { + try { + assertRead(pageSourceFactory, cursorProvider); + fail("failure is expected"); + } + catch (PrestoException prestoException) { + assertEquals(prestoException.getErrorCode(), expectedErrorCode.toErrorCode()); + assertEquals(prestoException.getMessage(), expectedMessage); + } + } + } + + private static boolean withoutTimestamps(TestColumn testColumn) + { + String name = testColumn.getName(); + return !name.equals("t_timestamp") && + !name.equals("t_map_timestamp") && + !name.equals("t_array_timestamp"); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileMetastore.java new file 
mode 100644 index 00000000..cc2b88de --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveFileMetastore.java @@ -0,0 +1,68 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.file.FileHiveMetastore; +import org.testng.SkipException; + +import java.io.File; + +public class TestHiveFileMetastore + extends AbstractTestHiveLocal +{ + @Override + protected HiveMetastore createMetastore(File tempDir) + { + File baseDir = new File(tempDir, "metastore"); + HiveConfig hiveConfig = new HiveConfig(); + HdfsConfigurationInitializer updator = new HdfsConfigurationInitializer(hiveConfig); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(updator, ImmutableSet.of()); + HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hiveConfig, new NoHdfsAuthentication()); + return new FileHiveMetastore(hdfsEnvironment, baseDir.toURI().toString(), "test"); + } + + @Override + public void testMismatchSchemaTable() + { + // FileHiveMetastore only supports replaceTable() for views + } + + @Override + public void testPartitionSchemaMismatch() + { + // test expects an exception to be thrown + throw new SkipException("FileHiveMetastore only supports replaceTable() for views"); + } + + @Override + public void testBucketedTableEvolution() + { + // FileHiveMetastore only supports replaceTable() for views + } + + @Override + public void testTransactionDeleteInsert() + { + // FileHiveMetastore has various incompatibilities + } + + @Override + public void testInsertOverwriteUnpartitioned() + { + // FileHiveMetastore has various incompatibilities + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveInMemoryMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveInMemoryMetastore.java new file mode 100644 index 00000000..39619bd4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveInMemoryMetastore.java @@ -0,0 +1,44 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.BridgingHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.InMemoryThriftMetastore; + +import java.io.File; + +public class TestHiveInMemoryMetastore + extends AbstractTestHiveLocal +{ + @Override + protected HiveMetastore createMetastore(File tempDir) + { + File baseDir = new File(tempDir, "metastore"); + InMemoryThriftMetastore hiveMetastore = new InMemoryThriftMetastore(baseDir); + return new BridgingHiveMetastore(hiveMetastore); + } + + @Override + public void testMetadataDelete() + { + // InMemoryHiveMetastore ignores "removeData" flag in dropPartition + } + + @Override + public void testTransactionDeleteInsert() + { + // InMemoryHiveMetastore does not check whether partition exist in createPartition and dropPartition + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java new file mode 100644 index 00000000..75f1889f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveIntegrationSmokeTest.java @@ -0,0 +1,6625 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multimap; +import io.prestosql.Session; +import io.prestosql.client.NodeVersion; +import io.prestosql.cost.StatsAndCosts; +import io.prestosql.dynamicfilter.DynamicFilterService; +import io.prestosql.execution.Lifespan; +import io.prestosql.execution.MockRemoteTaskFactory; +import io.prestosql.execution.NodeTaskMap; +import io.prestosql.execution.RemoteTask; +import io.prestosql.execution.SplitKey; +import io.prestosql.execution.SqlStageExecution; +import io.prestosql.execution.StageId; +import io.prestosql.execution.TableInfo; +import io.prestosql.execution.TestSqlTaskManager; +import io.prestosql.execution.scheduler.LegacyNetworkTopology; +import io.prestosql.execution.scheduler.NodeScheduler; +import io.prestosql.execution.scheduler.NodeSchedulerConfig; +import io.prestosql.execution.scheduler.NodeSelector; +import io.prestosql.execution.scheduler.SplitSchedulerStats; +import io.prestosql.failuredetector.NoOpFailureDetector; +import io.prestosql.filesystem.FileSystemClientManager; +import io.prestosql.metadata.InMemoryNodeManager; +import io.prestosql.metadata.InsertTableHandle; +import io.prestosql.metadata.InternalNode; +import io.prestosql.metadata.Metadata; +import io.prestosql.metadata.Split; +import io.prestosql.metadata.TableMetadata; +import io.prestosql.seedstore.SeedStoreManager; +import io.prestosql.snapshot.QuerySnapshotManager; +import io.prestosql.spi.HetuConstant; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.QueryId; +import io.prestosql.spi.connector.CatalogName; +import io.prestosql.spi.connector.CatalogSchemaTableName; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.Constraint; +import io.prestosql.spi.connector.QualifiedObjectName; +import io.prestosql.spi.metadata.TableHandle; +import io.prestosql.spi.operator.ReuseExchangeOperator; +import io.prestosql.spi.plan.PlanNodeId; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.security.Identity; +import io.prestosql.spi.security.SelectedRole; +import io.prestosql.spi.service.PropertyService; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.BooleanType; +import io.prestosql.spi.type.CharType; +import io.prestosql.spi.type.DateType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.IntegerType; +import io.prestosql.spi.type.SmallintType; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.TinyintType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignature; +import io.prestosql.spi.type.VarcharType; +import io.prestosql.split.ConnectorAwareSplitSource; +import io.prestosql.sql.analyzer.FeaturesConfig.JoinDistributionType; +import io.prestosql.sql.analyzer.FeaturesConfig.JoinReorderingStrategy; +import io.prestosql.sql.planner.Plan; +import io.prestosql.sql.planner.PlanFragment; +import io.prestosql.sql.planner.StageExecutionPlan; +import io.prestosql.sql.planner.plan.ExchangeNode; +import io.prestosql.sql.planner.planprinter.IoPlanPrinter.ColumnConstraint; +import io.prestosql.sql.planner.planprinter.IoPlanPrinter.FormattedDomain; +import 
io.prestosql.sql.planner.planprinter.IoPlanPrinter.FormattedMarker; +import io.prestosql.sql.planner.planprinter.IoPlanPrinter.FormattedRange; +import io.prestosql.sql.planner.planprinter.IoPlanPrinter.IoPlan; +import io.prestosql.sql.planner.planprinter.IoPlanPrinter.IoPlan.TableColumnInfo; +import io.prestosql.statestore.LocalStateStoreProvider; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.testing.MaterializedRow; +import io.prestosql.testing.TestingSplit; +import io.prestosql.tests.AbstractTestIntegrationSmokeTest; +import io.prestosql.tests.DistributedQueryRunner; +import io.prestosql.util.FinalizerService; +import org.apache.hadoop.fs.Path; +import org.intellij.lang.annotations.Language; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FilenameFilter; +import java.math.BigDecimal; +import java.net.URI; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.LongStream; +import java.util.stream.Stream; + +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.io.Files.asCharSink; +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.json.JsonCodec.jsonCodec; +import static io.airlift.tpch.TpchTable.CUSTOMER; +import static io.airlift.tpch.TpchTable.LINE_ITEM; +import static io.airlift.tpch.TpchTable.ORDERS; +import static io.prestosql.SessionTestUtils.TEST_SESSION; +import static io.prestosql.SessionTestUtils.TEST_SESSION_REUSE; +import static io.prestosql.SystemSessionProperties.COLOCATED_JOIN; +import static io.prestosql.SystemSessionProperties.CONCURRENT_LIFESPANS_PER_NODE; +import static io.prestosql.SystemSessionProperties.DYNAMIC_SCHEDULE_FOR_GROUPED_EXECUTION; +import static io.prestosql.SystemSessionProperties.ENABLE_DYNAMIC_FILTERING; +import static io.prestosql.SystemSessionProperties.GROUPED_EXECUTION; +import static io.prestosql.SystemSessionProperties.JOIN_DISTRIBUTION_TYPE; +import static io.prestosql.SystemSessionProperties.JOIN_REORDERING_STRATEGY; +import static io.prestosql.execution.SqlStageExecution.createSqlStageExecution; +import static io.prestosql.execution.scheduler.TestPhasedExecutionSchedule.createTableScanPlanFragment; +import static io.prestosql.execution.scheduler.TestSourcePartitionedScheduler.createFixedSplitSource; +import static io.prestosql.plugin.hive.HiveColumnHandle.BUCKET_COLUMN_NAME; +import static io.prestosql.plugin.hive.HiveColumnHandle.PATH_COLUMN_NAME; +import static io.prestosql.plugin.hive.HiveCompressionCodec.NONE; +import static io.prestosql.plugin.hive.HiveQueryRunner.TPCH_SCHEMA; 
+import static io.prestosql.plugin.hive.HiveTableProperties.BUCKETED_BY_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.BUCKET_COUNT_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.PARTITIONED_BY_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.STORAGE_FORMAT_PROPERTY; +import static io.prestosql.plugin.hive.HiveTableProperties.TRANSACTIONAL; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.plugin.hive.HiveUtil.columnExtraInfo; +import static io.prestosql.spi.predicate.Marker.Bound.EXACTLY; +import static io.prestosql.spi.security.SelectedRole.Type.ROLE; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.CharType.createCharType; +import static io.prestosql.spi.type.DecimalType.createDecimalType; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.spi.type.VarcharType.createVarcharType; +import static io.prestosql.sql.analyzer.FeaturesConfig.JoinDistributionType.BROADCAST; +import static io.prestosql.sql.planner.optimizations.PlanNodeSearcher.searchFrom; +import static io.prestosql.sql.planner.planprinter.PlanPrinter.textLogicalPlan; +import static io.prestosql.testing.MaterializedResult.resultBuilder; +import static io.prestosql.testing.TestingAccessControlManager.TestingPrivilegeType.SELECT_COLUMN; +import static io.prestosql.testing.TestingAccessControlManager.privilege; +import static io.prestosql.testing.TestingSession.testSessionBuilder; +import static io.prestosql.testing.TestingSnapshotUtils.NOOP_SNAPSHOT_UTILS; +import static io.prestosql.testing.assertions.Assert.assertEquals; +import static io.prestosql.tests.QueryAssertions.assertEqualsIgnoreOrder; +import static io.prestosql.transaction.TransactionBuilder.transaction; +import static java.lang.String.format; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.Files.createTempDirectory; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static java.util.concurrent.Executors.newScheduledThreadPool; +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; +import static org.testng.FileAssert.assertFile; + +public class TestHiveIntegrationSmokeTest + extends AbstractTestIntegrationSmokeTest +{ + private static final CatalogName CONNECTOR_ID = new CatalogName("connector_id"); + private static final String TEST_CATALOG = "test_catalog"; + private static final String TEST_SCHEMA = "test_schema"; + private static final String TEST_TABLE = 
"test_table"; + + private final String catalog; + private final Session bucketedSession; + private final Session autoVacuumSession; + private final TypeTranslator typeTranslator; + + private Session testSessionSort; + private Session testSessionSortPrcntDrv50; + private Session testSessionSortPrcntDrv25; + private Session testSessionSortPrcntDrv40; + private FinalizerService finalizerService; + private NodeTaskMap nodeTaskMap; + private InMemoryNodeManager nodeManager; + private NodeSelector nodeSelector; + private Map taskMap; + private ExecutorService remoteTaskExecutor; + private ScheduledExecutorService remoteTaskScheduledExecutor; + + @SuppressWarnings("unused") + public TestHiveIntegrationSmokeTest() + { + this(() -> HiveQueryRunner.createQueryRunner(ORDERS, CUSTOMER, LINE_ITEM), + HiveQueryRunner.createBucketedSession(Optional.of(new SelectedRole(ROLE, Optional.of("admin")))), + HiveQueryRunner.createAutoVacuumSession(Optional.of(new SelectedRole(SelectedRole.Type.ALL, Optional.empty()))), + HiveQueryRunner.HIVE_CATALOG, + new HiveTypeTranslator()); + } + + protected TestHiveIntegrationSmokeTest(QueryRunnerSupplier queryRunnerSupplier, Session bucketedSession, Session autoVacuumSession, String catalog, TypeTranslator typeTranslator) + { + super(queryRunnerSupplier); + this.catalog = requireNonNull(catalog, "catalog is null"); + this.bucketedSession = requireNonNull(bucketedSession, "bucketSession is null"); + this.autoVacuumSession = requireNonNull(autoVacuumSession, "autoVacuumSession is null"); + this.typeTranslator = requireNonNull(typeTranslator, "typeTranslator is null"); + + this.remoteTaskExecutor = newCachedThreadPool(daemonThreadsNamed("remoteTaskExecutor-%s")); + this.remoteTaskScheduledExecutor = newScheduledThreadPool(2, daemonThreadsNamed("remoteTaskScheduledExecutor-%s")); + PropertyService.setProperty(HetuConstant.SPLIT_CACHE_MAP_ENABLED, false); + + finalizerService = new FinalizerService(); + nodeTaskMap = new NodeTaskMap(finalizerService); + nodeManager = new InMemoryNodeManager(); + + NodeSchedulerConfig nodeSchedulerConfig = new NodeSchedulerConfig() + .setMaxSplitsPerNode(20) + .setIncludeCoordinator(false) + .setMaxPendingSplitsPerTask(10); + + NodeScheduler nodeScheduler = new NodeScheduler(new LegacyNetworkTopology(), nodeManager, nodeSchedulerConfig, nodeTaskMap); + // contents of taskMap indicate the node-task map for the current stage + taskMap = new HashMap<>(); + nodeSelector = nodeScheduler.createNodeSelector(CONNECTOR_ID, false, null); + } + + @Test + public void testSchemaOperations() + { + Session admin = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("hive", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.ROLE, Optional.of("admin"))))) + .build(); + + assertUpdate(admin, "CREATE SCHEMA new_schema"); + + assertUpdate(admin, "CREATE TABLE new_schema.test (x bigint)"); + + assertQueryFails(admin, "DROP SCHEMA new_schema", "Schema not empty: new_schema"); + + assertUpdate(admin, "DROP TABLE new_schema.test"); + + assertUpdate(admin, "DROP SCHEMA new_schema"); + } + + @Test + public void testIOExplain() + { + // Test IO explain with small number of discrete components. 
+ computeActual("CREATE TABLE test_orders WITH (partitioned_by = ARRAY['orderkey', 'processing']) AS select custkey, orderkey, orderstatus = 'P' processing FROM orders where orderkey < 3"); + + MaterializedResult result = computeActual("EXPLAIN (TYPE IO, FORMAT JSON) INSERT INTO test_orders SELECT custkey, orderkey, processing FROM test_orders where custkey <= 10"); + assertEquals( + jsonCodec(IoPlan.class).fromJson((String) getOnlyElement(result.getOnlyColumnAsSet())), + new IoPlan( + ImmutableSet.of(new TableColumnInfo( + new CatalogSchemaTableName(catalog, "tpch", "test_orders"), + ImmutableSet.of( + new ColumnConstraint( + "orderkey", + BIGINT.getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of("1"), EXACTLY), + new FormattedMarker(Optional.of("1"), EXACTLY)), + new FormattedRange( + new FormattedMarker(Optional.of("2"), EXACTLY), + new FormattedMarker(Optional.of("2"), EXACTLY))))), + new ColumnConstraint( + "processing", + BOOLEAN.getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of("false"), EXACTLY), + new FormattedMarker(Optional.of("false"), EXACTLY)))))))), + Optional.of(new CatalogSchemaTableName(catalog, "tpch", "test_orders")))); + + assertUpdate("DROP TABLE test_orders"); + + // Test IO explain with large number of discrete components where Domain::simpify comes into play. + computeActual("CREATE TABLE test_orders WITH (partitioned_by = ARRAY['orderkey']) AS select custkey, orderkey FROM orders where orderkey < 200"); + + result = computeActual("EXPLAIN (TYPE IO, FORMAT JSON) INSERT INTO test_orders SELECT custkey, orderkey + 10 FROM test_orders where custkey <= 10"); + assertEquals( + jsonCodec(IoPlan.class).fromJson((String) getOnlyElement(result.getOnlyColumnAsSet())), + new IoPlan( + ImmutableSet.of(new TableColumnInfo( + new CatalogSchemaTableName(catalog, "tpch", "test_orders"), + ImmutableSet.of( + new ColumnConstraint( + "orderkey", + BIGINT.getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of("1"), EXACTLY), + new FormattedMarker(Optional.of("199"), EXACTLY)))))))), + Optional.of(new CatalogSchemaTableName(catalog, "tpch", "test_orders")))); + + assertUpdate("DROP TABLE test_orders"); + } + + @Test + public void testIoExplainWithPrimitiveTypes() + { + Map data = new HashMap<>(); + data.put("foo", VarcharType.createUnboundedVarcharType()); + data.put(Byte.toString((byte) (Byte.MAX_VALUE / 2)), TinyintType.TINYINT); + data.put(Short.toString((short) (Short.MAX_VALUE / 2)), SmallintType.SMALLINT); + data.put(Integer.toString(Integer.MAX_VALUE / 2), IntegerType.INTEGER); + data.put(Long.toString(Long.MAX_VALUE / 2), BigintType.BIGINT); + data.put(Boolean.TRUE.toString(), BooleanType.BOOLEAN); + data.put("bar", CharType.createCharType(3)); + data.put("1.2345678901234578E14", DoubleType.DOUBLE); + data.put("123456789012345678901234.567", DecimalType.createDecimalType(30, 3)); + data.put("2019-01-01", DateType.DATE); + data.put("2019-01-01 23:22:21.123", TimestampType.TIMESTAMP); + for (Map.Entry entry : data.entrySet()) { + @Language("SQL") String query = format( + "CREATE TABLE test_types_table WITH (partitioned_by = ARRAY['my_col']) AS " + + "SELECT 'foo' my_non_partition_col, CAST('%s' AS %s) my_col", + entry.getKey(), + entry.getValue().getDisplayName()); + + assertUpdate(query, 1); + MaterializedResult result = computeActual("EXPLAIN (TYPE IO, 
FORMAT JSON) SELECT * FROM test_types_table"); + assertEquals( + jsonCodec(IoPlan.class).fromJson((String) getOnlyElement(result.getOnlyColumnAsSet())), + new IoPlan( + ImmutableSet.of(new TableColumnInfo( + new CatalogSchemaTableName(catalog, "tpch", "test_types_table"), + ImmutableSet.of( + new ColumnConstraint( + "my_col", + entry.getValue().getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of(entry.getKey().toString()), EXACTLY), + new FormattedMarker(Optional.of(entry.getKey().toString()), EXACTLY)))))))), + Optional.empty())); + + assertUpdate("DROP TABLE test_types_table"); + } + } + + @Test + public void testReadNoColumns() + { + testWithAllStorageFormats(this::testReadNoColumns); + } + + private void testReadNoColumns(Session session, HiveStorageFormat storageFormat) + { + assertUpdate(session, format("CREATE TABLE test_read_no_columns WITH (format = '%s') AS SELECT 0 x", storageFormat), 1); + assertQuery(session, "SELECT count(*) FROM test_read_no_columns", "SELECT 1"); + assertUpdate(session, "DROP TABLE test_read_no_columns"); + } + + @Test + public void createTableWithEveryType() + { + @Language("SQL") String query = "" + + "CREATE TABLE test_types_table AS " + + "SELECT" + + " 'foo' _varchar" + + ", cast('bar' as varbinary) _varbinary" + + ", cast(1 as bigint) _bigint" + + ", 2 _integer" + + ", CAST('3.14' AS DOUBLE) _double" + + ", true _boolean" + + ", DATE '1980-05-07' _date" + + ", TIMESTAMP '1980-05-07 11:22:33.456' _timestamp" + + ", CAST('3.14' AS DECIMAL(3,2)) _decimal_short" + + ", CAST('12345678901234567890.0123456789' AS DECIMAL(30,10)) _decimal_long" + + ", CAST('bar' AS CHAR(10)) _char"; + + assertUpdate(query, 1); + + MaterializedResult results = getQueryRunner().execute(getSession(), "SELECT * FROM test_types_table").toTestTypes(); + assertEquals(results.getRowCount(), 1); + MaterializedRow row = results.getMaterializedRows().get(0); + assertEquals(row.getField(0), "foo"); + assertEquals(row.getField(1), "bar".getBytes(UTF_8)); + assertEquals(row.getField(2), 1L); + assertEquals(row.getField(3), 2); + assertEquals(row.getField(4), 3.14); + assertEquals(row.getField(5), true); + assertEquals(row.getField(6), LocalDate.of(1980, 5, 7)); + assertEquals(row.getField(7), LocalDateTime.of(1980, 5, 7, 11, 22, 33, 456_000_000)); + assertEquals(row.getField(8), new BigDecimal("3.14")); + assertEquals(row.getField(9), new BigDecimal("12345678901234567890.0123456789")); + assertEquals(row.getField(10), "bar "); + assertUpdate("DROP TABLE test_types_table"); + + assertFalse(getQueryRunner().tableExists(getSession(), "test_types_table")); + } + + @Test + public void testCreatePartitionedTable() + { + testWithAllStorageFormats(this::testCreatePartitionedTable); + } + + private void testCreatePartitionedTable(Session session, HiveStorageFormat storageFormat) + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_partitioned_table (" + + " _string VARCHAR" + + ", _varchar VARCHAR(65535)" + + ", _char CHAR(10)" + + ", _bigint BIGINT" + + ", _integer INTEGER" + + ", _smallint SMALLINT" + + ", _tinyint TINYINT" + + ", _real REAL" + + ", _double DOUBLE" + + ", _boolean BOOLEAN" + + ", _decimal_short DECIMAL(3,2)" + + ", _decimal_long DECIMAL(30,10)" + + ", _partition_string VARCHAR" + + ", _partition_varchar VARCHAR(65535)" + + ", _partition_char CHAR(10)" + + ", _partition_tinyint TINYINT" + + ", _partition_smallint SMALLINT" + + ", _partition_integer INTEGER" + + ", _partition_bigint BIGINT" 
+ + ", _partition_boolean BOOLEAN" + + ", _partition_decimal_short DECIMAL(3,2)" + + ", _partition_decimal_long DECIMAL(30,10)" + + ", _partition_date DATE" + + ", _partition_timestamp TIMESTAMP" + + ") " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ '_partition_string', '_partition_varchar', '_partition_char', '_partition_tinyint', '_partition_smallint', '_partition_integer', '_partition_bigint', '_partition_boolean', '_partition_decimal_short', '_partition_decimal_long', '_partition_date', '_partition_timestamp']" + + ") "; + + if (storageFormat == HiveStorageFormat.AVRO) { + createTable = createTable.replace(" _smallint SMALLINT,", " _smallint INTEGER,"); + createTable = createTable.replace(" _tinyint TINYINT,", " _tinyint INTEGER,"); + } + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_partitioned_table"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + List partitionedBy = ImmutableList.of( + "_partition_string", + "_partition_varchar", + "_partition_char", + "_partition_tinyint", + "_partition_smallint", + "_partition_integer", + "_partition_bigint", + "_partition_boolean", + "_partition_decimal_short", + "_partition_decimal_long", + "_partition_date", + "_partition_timestamp"); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), partitionedBy); + for (ColumnMetadata columnMetadata : tableMetadata.getColumns()) { + boolean partitionKey = partitionedBy.contains(columnMetadata.getName()); + assertEquals(columnMetadata.getExtraInfo(), columnExtraInfo(partitionKey)); + } + + assertColumnType(tableMetadata, "_string", createUnboundedVarcharType()); + assertColumnType(tableMetadata, "_varchar", createVarcharType(65535)); + assertColumnType(tableMetadata, "_char", createCharType(10)); + assertColumnType(tableMetadata, "_partition_string", createUnboundedVarcharType()); + assertColumnType(tableMetadata, "_partition_varchar", createVarcharType(65535)); + + MaterializedResult result = computeActual("SELECT * from test_partitioned_table"); + assertEquals(result.getRowCount(), 0); + + @Language("SQL") String select = "" + + "SELECT" + + " 'foo' _string" + + ", 'bar' _varchar" + + ", CAST('boo' AS CHAR(10)) _char" + + ", CAST(1 AS BIGINT) _bigint" + + ", 2 _integer" + + ", CAST (3 AS SMALLINT) _smallint" + + ", CAST (4 AS TINYINT) _tinyint" + + ", CAST('123.45' AS REAL) _real" + + ", CAST('3.14' AS DOUBLE) _double" + + ", true _boolean" + + ", CAST('3.14' AS DECIMAL(3,2)) _decimal_short" + + ", CAST('12345678901234567890.0123456789' AS DECIMAL(30,10)) _decimal_long" + + ", 'foo' _partition_string" + + ", 'bar' _partition_varchar" + + ", CAST('boo' AS CHAR(10)) _partition_char" + + ", CAST(1 AS TINYINT) _partition_tinyint" + + ", CAST(1 AS SMALLINT) _partition_smallint" + + ", 1 _partition_integer" + + ", CAST (1 AS BIGINT) _partition_bigint" + + ", true _partition_boolean" + + ", CAST('3.14' AS DECIMAL(3,2)) _partition_decimal_short" + + ", CAST('12345678901234567890.0123456789' AS DECIMAL(30,10)) _partition_decimal_long" + + ", CAST('2017-05-01' AS DATE) _partition_date" + + ", CAST('2017-05-01 10:12:34' AS TIMESTAMP) _partition_timestamp"; + + if (storageFormat == HiveStorageFormat.AVRO) { + select = select.replace(" CAST (3 AS SMALLINT) _smallint,", " 3 _smallint,"); + select = select.replace(" CAST (4 AS TINYINT) _tinyint,", " 4 _tinyint,"); + } + + assertUpdate(session, "INSERT INTO 
test_partitioned_table " + select, 1); + assertQuery(session, "SELECT * from test_partitioned_table", select); + assertQuery(session, + "SELECT * from test_partitioned_table WHERE" + + " 'foo' = _partition_string" + + " AND 'bar' = _partition_varchar" + + " AND CAST('boo' AS CHAR(10)) = _partition_char" + + " AND CAST(1 AS TINYINT) = _partition_tinyint" + + " AND CAST(1 AS SMALLINT) = _partition_smallint" + + " AND 1 = _partition_integer" + + " AND CAST(1 AS BIGINT) = _partition_bigint" + + " AND true = _partition_boolean" + + " AND CAST('3.14' AS DECIMAL(3,2)) = _partition_decimal_short" + + " AND CAST('12345678901234567890.0123456789' AS DECIMAL(30,10)) = _partition_decimal_long" + + " AND CAST('2017-05-01' AS DATE) = _partition_date" + + " AND CAST('2017-05-01 10:12:34' AS TIMESTAMP) = _partition_timestamp", + select); + + assertUpdate(session, "DROP TABLE test_partitioned_table"); + + assertFalse(getQueryRunner().tableExists(session, "test_partitioned_table")); + } + + @Test + public void createTableLike() + { + createTableLike("", false); + createTableLike("EXCLUDING PROPERTIES", false); + createTableLike("INCLUDING PROPERTIES", true); + } + + private void createTableLike(String likeSuffix, boolean hasPartition) + { + // Create a non-partitioned table + @Language("SQL") String createTable = "" + + "CREATE TABLE test_table_original (" + + " tinyint_col tinyint " + + ", smallint_col smallint" + + ")"; + assertUpdate(createTable); + + // Verify the table is correctly created + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_table_original"); + assertColumnType(tableMetadata, "tinyint_col", TINYINT); + assertColumnType(tableMetadata, "smallint_col", SMALLINT); + + // Create a partitioned table + @Language("SQL") String createPartitionedTable = "" + + "CREATE TABLE test_partitioned_table_original (" + + " string_col VARCHAR" + + ", decimal_long_col DECIMAL(30,10)" + + ", partition_bigint BIGINT" + + ", partition_decimal_long DECIMAL(30,10)" + + ") " + + "WITH (" + + "partitioned_by = ARRAY['partition_bigint', 'partition_decimal_long']" + + ")"; + assertUpdate(createPartitionedTable); + + // Verify the table is correctly created + tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_partitioned_table_original"); + + // Verify the partition keys are correctly created + List partitionedBy = ImmutableList.of("partition_bigint", "partition_decimal_long"); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), partitionedBy); + + // Verify the column types + assertColumnType(tableMetadata, "string_col", createUnboundedVarcharType()); + assertColumnType(tableMetadata, "partition_bigint", BIGINT); + assertColumnType(tableMetadata, "partition_decimal_long", createDecimalType(30, 10)); + + // Create a table using only one LIKE + @Language("SQL") String createTableSingleLike = "" + + "CREATE TABLE test_partitioned_table_single_like (" + + "LIKE test_partitioned_table_original " + likeSuffix + + ")"; + assertUpdate(createTableSingleLike); + + tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_partitioned_table_single_like"); + + // Verify the partitioned keys are correctly created if copying partition columns + verifyPartition(hasPartition, tableMetadata, partitionedBy); + + // Verify the column types + assertColumnType(tableMetadata, "string_col", createUnboundedVarcharType()); + assertColumnType(tableMetadata, "partition_bigint", BIGINT); + assertColumnType(tableMetadata, "partition_decimal_long", createDecimalType(30, 
10)); + + @Language("SQL") String createTableLikeExtra = "" + + "CREATE TABLE test_partitioned_table_like_extra (" + + " bigint_col BIGINT" + + ", double_col DOUBLE" + + ", LIKE test_partitioned_table_single_like " + likeSuffix + + ")"; + assertUpdate(createTableLikeExtra); + + tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_partitioned_table_like_extra"); + + // Verify the partitioned keys are correctly created if copying partition columns + verifyPartition(hasPartition, tableMetadata, partitionedBy); + + // Verify the column types + assertColumnType(tableMetadata, "bigint_col", BIGINT); + assertColumnType(tableMetadata, "double_col", DOUBLE); + assertColumnType(tableMetadata, "string_col", createUnboundedVarcharType()); + assertColumnType(tableMetadata, "partition_bigint", BIGINT); + assertColumnType(tableMetadata, "partition_decimal_long", createDecimalType(30, 10)); + + @Language("SQL") String createTableDoubleLike = "" + + "CREATE TABLE test_partitioned_table_double_like (" + + " LIKE test_table_original " + + ", LIKE test_partitioned_table_like_extra " + likeSuffix + + ")"; + assertUpdate(createTableDoubleLike); + + tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_partitioned_table_double_like"); + + // Verify the partitioned keys are correctly created if copying partition columns + verifyPartition(hasPartition, tableMetadata, partitionedBy); + + // Verify the column types + assertColumnType(tableMetadata, "tinyint_col", TINYINT); + assertColumnType(tableMetadata, "smallint_col", SMALLINT); + assertColumnType(tableMetadata, "string_col", createUnboundedVarcharType()); + assertColumnType(tableMetadata, "partition_bigint", BIGINT); + assertColumnType(tableMetadata, "partition_decimal_long", createDecimalType(30, 10)); + + assertUpdate("DROP TABLE test_table_original"); + assertUpdate("DROP TABLE test_partitioned_table_original"); + assertUpdate("DROP TABLE test_partitioned_table_single_like"); + assertUpdate("DROP TABLE test_partitioned_table_like_extra"); + assertUpdate("DROP TABLE test_partitioned_table_double_like"); + } + + @Test + public void testCreateOrcTransactionalTable() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_orc_transactional_table " + + "(" + + " a BIGINT," + + " b BIGINT" + + ") " + + "WITH (" + + STORAGE_FORMAT_PROPERTY + " = 'ORC', " + + TRANSACTIONAL + " = true" + + ") "; + assertUpdate(createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_orc_transactional_table"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), HiveStorageFormat.ORC); + assertEquals(tableMetadata.getMetadata().getProperties().get(TRANSACTIONAL), true); + + assertColumnType(tableMetadata, "a", BIGINT); + assertColumnType(tableMetadata, "b", BIGINT); + + assertUpdate(getSession(), "DROP TABLE test_orc_transactional_table"); + + assertFalse(getQueryRunner().tableExists(getSession(), "test_orc_transactional_table")); + } + + @Test + public void testVacuum() + { + String table = "tab1"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc')", + schema, table)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 4)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 6)", schema, table), 1); + + 
assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 3); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + + assertFilesAfterCleanup(tablePath, 1); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testFullVacuum1() + { + String table = "tab2"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc')", + schema, table)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 4)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 6)", schema, table), 1); + + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 3); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + + assertFilesAfterCleanup(tablePath, 1); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testFullUnifyVacuum1() + { + String table = "tab_fm_vacuum"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc')", + schema, table)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 4)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 6)", schema, table), 1); + + assertUpdate(String.format("VACUUM TABLE %s.%s FULL UNIFY AND WAIT", schema, table), 3); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + + assertFilesAfterCleanup(tablePath, 1); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testFullUnifyVacuum2() + { + String table = "tab_fm_vacuum_2"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc')", + schema, table)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 4)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 6)", schema, table), 1); + + assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 3); + assertUpdate(String.format("VACUUM TABLE %s.%s FULL UNIFY AND WAIT", schema, table), 3); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + + assertFilesAfterCleanup(tablePath, 1); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testFullVacuum2() + { + String table = "tab3"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc')", + schema, table)); + + 
assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 4)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 6)", schema, table), 1); + + assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 3); + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 3); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + + assertFilesAfterCleanup(tablePath, 1); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testVacuumOnDeleteDelta() + { + String table = "tab4"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc')", + schema, table)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 4)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 6)", schema, table), 1); + assertUpdate(String.format("UPDATE %s.%s SET b = -1 WHERE a > 2", schema, table), 2); + + assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 7); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + assertFilesAfterCleanup(tablePath, 2); + + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 3); + + assertFilesAfterCleanup(tablePath, 1); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testVacuumOnPartitionedTable1() + { + String table = "tab5"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + String partitionedColumn = "b"; + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc', partitioned_by=Array['%s'])", + schema, table, partitionedColumn)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 1)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (2, 1)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (2, 2)", schema, table), 1); + + assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 4); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = (String) tableMetadata.getMetadata().getProperties().get("location"); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("1", "2"), 1); + + assertUpdate(String.format("UPDATE %s.%s SET a = -1 WHERE a = 2", schema, table), 2); + + assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 8); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("1", "2"), 2); + + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 4); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("1", "2"), 1); + + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testVacuumOnPartitionedTable2() + { + String table = "tab6"; + 
String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + String partitionedColumn = "b"; + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc', partitioned_by=Array['%s'])", + schema, table, partitionedColumn)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 1)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (2, 1)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (2, 2)", schema, table), 1); + + assertUpdate(String.format("VACUUM TABLE %s.%s PARTITION '%s=1' AND WAIT", schema, table, partitionedColumn), 2); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = (String) tableMetadata.getMetadata().getProperties().get("location"); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("1"), 1); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("2"), 2); + } + + @Test + public void testVacuumOnTableWithZeroRows() + { + String table = "tab7"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc')", + schema, table)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 4)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 6)", schema, table), 1); + assertUpdate(String.format("DELETE FROM %s.%s", schema, table), 3); + + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 0); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + assertFilesAfterCleanup(tablePath, 1); + + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testVacuumOnTableWithZeroRowsOnPartitionTable() + { + String table = "tab8"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + String partitionedColumn = "b"; + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc', partitioned_by=Array['%s'])", + schema, table, partitionedColumn)); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 1)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 2)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (2, 1)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (2, 2)", schema, table), 1); + assertUpdate(String.format("DELETE FROM %s.%s WHERE %s=2", schema, table, partitionedColumn), 2); + + assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 6); + + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = ((String) tableMetadata.getMetadata().getProperties().get("location")); + + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("1"), 1); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("2"), 2); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void 
testVacuumOnPartitionedTable() + { + String table = "tab7_partitioned"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + + String partitionedColumn = "b"; + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int) with (transactional=true, format='orc', partitioned_by=Array['%s'])", + schema, table, partitionedColumn)); + TableMetadata tableMetadata = getTableMetadata("hive", schema, table); + String tablePath = (String) tableMetadata.getMetadata().getProperties().get("location"); + + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1,1),(1,2)", schema, table), 2); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (2,1),(2,2)", schema, table), 2); + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 4); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("2"), 1); + //INSERT ONLY to partition b=1 and CALL VACUUM FULL, should compact only partition b=1 with 4 rows. + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 1)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (4, 1)", schema, table), 1); + String[] part2Dirs = listPartition(tablePath, "b=2"); + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 4); + verifyPartitionDirs(tablePath, "b=2", part2Dirs.length, part2Dirs); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("1"), 1); + //INSERT ONLY to partition b=3 and CALL VACUUM FULL, should compact only partition b=3 with 2 rows. + String[] part1Dirs = listPartition(tablePath, "b=1"); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (3, 3)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (4, 3)", schema, table), 1); + assertUpdate(String.format("VACUUM TABLE %s.%s FULL AND WAIT", schema, table), 2); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (5, 3)", schema, table), 1); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (6, 3)", schema, table), 1); + //partition 3 should now have baseDir along with 2 delta dirs. + String[] part3Dirs = listPartition(tablePath, "b=3"); + long minId = Long.MAX_VALUE; + long maxId = Long.MIN_VALUE; + for (String delta : part3Dirs) { + Matcher matcher = DELTA_PATTERN.matcher(delta); + if (matcher.matches()) { + minId = Math.min(Long.parseLong(matcher.group(2)), minId); + maxId = Math.max(Long.parseLong(matcher.group(3)), maxId); + } + } + assertUpdate(String.format("VACUUM TABLE %s.%s AND WAIT", schema, table), 2); + verifyPartitionDirs(tablePath, "b=2", part2Dirs.length, part2Dirs); + verifyPartitionDirs(tablePath, "b=1", part1Dirs.length, part1Dirs); + assertFilesAfterCleanupOnPartitionTable(tablePath, partitionedColumn, ImmutableList.of("3"), 2); + verifyPartitionDirs(tablePath, "b=3", 2, part3Dirs[0], String.format("delta_%07d_%07d", minId, maxId)); + } + + private static final Pattern DELTA_PATTERN = Pattern.compile("(delete_)?delta_(\\d+)_(\\d+)(_\\d+)?"); + private static final Pattern BASE_PATTERN = Pattern.compile("base_(\\d+)"); + + private String[] listPartition(String tablePath, String partition) + { + if (tablePath.startsWith("file:")) { + tablePath = tablePath.replace("file:", ""); + } + String[] partitionDirs = new File(tablePath + "/" + partition).list((f, s) -> !s.startsWith(".")); + Arrays.sort(partitionDirs); + return partitionDirs; + } + + private void verifyPartitionDirs(String tablePath, String partition, int expectedDirs, String... 
expectedBaseFile) + { + String[] partitionDirs = listPartition(tablePath, partition); + System.out.println(Arrays.toString(partitionDirs)); + assertEquals(partitionDirs.length, expectedDirs); + for (int i = 0; i < expectedDirs; i++) { + assertEquals(partitionDirs[i], expectedBaseFile[i]); + } + } + + private void assertFilesAfterCleanupOnPartitionTable(String tablePath, String partitionedColumn, ImmutableList partitionValue, int expectedNumberOfDirectories) + { + partitionValue.forEach(value -> { + String partitionPath = tablePath + "/" + partitionedColumn + "=" + value; + assertFilesAfterCleanup(partitionPath, expectedNumberOfDirectories); + }); + } + + private void assertFilesAfterCleanup(String tablePath, int expectedNumberOfDirectories) + { + int loopNumber = 50; + if (tablePath.startsWith("file:")) { + tablePath = tablePath.replace("file:", ""); + } + String[] otherDirectories; + do { + try { + Thread.sleep(10); + } + catch (InterruptedException e) { + // Ignore + } + otherDirectories = new File(tablePath).list(new FilenameFilter() + { + @Override + public boolean accept(File file, String s) + { + // Ignore hidden directories + return !s.startsWith("."); + } + }); + try { + assertEquals(otherDirectories.length, expectedNumberOfDirectories); + break; + } + catch (AssertionError e) { + // Ignore + } + } + while (loopNumber-- > 0); + + if (loopNumber < 1) { + assertEquals(otherDirectories.length, expectedNumberOfDirectories, + String.format("Unexpected directories on path %s", tablePath)); + } + } + + @Test + public void testCreateTableAs() + { + testWithAllStorageFormats(this::testCreateTableAs); + } + + private void testCreateTableAs(Session session, HiveStorageFormat storageFormat) + { + @Language("SQL") String select = "SELECT" + + " 'foo' _varchar" + + ", CAST('bar' AS CHAR(10)) _char" + + ", CAST (1 AS BIGINT) _bigint" + + ", 2 _integer" + + ", CAST (3 AS SMALLINT) _smallint" + + ", CAST (4 AS TINYINT) _tinyint" + + ", CAST ('123.45' as REAL) _real" + + ", CAST('3.14' AS DOUBLE) _double" + + ", true _boolean" + + ", CAST('3.14' AS DECIMAL(3,2)) _decimal_short" + + ", CAST('12345678901234567890.0123456789' AS DECIMAL(30,10)) _decimal_long"; + + if (storageFormat == HiveStorageFormat.AVRO) { + select = select.replace(" CAST (3 AS SMALLINT) _smallint,", " 3 _smallint,"); + select = select.replace(" CAST (4 AS TINYINT) _tinyint,", " 4 _tinyint,"); + } + + String createTableAs = format("CREATE TABLE test_format_table WITH (format = '%s') AS %s", storageFormat, select); + + assertUpdate(session, createTableAs, 1); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_format_table"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + assertColumnType(tableMetadata, "_varchar", createVarcharType(3)); + assertColumnType(tableMetadata, "_char", createCharType(10)); + + // assure reader supports basic column reordering and pruning + assertQuery(session, "SELECT _integer, _varchar, _integer from test_format_table", "SELECT 2, 'foo', 2"); + + assertQuery(session, "SELECT * from test_format_table", select); + + assertUpdate(session, "DROP TABLE test_format_table"); + + assertFalse(getQueryRunner().tableExists(session, "test_format_table")); + } + + @Test + public void testCreatePartitionedTableAs() + { + testWithAllStorageFormats(this::testCreatePartitionedTableAs); + testWithAllStorageFormats(this::testCreatePartitionedTableAsWithPartitionedRedistribute); + } + + private void testCreatePartitionedTableAs(Session 
session, HiveStorageFormat storageFormat) + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_create_partitioned_table_as " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'SHIP_PRIORITY', 'ORDER_STATUS' ]" + + ") " + + "AS " + + "SELECT orderkey AS order_key, shippriority AS ship_priority, orderstatus AS order_status " + + "FROM tpch.tiny.orders"; + + assertUpdate(session, createTable, "SELECT count(*) from orders"); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_create_partitioned_table_as"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("ship_priority", "order_status")); + + List partitions = getPartitions("test_create_partitioned_table_as"); + assertEquals(partitions.size(), 3); + + assertQuery(session, "SELECT * from test_create_partitioned_table_as", "SELECT orderkey, shippriority, orderstatus FROM orders"); + + assertUpdate(session, "DROP TABLE test_create_partitioned_table_as"); + + assertFalse(getQueryRunner().tableExists(session, "test_create_partitioned_table_as")); + } + + // Presto: test case for partitioned redistribute writes type + private void testCreatePartitionedTableAsWithPartitionedRedistribute(Session session, HiveStorageFormat storageFormat) + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_create_partitioned_table_as " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'SHIP_PRIORITY', 'ORDER_STATUS' ]" + + ") " + + "AS " + + "SELECT orderkey AS order_key, shippriority AS ship_priority, orderstatus AS order_status " + + "FROM tpch.tiny.orders"; + + Long count = (Long) computeActual("SELECT count(*) from orders").getOnlyValue(); + assertUpdate(Session.builder(getSession()) + .setSystemProperty("redistribute_writes_type", "PARTITIONED") + .build(), + createTable, count, assertRemotePartitionedExchange("orderstatus")); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_create_partitioned_table_as"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("ship_priority", "order_status")); + + List partitions = getPartitions("test_create_partitioned_table_as"); + assertEquals(partitions.size(), 3); + + assertQuery(session, "SELECT * from test_create_partitioned_table_as", "SELECT orderkey, shippriority, orderstatus FROM orders"); + + assertUpdate(session, "DROP TABLE test_create_partitioned_table_as"); + + assertFalse(getQueryRunner().tableExists(session, "test_create_partitioned_table_as")); + } + + @Test + public void testPropertiesTable() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_show_properties" + + " WITH (" + + "format = 'orc', " + + "partitioned_by = ARRAY['ship_priority', 'order_status']," + + "orc_bloom_filter_columns = ARRAY['ship_priority', 'order_status']," + + "orc_bloom_filter_fpp = 0.5" + + ") " + + "AS " + + "SELECT orderkey AS order_key, shippriority AS ship_priority, orderstatus AS order_status " + + "FROM tpch.tiny.orders"; + + assertUpdate(createTable, "SELECT count(*) FROM orders"); + String queryId = (String) computeScalar("SELECT query_id FROM system.runtime.queries WHERE query LIKE 'CREATE TABLE test_show_properties%'"); + String nodeVersion = 
(String) computeScalar("SELECT node_version FROM system.runtime.nodes WHERE coordinator"); + assertQuery("SELECT * FROM \"test_show_properties$properties\"", + "SELECT '" + "ship_priority,order_status" + "','" + "0.5" + "','" + queryId + "','" + nodeVersion + "'"); + assertUpdate("DROP TABLE test_show_properties"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Partition keys must be the last columns in the table and in the same order as the table properties.*") + public void testCreatePartitionedTableInvalidColumnOrdering() + { + assertUpdate("" + + "CREATE TABLE test_create_table_invalid_column_ordering\n" + + "(grape bigint, apple varchar, orange bigint, pear varchar)\n" + + "WITH (partitioned_by = ARRAY['apple'])"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Partition keys must be the last columns in the table and in the same order as the table properties.*") + public void testCreatePartitionedTableAsInvalidColumnOrdering() + { + assertUpdate("" + + "CREATE TABLE test_create_table_as_invalid_column_ordering " + + "WITH (partitioned_by = ARRAY['SHIP_PRIORITY', 'ORDER_STATUS']) " + + "AS " + + "SELECT shippriority AS ship_priority, orderkey AS order_key, orderstatus AS order_status " + + "FROM tpch.tiny.orders"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Table contains only partition columns") + public void testCreateTableOnlyPartitionColumns() + { + assertUpdate("" + + "CREATE TABLE test_create_table_only_partition_columns\n" + + "(grape bigint, apple varchar, orange bigint, pear varchar)\n" + + "WITH (partitioned_by = ARRAY['grape', 'apple', 'orange', 'pear'])"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Partition columns .* not present in schema") + public void testCreateTableNonExistentPartitionColumns() + { + assertUpdate("" + + "CREATE TABLE test_create_table_nonexistent_partition_columns\n" + + "(grape bigint, apple varchar, orange bigint, pear varchar)\n" + + "WITH (partitioned_by = ARRAY['dragonfruit'])"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Unsupported type .* for partition: .*") + public void testCreateTableUnsupportedPartitionType() + { + assertUpdate("" + + "CREATE TABLE test_create_table_unsupported_partition_type " + + "(foo bigint, bar ARRAY(varchar)) " + + "WITH (partitioned_by = ARRAY['bar'])"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Unsupported type .* for partition: a") + public void testCreateTableUnsupportedPartitionTypeAs() + { + assertUpdate("" + + "CREATE TABLE test_create_table_unsupported_partition_type_as " + + "WITH (partitioned_by = ARRAY['a']) " + + "AS " + + "SELECT 123 x, ARRAY ['foo'] a"); + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Unsupported Hive type: varchar\\(65536\\)\\. 
Supported VARCHAR types: VARCHAR\\(<=65535\\), VARCHAR\\.") + public void testCreateTableNonSupportedVarcharColumn() + { + assertUpdate("CREATE TABLE test_create_table_non_supported_varchar_column (apple varchar(65536))"); + } + + @Test + public void testCreatePartitionedBucketedTableAsFewRows() + { + // go through all storage formats to make sure the empty buckets are correctly created + testWithAllStorageFormats(this::testCreatePartitionedBucketedTableAsFewRows); + } + + private void testCreatePartitionedBucketedTableAsFewRows(Session session, HiveStorageFormat storageFormat) + { + testCreatePartitionedBucketedTableAsFewRows(session, storageFormat, true); + testCreatePartitionedBucketedTableAsFewRows(session, storageFormat, false); + } + + private void testCreatePartitionedBucketedTableAsFewRows(Session session, HiveStorageFormat storageFormat, boolean createEmpty) + { + String tableName = "test_create_partitioned_bucketed_table_as_few_rows"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'partition_key' ], " + + "bucketed_by = ARRAY[ 'bucket_key' ], " + + "bucket_count = 11 " + + ") " + + "AS " + + "SELECT * " + + "FROM (" + + "VALUES " + + " (VARCHAR 'a', VARCHAR 'b', VARCHAR 'c'), " + + " ('aa', 'bb', 'cc'), " + + " ('aaa', 'bbb', 'ccc')" + + ") t(bucket_key, col, partition_key)"; + + assertUpdate( + // make sure that we will get one file per bucket regardless of writer count configured + Session.builder(getParallelWriteSession()) + .setCatalogSessionProperty(catalog, "create_empty_bucket_files", String.valueOf(createEmpty)) + .build(), + createTable, + 3); + + verifyPartitionedBucketedTableAsFewRows(storageFormat, tableName); + + assertUpdate(session, "DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + @Test + public void testCreatePartitionedBucketedTableAs() + { + testCreatePartitionedBucketedTableAs(HiveStorageFormat.RCBINARY); + } + + private void testCreatePartitionedBucketedTableAs(HiveStorageFormat storageFormat) + { + String tableName = "test_create_partitioned_bucketed_table_as"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'orderstatus' ], " + + "bucketed_by = ARRAY[ 'custkey', 'custkey2' ], " + + "bucket_count = 11 " + + ") " + + "AS " + + "SELECT custkey, custkey AS custkey2, comment, orderstatus " + + "FROM tpch.tiny.orders"; + + assertUpdate( + // make sure that we will get one file per bucket regardless of writer count configured + getParallelWriteSession(), + createTable, + "SELECT count(*) from orders"); + + verifyPartitionedBucketedTable(storageFormat, tableName); + + assertUpdate("DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + } + + @Test + public void testCreatePartitionedBucketedTableAsWithUnionAll() + { + testCreatePartitionedBucketedTableAsWithUnionAll(HiveStorageFormat.RCBINARY); + } + + private void testCreatePartitionedBucketedTableAsWithUnionAll(HiveStorageFormat storageFormat) + { + String tableName = "test_create_partitioned_bucketed_table_as_with_union_all"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'orderstatus' ], " + + "bucketed_by = ARRAY[ 'custkey', 'custkey2' ], " + + "bucket_count = 11 " + + 
") " + + "AS " + + "SELECT custkey, custkey AS custkey2, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE length(comment) % 2 = 0 " + + "UNION ALL " + + "SELECT custkey, custkey AS custkey2, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE length(comment) % 2 = 1"; + + assertUpdate( + // make sure that we will get one file per bucket regardless of writer count configured + getParallelWriteSession(), + createTable, + "SELECT count(*) from orders"); + + verifyPartitionedBucketedTable(storageFormat, tableName); + + assertUpdate("DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + } + + private void verifyPartitionedBucketedTable(HiveStorageFormat storageFormat, String tableName) + { + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("orderstatus")); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("custkey", "custkey2")); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11); + + List partitions = getPartitions(tableName); + assertEquals(partitions.size(), 3); + + assertQuery("SELECT * from " + tableName, "SELECT custkey, custkey, comment, orderstatus FROM orders"); + + for (int i = 1; i <= 30; i++) { + assertQuery( + format("SELECT * from " + tableName + " where custkey = %d and custkey2 = %d", i, i), + format("SELECT custkey, custkey, comment, orderstatus FROM orders where custkey = %d", i)); + } + } + + @Test + public void testCreateInvalidBucketedTable() + { + testCreateInvalidBucketedTable(HiveStorageFormat.RCBINARY); + } + + private void testCreateInvalidBucketedTable(HiveStorageFormat storageFormat) + { + String tableName = "test_create_invalid_bucketed_table"; + + try { + computeActual("" + + "CREATE TABLE " + tableName + " (" + + " a BIGINT," + + " b DOUBLE," + + " p VARCHAR" + + ") WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'p' ], " + + "bucketed_by = ARRAY[ 'a', 'c' ], " + + "bucket_count = 11 " + + ")"); + fail(); + } + catch (Exception e) { + assertEquals(e.getMessage(), "Bucketing columns [c] not present in schema"); + } + + try { + computeActual("" + + "CREATE TABLE " + tableName + " " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'orderstatus' ], " + + "bucketed_by = ARRAY[ 'custkey', 'custkey3' ], " + + "bucket_count = 11 " + + ") " + + "AS " + + "SELECT custkey, custkey AS custkey2, comment, orderstatus " + + "FROM tpch.tiny.orders"); + fail(); + } + catch (Exception e) { + assertEquals(e.getMessage(), "Bucketing columns [custkey3] not present in schema"); + } + + assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + } + + @Test + public void testCreatePartitionedUnionAll() + { + assertUpdate("CREATE TABLE test_create_partitioned_union_all (a varchar, ds varchar) WITH (partitioned_by = ARRAY['ds'])"); + assertUpdate("INSERT INTO test_create_partitioned_union_all SELECT 'a', '2013-05-17' UNION ALL SELECT 'b', '2013-05-17'", 2); + assertUpdate("DROP TABLE test_create_partitioned_union_all"); + } + + @Test + public void testInsertPartitionedBucketedTableFewRows() + { + // go through all storage formats to make sure the empty buckets are correctly created + 
testWithAllStorageFormats(this::testInsertPartitionedBucketedTableFewRows); + } + + private void testInsertPartitionedBucketedTableFewRows(Session session, HiveStorageFormat storageFormat) + { + String tableName = "test_insert_partitioned_bucketed_table_few_rows"; + + assertUpdate(session, "" + + "CREATE TABLE " + tableName + " (" + + " bucket_key varchar," + + " col varchar," + + " partition_key varchar)" + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'partition_key' ], " + + "bucketed_by = ARRAY[ 'bucket_key' ], " + + "bucket_count = 11)"); + + assertUpdate( + // make sure that we will get one file per bucket regardless of writer count configured + getParallelWriteSession(), + "INSERT INTO " + tableName + " " + + "VALUES " + + " (VARCHAR 'a', VARCHAR 'b', VARCHAR 'c'), " + + " ('aa', 'bb', 'cc'), " + + " ('aaa', 'bbb', 'ccc')", + 3); + + verifyPartitionedBucketedTableAsFewRows(storageFormat, tableName); + + assertUpdate(session, "DROP TABLE test_insert_partitioned_bucketed_table_few_rows"); + assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + private void verifyPartitionedBucketedTableAsFewRows(HiveStorageFormat storageFormat, String tableName) + { + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("partition_key")); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("bucket_key")); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11); + + List partitions = getPartitions(tableName); + assertEquals(partitions.size(), 3); + + MaterializedResult actual = computeActual("SELECT * from " + tableName); + MaterializedResult expected = resultBuilder(getSession(), canonicalizeType(createUnboundedVarcharType()), canonicalizeType(createUnboundedVarcharType()), canonicalizeType(createUnboundedVarcharType())) + .row("a", "b", "c") + .row("aa", "bb", "cc") + .row("aaa", "bbb", "ccc") + .build(); + assertEqualsIgnoreOrder(actual.getMaterializedRows(), expected.getMaterializedRows()); + } + + @Test + public void testCastNullToColumnTypes() + { + String tableName = "test_cast_null_to_column_types"; + + assertUpdate("" + + "CREATE TABLE " + tableName + " (" + + " col1 bigint," + + " col2 map(bigint, bigint)," + + " partition_key varchar)" + + "WITH (" + + " format = 'ORC', " + + " partitioned_by = ARRAY[ 'partition_key' ] " + + ")"); + + assertUpdate(format("INSERT INTO %s (col1) VALUES (1), (2), (3)", tableName), 3); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testCreateEmptyNonBucketedPartition() + { + String tableName = "test_insert_empty_partitioned_unbucketed_table"; + assertUpdate("" + + "CREATE TABLE " + tableName + " (" + + " dummy_col bigint," + + " part varchar)" + + "WITH (" + + " format = 'ORC', " + + " partitioned_by = ARRAY[ 'part' ] " + + ")"); + assertQuery(format("SELECT count(*) FROM \"%s$partitions\"", tableName), "SELECT 0"); + + // create an empty partition + assertUpdate(String.format("CALL system.create_empty_partition('%s', '%s', ARRAY['part'], ARRAY['%s'])", TPCH_SCHEMA, tableName, "empty")); + assertQuery(format("SELECT count(*) FROM \"%s$partitions\"", tableName), "SELECT 1"); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void 
testCreateEmptyBucketedPartition() + { + for (TestingHiveStorageFormat storageFormat : getAllTestingHiveStorageFormat()) { + testCreateEmptyBucketedPartition(storageFormat.getFormat()); + } + } + + private void testCreateEmptyBucketedPartition(HiveStorageFormat storageFormat) + { + String tableName = "test_insert_empty_partitioned_bucketed_table"; + createPartitionedBucketedTable(tableName, storageFormat); + + List orderStatusList = ImmutableList.of("F", "O", "P"); + for (int i = 0; i < orderStatusList.size(); i++) { + String sql = String.format("CALL system.create_empty_partition('%s', '%s', ARRAY['orderstatus'], ARRAY['%s'])", TPCH_SCHEMA, tableName, orderStatusList.get(i)); + assertUpdate(sql); + assertQuery( + format("SELECT count(*) FROM \"%s$partitions\"", tableName), + "SELECT " + (i + 1)); + + assertQueryFails(sql, "Partition already exists.*"); + } + + assertUpdate("DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + } + + @Test + public void testInsertPartitionedBucketedTable() + { + testInsertPartitionedBucketedTable(HiveStorageFormat.RCBINARY); + } + + private void testInsertPartitionedBucketedTable(HiveStorageFormat storageFormat) + { + String tableName = "test_insert_partitioned_bucketed_table"; + createPartitionedBucketedTable(tableName, storageFormat); + + List orderStatusList = ImmutableList.of("F", "O", "P"); + for (int i = 0; i < orderStatusList.size(); i++) { + String orderStatus = orderStatusList.get(i); + assertUpdate( + // make sure that we will get one file per bucket regardless of writer count configured + getParallelWriteSession(), + format( + "INSERT INTO " + tableName + " " + + "SELECT custkey, custkey AS custkey2, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE orderstatus = '%s'", + orderStatus), + format("SELECT count(*) from orders where orderstatus = '%s'", orderStatus)); + } + + verifyPartitionedBucketedTable(storageFormat, tableName); + + assertUpdate("DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + } + + private void createPartitionedBucketedTable(String tableName, HiveStorageFormat storageFormat) + { + assertUpdate("" + + "CREATE TABLE " + tableName + " (" + + " custkey bigint," + + " custkey2 bigint," + + " comment varchar," + + " orderstatus varchar)" + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'orderstatus' ], " + + "bucketed_by = ARRAY[ 'custkey', 'custkey2' ], " + + "bucket_count = 11)"); + } + + @Test + public void testInsertPartitionedBucketedTableWithUnionAll() + { + testInsertPartitionedBucketedTableWithUnionAll(HiveStorageFormat.RCBINARY); + } + + private void testInsertPartitionedBucketedTableWithUnionAll(HiveStorageFormat storageFormat) + { + String tableName = "test_insert_partitioned_bucketed_table_with_union_all"; + + assertUpdate("" + + "CREATE TABLE " + tableName + " (" + + " custkey bigint," + + " custkey2 bigint," + + " comment varchar," + + " orderstatus varchar)" + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'orderstatus' ], " + + "bucketed_by = ARRAY[ 'custkey', 'custkey2' ], " + + "bucket_count = 11)"); + + List orderStatusList = ImmutableList.of("F", "O", "P"); + for (int i = 0; i < orderStatusList.size(); i++) { + String orderStatus = orderStatusList.get(i); + assertUpdate( + // make sure that we will get one file per bucket regardless of writer count configured + getParallelWriteSession(), + format( + "INSERT INTO " + tableName + " " + + 
"SELECT custkey, custkey AS custkey2, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE orderstatus = '%s' and length(comment) %% 2 = 0 " + + "UNION ALL " + + "SELECT custkey, custkey AS custkey2, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE orderstatus = '%s' and length(comment) %% 2 = 1", + orderStatus, orderStatus), + format("SELECT count(*) from orders where orderstatus = '%s'", orderStatus)); + } + + verifyPartitionedBucketedTable(storageFormat, tableName); + + assertUpdate("DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + } + + @Test + public void testInsert() + { + testWithAllStorageFormats(this::testInsert); + } + + private void testInsert(Session session, HiveStorageFormat storageFormat) + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_insert_format_table " + + "(" + + " _string VARCHAR," + + " _varchar VARCHAR(65535)," + + " _char CHAR(10)," + + " _bigint BIGINT," + + " _integer INTEGER," + + " _smallint SMALLINT," + + " _tinyint TINYINT," + + " _real REAL," + + " _double DOUBLE," + + " _boolean BOOLEAN," + + " _decimal_short DECIMAL(3,2)," + + " _decimal_long DECIMAL(30,10)" + + ") " + + "WITH (format = '" + storageFormat + "') "; + + if (storageFormat == HiveStorageFormat.AVRO) { + createTable = createTable.replace(" _smallint SMALLINT,", " _smallint INTEGER,"); + createTable = createTable.replace(" _tinyint TINYINT,", " _tinyint INTEGER,"); + } + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_insert_format_table"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + assertColumnType(tableMetadata, "_string", createUnboundedVarcharType()); + assertColumnType(tableMetadata, "_varchar", createVarcharType(65535)); + assertColumnType(tableMetadata, "_char", createCharType(10)); + + @Language("SQL") String select = "SELECT" + + " 'foo' _string" + + ", 'bar' _varchar" + + ", CAST('boo' AS CHAR(10)) _char" + + ", 1 _bigint" + + ", CAST(42 AS INTEGER) _integer" + + ", CAST(43 AS SMALLINT) _smallint" + + ", CAST(44 AS TINYINT) _tinyint" + + ", CAST('123.45' AS REAL) _real" + + ", CAST('3.14' AS DOUBLE) _double" + + ", true _boolean" + + ", CAST('3.14' AS DECIMAL(3,2)) _decimal_short" + + ", CAST('12345678901234567890.0123456789' AS DECIMAL(30,10)) _decimal_long"; + + if (storageFormat == HiveStorageFormat.AVRO) { + select = select.replace(" CAST (43 AS SMALLINT) _smallint,", " 3 _smallint,"); + select = select.replace(" CAST (44 AS TINYINT) _tinyint,", " 4 _tinyint,"); + } + + assertUpdate(session, "INSERT INTO test_insert_format_table " + select, 1); + + assertQuery(session, "SELECT * from test_insert_format_table", select); + + assertUpdate(session, "INSERT INTO test_insert_format_table (_tinyint, _smallint, _integer, _bigint, _real, _double) SELECT CAST(1 AS TINYINT), CAST(2 AS SMALLINT), 3, 4, cast(14.3E0 as REAL), 14.3E0", 1); + + assertQuery(session, "SELECT * from test_insert_format_table where _bigint = 4", "SELECT null, null, null, 4, 3, 2, 1, 14.3, 14.3, null, null, null"); + + assertQuery(session, "SELECT * from test_insert_format_table where _real = CAST(14.3 as REAL)", "SELECT null, null, null, 4, 3, 2, 1, 14.3, 14.3, null, null, null"); + + assertUpdate(session, "INSERT INTO test_insert_format_table (_double, _bigint) SELECT 2.72E0, 3", 1); + + assertQuery(session, "SELECT * from test_insert_format_table where _bigint = 3", "SELECT null, null, 
null, 3, null, null, null, null, 2.72, null, null, null"); + + assertUpdate(session, "INSERT INTO test_insert_format_table (_decimal_short, _decimal_long) SELECT DECIMAL '2.72', DECIMAL '98765432101234567890.0123456789'", 1); + + assertQuery(session, "SELECT * from test_insert_format_table where _decimal_long = DECIMAL '98765432101234567890.0123456789'", "SELECT null, null, null, null, null, null, null, null, null, null, 2.72, 98765432101234567890.0123456789"); + + assertUpdate(session, "DROP TABLE test_insert_format_table"); + + assertFalse(getQueryRunner().tableExists(session, "test_insert_format_table")); + } + + @Test + public void testInsertPartitionedTable() + { + testWithAllStorageFormats(this::testInsertPartitionedTable); + testWithAllStorageFormats(this::testInsertPartitionedTableWithPartitionedRedistribute); + } + + private void testInsertPartitionedTable(Session session, HiveStorageFormat storageFormat) + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_insert_partitioned_table " + + "(" + + " ORDER_KEY BIGINT," + + " SHIP_PRIORITY INTEGER," + + " ORDER_STATUS VARCHAR" + + ") " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'SHIP_PRIORITY', 'ORDER_STATUS' ]" + + ") "; + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_insert_partitioned_table"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("ship_priority", "order_status")); + + String partitionsTable = "\"test_insert_partitioned_table$partitions\""; + + assertQuery( + session, + "SELECT * FROM " + partitionsTable, + "SELECT shippriority, orderstatus FROM orders LIMIT 0"); + + // Hive will reorder the partition keys, so we must insert into the table assuming the partition keys have been moved to the end + Long count = (Long) computeActual("SELECT count(*) from orders").getOnlyValue(); + assertUpdate( + session, + "" + + "INSERT INTO test_insert_partitioned_table " + + "SELECT orderkey, shippriority, orderstatus " + + "FROM tpch.tiny.orders", + "SELECT count(*) from orders"); + + // verify the partitions + List partitions = getPartitions("test_insert_partitioned_table"); + assertEquals(partitions.size(), 3); + + assertQuery(session, "SELECT * from test_insert_partitioned_table", "SELECT orderkey, shippriority, orderstatus FROM orders"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable, + "SELECT DISTINCT shippriority, orderstatus FROM orders"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable + " ORDER BY order_status LIMIT 2", + "SELECT DISTINCT shippriority, orderstatus FROM orders ORDER BY orderstatus LIMIT 2"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable + " WHERE order_status = 'O'", + "SELECT DISTINCT shippriority, orderstatus FROM orders WHERE orderstatus = 'O'"); + + assertQueryFails(session, "SELECT * FROM " + partitionsTable + " WHERE no_such_column = 1", "line \\S*: Column 'no_such_column' cannot be resolved"); + assertQueryFails(session, "SELECT * FROM " + partitionsTable + " WHERE orderkey = 1", "line \\S*: Column 'orderkey' cannot be resolved"); + + assertUpdate(session, "DROP TABLE test_insert_partitioned_table"); + + assertFalse(getQueryRunner().tableExists(session, "test_insert_partitioned_table")); + } + + // Presto: test case for partitioned redistribute writes type + private 
void testInsertPartitionedTableWithPartitionedRedistribute(Session session, HiveStorageFormat storageFormat) + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_insert_partitioned_table " + + "(" + + " ORDER_KEY BIGINT," + + " SHIP_PRIORITY INTEGER," + + " ORDER_STATUS VARCHAR" + + ") " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'SHIP_PRIORITY', 'ORDER_STATUS' ]" + + ") "; + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_insert_partitioned_table"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("ship_priority", "order_status")); + + String partitionsTable = "\"test_insert_partitioned_table$partitions\""; + + assertQuery( + session, + "SELECT * FROM " + partitionsTable, + "SELECT shippriority, orderstatus FROM orders LIMIT 0"); + + // Hive will reorder the partition keys, so we must insert into the table assuming the partition keys have been moved to the end + Long count = (Long) computeActual("SELECT count(*) from orders").getOnlyValue(); + assertUpdate( + Session.builder(getSession()) + .setSystemProperty("redistribute_writes_type", "PARTITIONED") + .build(), + "" + + "INSERT INTO test_insert_partitioned_table " + + "SELECT orderkey, shippriority, orderstatus " + + "FROM tpch.tiny.orders", + count, assertRemotePartitionedExchange("orderstatus")); + + // verify the partitions + List partitions = getPartitions("test_insert_partitioned_table"); + assertEquals(partitions.size(), 3); + + assertQuery(session, "SELECT * from test_insert_partitioned_table", "SELECT orderkey, shippriority, orderstatus FROM orders"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable, + "SELECT DISTINCT shippriority, orderstatus FROM orders"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable + " ORDER BY order_status LIMIT 2", + "SELECT DISTINCT shippriority, orderstatus FROM orders ORDER BY orderstatus LIMIT 2"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable + " WHERE order_status = 'O'", + "SELECT DISTINCT shippriority, orderstatus FROM orders WHERE orderstatus = 'O'"); + + assertUpdate(session, "DROP TABLE test_insert_partitioned_table"); + + assertFalse(getQueryRunner().tableExists(session, "test_insert_partitioned_table")); + } + + @Test + public void testInsertPartitionedTableExistingPartition() + { + testWithAllStorageFormats(this::testInsertPartitionedTableExistingPartition); + } + + private void testInsertPartitionedTableExistingPartition(Session session, HiveStorageFormat storageFormat) + { + String tableName = "test_insert_partitioned_table_existing_partition"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "(" + + " order_key BIGINT," + + " comment VARCHAR," + + " order_status VARCHAR" + + ") " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'order_status' ]" + + ") "; + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("order_status")); + + for (int i = 0; i < 3; i++) { + assertUpdate( + session, + format( + "INSERT 
INTO " + tableName + " " + + "SELECT orderkey, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE orderkey %% 3 = %d", + i), + format("SELECT count(*) from orders where orderkey %% 3 = %d", i)); + } + + // verify the partitions + List partitions = getPartitions(tableName); + assertEquals(partitions.size(), 3); + + assertQuery( + session, + "SELECT * from " + tableName, + "SELECT orderkey, comment, orderstatus FROM orders"); + + assertUpdate(session, "DROP TABLE " + tableName); + + assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + @Test + public void testInsertPartitionedTableOverwriteExistingPartition() + { + testInsertPartitionedTableOverwriteExistingPartition( + Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "insert_existing_partitions_behavior", "OVERWRITE") + .build(), + HiveStorageFormat.ORC, false); + } + + @Test + public void testInsertPartitionedTxnTableOverwriteExistingPartition() + { + testInsertPartitionedTableOverwriteExistingPartition( + Session.builder(getSession()) + .setCatalogSessionProperty(catalog, "insert_existing_partitions_behavior", "OVERWRITE") + .build(), + HiveStorageFormat.ORC, true); + } + + private void testInsertPartitionedTableOverwriteExistingPartition(Session session, HiveStorageFormat storageFormat, boolean transactional) + { + String tableName = "test_insert_partitioned_table_overwrite_existing_partition"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "(" + + " order_key BIGINT," + + " comment VARCHAR," + + " order_status VARCHAR" + + ") " + + "WITH (" + + (transactional ? "transactional=true, " : "") + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'order_status' ]" + + ") "; + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("order_status")); + + for (int i = 0; i < 3; i++) { + assertUpdate( + session, + format( + "INSERT OVERWRITE " + tableName + " " + + "SELECT orderkey, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE orderkey %% 3 = %d", + i), + format("SELECT count(*) from orders where orderkey %% 3 = %d", i)); + + // verify the partitions + List partitions = getPartitions(tableName); + assertEquals(partitions.size(), 3); + + assertQuery( + session, + "SELECT * from " + tableName, + format("SELECT orderkey, comment, orderstatus FROM orders where orderkey %% 3 = %d", i)); + if (transactional) { + TableMetadata metadata = getTableMetadata("hive", session.getSchema().get(), tableName); + String tablePath = (String) tableMetadata.getMetadata().getProperties().get("location"); + File file = new File(tablePath.replace("file:", "")); + File[] partitionsLocations = file.listFiles((a) -> a.isDirectory() && !a.getName().startsWith(".")); + int expectedBaseCount = i + 1; + Arrays.stream(partitionsLocations).forEach((partition) -> { + File[] baseDirectories = partition.listFiles((f) -> f.isDirectory() && f.getName().startsWith("base_")); + //In case of transactional insert_overwrite base directory is written directly instead of delta. 
+ assertEquals(expectedBaseCount, baseDirectories.length); + }); + } + } + assertUpdate(session, "DROP TABLE " + tableName); + + assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + @Test + public void testNullPartitionValues() + { + assertUpdate("" + + "CREATE TABLE test_null_partition (test VARCHAR, part VARCHAR)\n" + + "WITH (partitioned_by = ARRAY['part'])"); + + assertUpdate("INSERT INTO test_null_partition VALUES ('hello', 'test'), ('world', null)", 2); + + assertQuery( + "SELECT * FROM test_null_partition", + "VALUES ('hello', 'test'), ('world', null)"); + + assertQuery( + "SELECT * FROM \"test_null_partition$partitions\"", + "VALUES 'test', null"); + + assertUpdate("DROP TABLE test_null_partition"); + } + + @Test + public void testPartitionPerScanLimit() + { + TestingHiveStorageFormat storageFormat = new TestingHiveStorageFormat(getSession(), HiveStorageFormat.ORC); + testWithStorageFormat(storageFormat, this::testPartitionPerScanLimit); + } + + private void testPartitionPerScanLimit(Session session, HiveStorageFormat storageFormat) + { + String tableName = "test_partition_per_scan_limit"; + String partitionsTable = "\"" + tableName + "$partitions\""; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "(" + + " foo VARCHAR," + + " part BIGINT" + + ") " + + "WITH (" + + "format = '" + storageFormat + "', " + + "partitioned_by = ARRAY[ 'part' ]" + + ") "; + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + assertEquals(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY), ImmutableList.of("part")); + + // insert 1200 partitions + for (int i = 0; i < 12; i++) { + int partStart = i * 100; + int partEnd = (i + 1) * 100 - 1; + + @Language("SQL") String insertPartitions = "" + + "INSERT INTO " + tableName + " " + + "SELECT 'bar' foo, part " + + "FROM UNNEST(SEQUENCE(" + partStart + ", " + partEnd + ")) AS TMP(part)"; + + assertUpdate(session, insertPartitions, 100); + } + + // we are not constrained by hive.max-partitions-per-scan when listing partitions + assertQuery( + session, + "SELECT * FROM " + partitionsTable + " WHERE part > 490 and part <= 500", + "VALUES 491, 492, 493, 494, 495, 496, 497, 498, 499, 500"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable + " WHERE part < 0", + "SELECT null WHERE false"); + + assertQuery( + session, + "SELECT * FROM " + partitionsTable, + "VALUES " + LongStream.range(0, 1200) + .mapToObj(String::valueOf) + .collect(joining(","))); + + // verify can query 1000 partitions + assertQuery( + session, + "SELECT count(foo) FROM " + tableName + " WHERE part < 1000", + "SELECT 1000"); + + // verify the rest 200 partitions are successfully inserted + assertQuery( + session, + "SELECT count(foo) FROM " + tableName + " WHERE part >= 1000 AND part < 1200", + "SELECT 200"); + + // verify cannot query more than 1000 partitions + assertQueryFails( + session, + "SELECT * from " + tableName + " WHERE part < 1001", + format("Query over table 'tpch.%s' can potentially read more than 1000 partitions", tableName)); + + // verify cannot query all partitions + assertQueryFails( + session, + "SELECT * from " + tableName, + format("Query over table 'tpch.%s' can potentially read more than 1000 partitions", tableName)); + + assertUpdate(session, "DROP TABLE " + tableName); + + 
assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + @Test + public void testShowColumnsFromPartitions() + { + String tableName = "test_show_columns_from_partitions"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "(" + + " foo VARCHAR," + + " part1 BIGINT," + + " part2 VARCHAR" + + ") " + + "WITH (" + + "partitioned_by = ARRAY[ 'part1', 'part2' ]" + + ") "; + + assertUpdate(getSession(), createTable); + + assertQuery( + getSession(), + "SHOW COLUMNS FROM \"" + tableName + "$partitions\"", + "VALUES ('part1', 'bigint', '', ''), ('part2', 'varchar', '', '')"); + + assertQueryFails( + getSession(), + "SHOW COLUMNS FROM \"$partitions\"", + ".*Table '.*\\.tpch\\.\\$partitions' does not exist"); + + assertQueryFails( + getSession(), + "SHOW COLUMNS FROM \"orders$partitions\"", + ".*Table '.*\\.tpch\\.orders\\$partitions' does not exist"); + + assertQueryFails( + getSession(), + "SHOW COLUMNS FROM \"blah$partitions\"", + ".*Table '.*\\.tpch\\.blah\\$partitions' does not exist"); + } + + @Test + public void testPartitionsTableInvalidAccess() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_partitions_invalid " + + "(" + + " foo VARCHAR," + + " part1 BIGINT," + + " part2 VARCHAR" + + ") " + + "WITH (" + + "partitioned_by = ARRAY[ 'part1', 'part2' ]" + + ") "; + + assertUpdate(getSession(), createTable); + + assertQueryFails( + getSession(), + "SELECT * FROM \"test_partitions_invalid$partitions$partitions\"", + ".*Table .*\\.tpch\\.test_partitions_invalid\\$partitions\\$partitions does not exist"); + + assertQueryFails( + getSession(), + "SELECT * FROM \"non_existent$partitions\"", + ".*Table .*\\.tpch\\.non_existent\\$partitions does not exist"); + } + + @Test + public void testInsertUnpartitionedTable() + { + testWithAllStorageFormats(this::testInsertUnpartitionedTable); + } + + private void testInsertUnpartitionedTable(Session session, HiveStorageFormat storageFormat) + { + String tableName = "test_insert_unpartitioned_table"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "(" + + " order_key BIGINT," + + " comment VARCHAR," + + " order_status VARCHAR" + + ") " + + "WITH (" + + "format = '" + storageFormat + "'" + + ") "; + + assertUpdate(session, createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + for (int i = 0; i < 3; i++) { + assertUpdate( + session, + format( + "INSERT INTO " + tableName + " " + + "SELECT orderkey, comment, orderstatus " + + "FROM tpch.tiny.orders " + + "WHERE orderkey %% 3 = %d", + i), + format("SELECT count(*) from orders where orderkey %% 3 = %d", i)); + } + + assertQuery( + session, + "SELECT * from " + tableName, + "SELECT orderkey, comment, orderstatus FROM orders"); + + assertUpdate(session, "DROP TABLE " + tableName); + + assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + @Test + public void testDeleteFromUnpartitionedTable() + { + assertUpdate("CREATE TABLE test_delete_unpartitioned AS SELECT orderstatus FROM tpch.tiny.orders", "SELECT count(*) from orders"); + + assertUpdate("DELETE FROM test_delete_unpartitioned"); + + MaterializedResult result = computeActual("SELECT * from test_delete_unpartitioned"); + assertEquals(result.getRowCount(), 0); + + assertUpdate("DROP TABLE test_delete_unpartitioned"); + + assertFalse(getQueryRunner().tableExists(getSession(), 
"test_delete_unpartitioned")); + } + + @Test + public void testMetadataDelete() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_metadata_delete " + + "(" + + " ORDER_KEY BIGINT," + + " LINE_NUMBER INTEGER," + + " LINE_STATUS VARCHAR" + + ") " + + "WITH (" + + PARTITIONED_BY_PROPERTY + " = ARRAY[ 'LINE_NUMBER', 'LINE_STATUS' ]" + + ") "; + + assertUpdate(createTable); + + assertUpdate("" + + "INSERT INTO test_metadata_delete " + + "SELECT orderkey, linenumber, linestatus " + + "FROM tpch.tiny.lineitem", + "SELECT count(*) from lineitem"); + + // Delete returns number of rows deleted, or null if obtaining the number is hard or impossible. + // Currently, Hive implementation always returns null. + assertUpdate("DELETE FROM test_metadata_delete WHERE LINE_STATUS='F' and LINE_NUMBER=CAST(3 AS INTEGER)"); + + assertQuery("SELECT * from test_metadata_delete", "SELECT orderkey, linenumber, linestatus FROM lineitem WHERE linestatus<>'F' or linenumber<>3"); + + assertUpdate("DELETE FROM test_metadata_delete WHERE LINE_STATUS='O'"); + + assertQuery("SELECT * from test_metadata_delete", "SELECT orderkey, linenumber, linestatus FROM lineitem WHERE linestatus<>'O' and linenumber<>3"); + + try { + getQueryRunner().execute("DELETE FROM test_metadata_delete WHERE ORDER_KEY=1"); + fail("expected exception"); + } + catch (RuntimeException e) { + assertEquals(e.getMessage(), "This connector only supports delete where one or more partitions are deleted entirely for Non-Transactional tables"); + } + + assertQuery("SELECT * from test_metadata_delete", "SELECT orderkey, linenumber, linestatus FROM lineitem WHERE linestatus<>'O' and linenumber<>3"); + + Session session1 = Session.builder(getSession()) + .setCatalogSessionProperty(getSession().getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .build(); + assertQuery(session1, "SELECT * from test_metadata_delete", "SELECT orderkey, linenumber, linestatus FROM lineitem WHERE linestatus<>'O' and linenumber<>3"); + + @Language("SQL") String multiPartTable = "" + + "CREATE TABLE test_multi_part " + + "(" + + " ID1 INTEGER," + + " ID2 INTEGER," + + " ID3 INTEGER," + + " ID4 INTEGER," + + " ID5 INTEGER," + + " ID6 INTEGER," + + " ID7 INTEGER," + + " ID8 INTEGER," + + " ID9 INTEGER," + + " ID10 INTEGER," + + " ID11 INTEGER," + + " ID12 INTEGER," + + " ID13 INTEGER," + + " ID14 INTEGER " + + ") " + + "WITH (" + + PARTITIONED_BY_PROPERTY + " = ARRAY[ 'ID2','ID3','ID4','ID5','ID6','ID7','ID8','ID9','ID10','ID11','ID12','ID13','ID14']" + + ") "; + + assertUpdate(multiPartTable); + assertUpdate("" + + "INSERT INTO test_multi_part values(1,2,3,4,5,6,7,8,9,10,11,12,13,14) ", + "SELECT 1"); + assertEquals(computeActual("SELECT *, \"$path\" FROM test_multi_part").getRowCount(), 1L); + assertUpdate("DROP TABLE test_multi_part"); + assertUpdate("DROP TABLE test_metadata_delete"); + + assertFalse(getQueryRunner().tableExists(getSession(), "test_metadata_delete")); + } + + private TableMetadata getTableMetadata(String catalog, String schema, String tableName) + { + Session session = getSession(); + Metadata metadata = ((DistributedQueryRunner) getQueryRunner()).getCoordinator().getMetadata(); + + return transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getAccessControl()) + .readOnly() + .execute(session, transactionSession -> { + Optional tableHandle = metadata.getTableHandle(transactionSession, new QualifiedObjectName(catalog, schema, tableName)); + assertTrue(tableHandle.isPresent()); + return 
metadata.getTableMetadata(transactionSession, tableHandle.get()); + }); + } + + private Object getHiveTableProperty(String tableName, Function propertyGetter) + { + Session session = getSession(); + Metadata metadata = ((DistributedQueryRunner) getQueryRunner()).getCoordinator().getMetadata(); + + return transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getAccessControl()) + .readOnly() + .execute(session, transactionSession -> { + QualifiedObjectName name = new QualifiedObjectName(catalog, TPCH_SCHEMA, tableName); + TableHandle table = metadata.getTableHandle(transactionSession, name) + .orElseThrow(() -> new AssertionError("table not found: " + name)); + table = metadata.applyFilter(transactionSession, table, Constraint.alwaysTrue()) + .orElseThrow(() -> new AssertionError("applyFilter did not return a result")) + .getHandle(); + return propertyGetter.apply((HiveTableHandle) table.getConnectorHandle()); + }); + } + + private List getPartitions(String tableName) + { + return (List) getHiveTableProperty(tableName, handle -> handle.getPartitions().get()); + } + + private int getBucketCount(String tableName) + { + return (int) getHiveTableProperty(tableName, table -> table.getBucketHandle().get().getTableBucketCount()); + } + + @Test + public void testShowColumnsPartitionKey() + { + assertUpdate("" + + "CREATE TABLE test_show_columns_partition_key\n" + + "(grape bigint, orange bigint, pear varchar(65535), mango integer, lychee smallint, kiwi tinyint, apple varchar, pineapple varchar(65535))\n" + + "WITH (partitioned_by = ARRAY['apple', 'pineapple'])"); + + MaterializedResult actual = computeActual("SHOW COLUMNS FROM test_show_columns_partition_key"); + Type unboundedVarchar = canonicalizeType(VARCHAR); + MaterializedResult expected = resultBuilder(getSession(), unboundedVarchar, unboundedVarchar, unboundedVarchar, unboundedVarchar) + .row("grape", canonicalizeTypeName("bigint"), "", "") + .row("orange", canonicalizeTypeName("bigint"), "", "") + .row("pear", canonicalizeTypeName("varchar(65535)"), "", "") + .row("mango", canonicalizeTypeName("integer"), "", "") + .row("lychee", canonicalizeTypeName("smallint"), "", "") + .row("kiwi", canonicalizeTypeName("tinyint"), "", "") + .row("apple", canonicalizeTypeName("varchar"), "partition key", "") + .row("pineapple", canonicalizeTypeName("varchar(65535)"), "partition key", "") + .build(); + assertEquals(actual, expected); + } + + // TODO: These should be moved to another class, when more connectors support arrays + @Test + public void testArrays() + { + assertUpdate("CREATE TABLE tmp_array1 AS SELECT ARRAY[1, 2, NULL] AS col", 1); + assertQuery("SELECT col[2] FROM tmp_array1", "SELECT 2"); + assertQuery("SELECT col[3] FROM tmp_array1", "SELECT NULL"); + + assertUpdate("CREATE TABLE tmp_array2 AS SELECT ARRAY[1.0E0, 2.5E0, 3.5E0] AS col", 1); + assertQuery("SELECT col[2] FROM tmp_array2", "SELECT 2.5"); + + assertUpdate("CREATE TABLE tmp_array3 AS SELECT ARRAY['puppies', 'kittens', NULL] AS col", 1); + assertQuery("SELECT col[2] FROM tmp_array3", "SELECT 'kittens'"); + assertQuery("SELECT col[3] FROM tmp_array3", "SELECT NULL"); + + assertUpdate("CREATE TABLE tmp_array4 AS SELECT ARRAY[TRUE, NULL] AS col", 1); + assertQuery("SELECT col[1] FROM tmp_array4", "SELECT TRUE"); + assertQuery("SELECT col[2] FROM tmp_array4", "SELECT NULL"); + + assertUpdate("CREATE TABLE tmp_array5 AS SELECT ARRAY[ARRAY[1, 2], NULL, ARRAY[3, 4]] AS col", 1); + assertQuery("SELECT col[1][2] FROM tmp_array5", "SELECT 2"); + + assertUpdate("CREATE TABLE 
tmp_array6 AS SELECT ARRAY[ARRAY['\"hi\"'], NULL, ARRAY['puppies']] AS col", 1); + assertQuery("SELECT col[1][1] FROM tmp_array6", "SELECT '\"hi\"'"); + assertQuery("SELECT col[3][1] FROM tmp_array6", "SELECT 'puppies'"); + + assertUpdate("CREATE TABLE tmp_array7 AS SELECT ARRAY[ARRAY[INTEGER'1', INTEGER'2'], NULL, ARRAY[INTEGER'3', INTEGER'4']] AS col", 1); + assertQuery("SELECT col[1][2] FROM tmp_array7", "SELECT 2"); + + assertUpdate("CREATE TABLE tmp_array8 AS SELECT ARRAY[ARRAY[SMALLINT'1', SMALLINT'2'], NULL, ARRAY[SMALLINT'3', SMALLINT'4']] AS col", 1); + assertQuery("SELECT col[1][2] FROM tmp_array8", "SELECT 2"); + + assertUpdate("CREATE TABLE tmp_array9 AS SELECT ARRAY[ARRAY[TINYINT'1', TINYINT'2'], NULL, ARRAY[TINYINT'3', TINYINT'4']] AS col", 1); + assertQuery("SELECT col[1][2] FROM tmp_array9", "SELECT 2"); + + assertUpdate("CREATE TABLE tmp_array10 AS SELECT ARRAY[ARRAY[DECIMAL '3.14']] AS col1, ARRAY[ARRAY[DECIMAL '12345678901234567890.0123456789']] AS col2", 1); + assertQuery("SELECT col1[1][1] FROM tmp_array10", "SELECT 3.14"); + assertQuery("SELECT col2[1][1] FROM tmp_array10", "SELECT 12345678901234567890.0123456789"); + + assertUpdate("CREATE TABLE tmp_array13 AS SELECT ARRAY[ARRAY[REAL'1.234', REAL'2.345'], NULL, ARRAY[REAL'3.456', REAL'4.567']] AS col", 1); + assertQuery("SELECT col[1][2] FROM tmp_array13", "SELECT 2.345"); + } + + @Test + public void testTemporalArrays() + { + assertUpdate("CREATE TABLE tmp_array11 AS SELECT ARRAY[DATE '2014-09-30'] AS col", 1); + assertOneNotNullResult("SELECT col[1] FROM tmp_array11"); + assertUpdate("CREATE TABLE tmp_array12 AS SELECT ARRAY[TIMESTAMP '2001-08-22 03:04:05.321'] AS col", 1); + assertOneNotNullResult("SELECT col[1] FROM tmp_array12"); + } + + @Test + public void testMaps() + { + assertUpdate("CREATE TABLE tmp_map1 AS SELECT MAP(ARRAY[0,1], ARRAY[2,NULL]) AS col", 1); + assertQuery("SELECT col[0] FROM tmp_map1", "SELECT 2"); + assertQuery("SELECT col[1] FROM tmp_map1", "SELECT NULL"); + + assertUpdate("CREATE TABLE tmp_map2 AS SELECT MAP(ARRAY[INTEGER'1'], ARRAY[INTEGER'2']) AS col", 1); + assertQuery("SELECT col[INTEGER'1'] FROM tmp_map2", "SELECT 2"); + + assertUpdate("CREATE TABLE tmp_map3 AS SELECT MAP(ARRAY[SMALLINT'1'], ARRAY[SMALLINT'2']) AS col", 1); + assertQuery("SELECT col[SMALLINT'1'] FROM tmp_map3", "SELECT 2"); + + assertUpdate("CREATE TABLE tmp_map4 AS SELECT MAP(ARRAY[TINYINT'1'], ARRAY[TINYINT'2']) AS col", 1); + assertQuery("SELECT col[TINYINT'1'] FROM tmp_map4", "SELECT 2"); + + assertUpdate("CREATE TABLE tmp_map5 AS SELECT MAP(ARRAY[1.0], ARRAY[2.5]) AS col", 1); + assertQuery("SELECT col[1.0] FROM tmp_map5", "SELECT 2.5"); + + assertUpdate("CREATE TABLE tmp_map6 AS SELECT MAP(ARRAY['puppies'], ARRAY['kittens']) AS col", 1); + assertQuery("SELECT col['puppies'] FROM tmp_map6", "SELECT 'kittens'"); + + assertUpdate("CREATE TABLE tmp_map7 AS SELECT MAP(ARRAY[TRUE], ARRAY[FALSE]) AS col", 1); + assertQuery("SELECT col[TRUE] FROM tmp_map7", "SELECT FALSE"); + + assertUpdate("CREATE TABLE tmp_map8 AS SELECT MAP(ARRAY[DATE '2014-09-30'], ARRAY[DATE '2014-09-29']) AS col", 1); + assertOneNotNullResult("SELECT col[DATE '2014-09-30'] FROM tmp_map8"); + assertUpdate("CREATE TABLE tmp_map9 AS SELECT MAP(ARRAY[TIMESTAMP '2001-08-22 03:04:05.321'], ARRAY[TIMESTAMP '2001-08-22 03:04:05.321']) AS col", 1); + assertOneNotNullResult("SELECT col[TIMESTAMP '2001-08-22 03:04:05.321'] FROM tmp_map9"); + + assertUpdate("CREATE TABLE tmp_map10 AS SELECT MAP(ARRAY[DECIMAL '3.14', DECIMAL 
'12345678901234567890.0123456789'], " + + "ARRAY[DECIMAL '12345678901234567890.0123456789', DECIMAL '3.0123456789']) AS col", 1); + assertQuery("SELECT col[DECIMAL '3.14'], col[DECIMAL '12345678901234567890.0123456789'] FROM tmp_map10", "SELECT 12345678901234567890.0123456789, 3.0123456789"); + + assertUpdate("CREATE TABLE tmp_map11 AS SELECT MAP(ARRAY[REAL'1.234'], ARRAY[REAL'2.345']) AS col", 1); + assertQuery("SELECT col[REAL'1.234'] FROM tmp_map11", "SELECT 2.345"); + + assertUpdate("CREATE TABLE tmp_map12 AS SELECT MAP(ARRAY[1.0E0], ARRAY[ARRAY[1, 2]]) AS col", 1); + assertQuery("SELECT col[1.0][2] FROM tmp_map12", "SELECT 2"); + } + + @Test + public void testRows() + { + assertUpdate("CREATE TABLE tmp_row1 AS SELECT cast(row(CAST(1 as BIGINT), CAST(NULL as BIGINT)) AS row(col0 bigint, col1 bigint)) AS a", 1); + assertQuery( + "SELECT a.col0, a.col1 FROM tmp_row1", + "SELECT 1, cast(null as bigint)"); + } + + @Test + public void testComplex() + { + assertUpdate("CREATE TABLE tmp_complex1 AS SELECT " + + "ARRAY [MAP(ARRAY['a', 'b'], ARRAY[2.0E0, 4.0E0]), MAP(ARRAY['c', 'd'], ARRAY[12.0E0, 14.0E0])] AS a", + 1); + + assertQuery( + "SELECT a[1]['a'], a[2]['d'] FROM tmp_complex1", + "SELECT 2.0, 14.0"); + } + + @Test + public void testBucketedCatalog() + { + String bucketedCatalog = bucketedSession.getCatalog().get(); + String bucketedSchema = bucketedSession.getSchema().get(); + + TableMetadata ordersTableMetadata = getTableMetadata(bucketedCatalog, bucketedSchema, "orders"); + assertEquals(ordersTableMetadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("custkey")); + assertEquals(ordersTableMetadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11); + + TableMetadata customerTableMetadata = getTableMetadata(bucketedCatalog, bucketedSchema, "customer"); + assertEquals(customerTableMetadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("custkey")); + assertEquals(customerTableMetadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11); + } + + @Test + public void testBucketedExecution() + { + assertQuery(bucketedSession, "select count(*) a from orders t1 join orders t2 on t1.custkey=t2.custkey"); + assertQuery(bucketedSession, "select count(*) a from orders t1 join customer t2 on t1.custkey=t2.custkey", "SELECT count(*) from orders"); + assertQuery(bucketedSession, "select count(distinct custkey) from orders"); + + assertQuery( + Session.builder(bucketedSession).setSystemProperty("task_writer_count", "1").build(), + "SELECT custkey, COUNT(*) FROM orders GROUP BY custkey"); + assertQuery( + Session.builder(bucketedSession).setSystemProperty("task_writer_count", "4").build(), + "SELECT custkey, COUNT(*) FROM orders GROUP BY custkey"); + } + + @Test + public void testScaleWriters() + { + try { + // small table that will only have one writer + assertUpdate( + Session.builder(getSession()) + .setSystemProperty("scale_writers", "true") + .setSystemProperty("writer_min_size", "32MB") + .build(), + "CREATE TABLE scale_writers_small AS SELECT * FROM tpch.tiny.orders", + (long) computeActual("SELECT count(*) FROM tpch.tiny.orders").getOnlyValue()); + + assertEquals(computeActual("SELECT count(DISTINCT \"$path\") FROM scale_writers_small").getOnlyValue(), 1L); + + // large table that will scale writers to multiple machines + assertUpdate( + Session.builder(getSession()) + .setSystemProperty("scale_writers", "true") + .setSystemProperty("writer_min_size", "1MB") + .build(), + "CREATE TABLE scale_writers_large WITH 
(format = 'RCBINARY') AS SELECT * FROM tpch.sf1.orders", + (long) computeActual("SELECT count(*) FROM tpch.sf1.orders").getOnlyValue()); + + long files = (long) computeScalar("SELECT count(DISTINCT \"$path\") FROM scale_writers_large"); + long workers = (long) computeScalar("SELECT count(*) FROM system.runtime.nodes"); + assertThat(files).isBetween(2L, workers); + } + finally { + assertUpdate("DROP TABLE IF EXISTS scale_writers_large"); + assertUpdate("DROP TABLE IF EXISTS scale_writers_small"); + } + } + + @Test + public void testTableCommentsTable() + { + assertUpdate("CREATE TABLE test_comment (c1 bigint) COMMENT 'foo'"); + String selectTableComment = format("" + + "SELECT comment FROM system.metadata.table_comments " + + "WHERE catalog_name = '%s' AND schema_name = '%s' AND table_name = 'test_comment'", + getSession().getCatalog().get(), + getSession().getSchema().get()); + assertQuery(selectTableComment, "SELECT 'foo'"); + + assertUpdate("DROP TABLE IF EXISTS test_comment"); + } + + @Test + public void testShowCreateTable() + { + String createTableSql = format("" + + "CREATE TABLE %s.%s.%s (\n" + + " c1 bigint,\n" + + " c2 double,\n" + + " \"c 3\" varchar,\n" + + " \"c'4\" array(bigint),\n" + + " c5 map(bigint, varchar)\n" + + ")\n" + + "WITH (\n" + + " format = 'RCBINARY'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + "test_show_create_table"); + + assertUpdate(createTableSql); + MaterializedResult actualResult = computeActual("SHOW CREATE TABLE test_show_create_table"); + assertShowCreateTableOutput(getOnlyElement(actualResult.getOnlyColumnAsSet()), createTableSql); + + createTableSql = format("" + + "CREATE TABLE %s.%s.%s (\n" + + " c1 bigint,\n" + + " \"c 2\" varchar,\n" + + " \"c'3\" array(bigint),\n" + + " c4 map(bigint, varchar) COMMENT 'comment test4',\n" + + " c5 double COMMENT 'comment test5'\n)\n" + + "COMMENT 'test'\n" + + "WITH (\n" + + " bucket_count = 5,\n" + + " bucketed_by = ARRAY['c1','c 2'],\n" + + " bucketing_version = 1,\n" + + " format = 'ORC',\n" + + " orc_bloom_filter_columns = ARRAY['c1','c2'],\n" + + " orc_bloom_filter_fpp = 7E-1,\n" + + " partitioned_by = ARRAY['c5'],\n" + + " sorted_by = ARRAY['c1','c 2 DESC']\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + "\"test_show_create_table'2\""); + assertUpdate(createTableSql); + actualResult = computeActual("SHOW CREATE TABLE \"test_show_create_table'2\""); + assertShowCreateTableOutput(getOnlyElement(actualResult.getOnlyColumnAsSet()), createTableSql); + } + + @Test + public void testCreateExternalTable() + throws Exception + { + File tempDir = createTempDirectory(getClass().getName()).toFile(); + File dataFile = new File(tempDir, "test.txt"); + asCharSink(dataFile, UTF_8).write("hello\nworld\n"); + + @Language("SQL") String createTableSql = format("" + + "CREATE TABLE %s.%s.test_create_external (\n" + + " name varchar\n" + + ")\n" + + "WITH (\n" + + " location = '%s',\n" + + " external = true,\n" + + " format = 'TEXTFILE'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + new Path(tempDir.toURI().toASCIIString()).toString()); + + assertUpdate(createTableSql); + MaterializedResult actual = computeActual("SHOW CREATE TABLE test_create_external"); + assertShowCreateTableOutput(actual.getOnlyValue(), createTableSql); + + actual = computeActual("SELECT name FROM test_create_external"); + assertEquals(actual.getOnlyColumnAsSet(), ImmutableSet.of("hello", "world")); + + assertUpdate("DROP TABLE 
test_create_external"); + + // file should still exist after drop + assertFile(dataFile); + + deleteRecursively(tempDir.toPath(), ALLOW_INSECURE); + } + + @Test + public void testCreateTableWithSortedBy() + { + @Language("SQL") String createTableSql = format("" + + "CREATE TABLE %s.%s.test_create_sorted (\n" + + " viewTime int,\n" + + " userID bigint\n" + + ")\n" + + "WITH (\n" + + " bucketed_by = ARRAY['userID'],\n" + + " sorted_by = ARRAY['viewTime'],\n" + + " bucket_count = 3,\n" + + " format = 'TEXTFILE'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get()); + + assertUpdate(createTableSql); + + String expectedSql = format("" + + "CREATE TABLE %s.%s.test_create_sorted (\n" + + " viewtime int,\n" + + " userid bigint\n" + + ")\n" + + "WITH (\n" + + " bucketed_by = ARRAY['userid'],\n" + + " sorted_by = ARRAY['viewtime'],\n" + + " bucket_count = 3,\n" + + " format = 'TEXTFILE'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get()); + MaterializedResult actual = computeActual("SHOW CREATE TABLE test_create_sorted"); + assertShowCreateTableOutput(actual.getOnlyValue(), expectedSql); + + assertUpdate("DROP TABLE test_create_sorted"); + } + + @Test + public void testCommentTable() + { + String createTableSql = format("" + + "CREATE TABLE %s.%s.%s (\n" + + " c1 bigint\n" + + ")\n" + + "WITH (\n" + + " format = 'RCBINARY'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + "test_comment_table"); + + assertUpdate(createTableSql); + MaterializedResult actualResult = computeActual("SHOW CREATE TABLE test_comment_table"); + assertShowCreateTableOutput(getOnlyElement(actualResult.getOnlyColumnAsSet()), createTableSql); + + assertUpdate("COMMENT ON TABLE test_comment_table IS 'new comment'"); + String commentedCreateTableSql = format("" + + "CREATE TABLE %s.%s.%s (\n" + + " c1 bigint\n" + + ")\n" + + "COMMENT 'new comment'\n" + + "WITH (\n" + + " format = 'RCBINARY'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + "test_comment_table"); + actualResult = computeActual("SHOW CREATE TABLE test_comment_table"); + assertShowCreateTableOutput(getOnlyElement(actualResult.getOnlyColumnAsSet()), commentedCreateTableSql); + + assertUpdate("COMMENT ON TABLE test_comment_table IS 'updated comment'"); + commentedCreateTableSql = format("" + + "CREATE TABLE %s.%s.%s (\n" + + " c1 bigint\n" + + ")\n" + + "COMMENT 'updated comment'\n" + + "WITH (\n" + + " format = 'RCBINARY'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + "test_comment_table"); + actualResult = computeActual("SHOW CREATE TABLE test_comment_table"); + assertShowCreateTableOutput(getOnlyElement(actualResult.getOnlyColumnAsSet()), commentedCreateTableSql); + + assertUpdate("COMMENT ON TABLE test_comment_table IS ''"); + commentedCreateTableSql = format("" + + "CREATE TABLE %s.%s.%s (\n" + + " c1 bigint\n" + + ")\n" + + "COMMENT ''\n" + + "WITH (\n" + + " format = 'RCBINARY'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + "test_comment_table"); + actualResult = computeActual("SHOW CREATE TABLE test_comment_table"); + assertShowCreateTableOutput(getOnlyElement(actualResult.getOnlyColumnAsSet()), commentedCreateTableSql); + + assertUpdate("DROP TABLE test_comment_table"); + } + + @Test + public void testCreateTableWithHeaderAndFooter() + { + @Language("SQL") String createTableSql = format("" + + "CREATE TABLE %s.%s.test_table_skip_header (\n" + + " name varchar\n" + + 
")\n" + + "WITH (\n" + + " format = 'TEXTFILE',\n" + + " textfile_skip_header_line_count = 1\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get()); + + assertUpdate(createTableSql); + + MaterializedResult actual = computeActual("SHOW CREATE TABLE test_table_skip_header"); + assertShowCreateTableOutput(actual.getOnlyValue(), createTableSql); + assertUpdate("DROP TABLE test_table_skip_header"); + + createTableSql = format("" + + "CREATE TABLE %s.%s.test_table_skip_footer (\n" + + " name varchar\n" + + ")\n" + + "WITH (\n" + + " format = 'TEXTFILE',\n" + + " textfile_skip_footer_line_count = 1\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get()); + + assertUpdate(createTableSql); + + actual = computeActual("SHOW CREATE TABLE test_table_skip_footer"); + assertShowCreateTableOutput(actual.getOnlyValue(), createTableSql); + assertUpdate("DROP TABLE test_table_skip_footer"); + + createTableSql = format("" + + "CREATE TABLE %s.%s.test_table_skip_header_footer (\n" + + " name varchar\n" + + ")\n" + + "WITH (\n" + + " format = 'TEXTFILE',\n" + + " textfile_skip_footer_line_count = 1,\n" + + " textfile_skip_header_line_count = 1\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get()); + + assertUpdate(createTableSql); + + actual = computeActual("SHOW CREATE TABLE test_table_skip_header_footer"); + assertShowCreateTableOutput(actual.getOnlyValue(), createTableSql); + assertUpdate("DROP TABLE test_table_skip_header_footer"); + } + + @Test + public void testCreateTableWithInvalidProperties() + { + // ORC + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_table (col1 bigint) WITH (format = 'TEXTFILE', orc_bloom_filter_columns = ARRAY['col1'])")) + .hasMessageMatching("Cannot specify orc_bloom_filter_columns table property for storage format: TEXTFILE"); + + // TEXTFILE + assertThatThrownBy(() -> assertUpdate("CREATE TABLE test_orc_skip_header (col1 bigint) WITH (format = 'ORC', textfile_skip_header_line_count = 1)")) + .hasMessageMatching("Cannot specify textfile_skip_header_line_count table property for storage format: ORC"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE test_orc_skip_footer (col1 bigint) WITH (format = 'ORC', textfile_skip_footer_line_count = 1)")) + .hasMessageMatching("Cannot specify textfile_skip_footer_line_count table property for storage format: ORC"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE test_invalid_skip_header (col1 bigint) WITH (format = 'TEXTFILE', textfile_skip_header_line_count = -1)")) + .hasMessageMatching("Invalid value for textfile_skip_header_line_count property: -1"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE test_invalid_skip_footer (col1 bigint) WITH (format = 'TEXTFILE', textfile_skip_footer_line_count = -1)")) + .hasMessageMatching("Invalid value for textfile_skip_footer_line_count property: -1"); + + // CSV + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_table (col1 bigint) WITH (format = 'ORC', csv_separator = 'S')")) + .hasMessageMatching("Cannot specify csv_separator table property for storage format: ORC"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_table (col1 varchar) WITH (format = 'CSV', csv_separator = 'SS')")) + .hasMessageMatching("csv_separator must be a single character string, but was: 'SS'"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_table (col1 bigint) WITH (format = 'ORC', csv_quote = 'Q')")) + .hasMessageMatching("Cannot specify csv_quote table property for storage 
format: ORC"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_table (col1 varchar) WITH (format = 'CSV', csv_quote = 'QQ')")) + .hasMessageMatching("csv_quote must be a single character string, but was: 'QQ'"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_table (col1 varchar) WITH (format = 'ORC', csv_escape = 'E')")) + .hasMessageMatching("Cannot specify csv_escape table property for storage format: ORC"); + assertThatThrownBy(() -> assertUpdate("CREATE TABLE invalid_table (col1 varchar) WITH (format = 'CSV', csv_escape = 'EE')")) + .hasMessageMatching("csv_escape must be a single character string, but was: 'EE'"); + } + + @Test + public void testPathHiddenColumn() + { + testWithAllStorageFormats(this::testPathHiddenColumn); + } + + private void testPathHiddenColumn(Session session, HiveStorageFormat storageFormat) + { + @Language("SQL") String createTable = "CREATE TABLE test_path " + + "WITH (" + + "format = '" + storageFormat + "'," + + "partitioned_by = ARRAY['col1']" + + ") AS " + + "SELECT * FROM (VALUES " + + "(0, 0), (3, 0), (6, 0), " + + "(1, 1), (4, 1), (7, 1), " + + "(2, 2), (5, 2) " + + " ) t(col0, col1) "; + assertUpdate(session, createTable, 8); + assertTrue(getQueryRunner().tableExists(getSession(), "test_path")); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, "test_path"); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + List columnNames = ImmutableList.of("col0", "col1", PATH_COLUMN_NAME); + List columnMetadatas = tableMetadata.getColumns(); + assertEquals(columnMetadatas.size(), columnNames.size()); + for (int i = 0; i < columnMetadatas.size(); i++) { + ColumnMetadata columnMetadata = columnMetadatas.get(i); + assertEquals(columnMetadata.getName(), columnNames.get(i)); + if (columnMetadata.getName().equals(PATH_COLUMN_NAME)) { + // $path should be hidden column + assertTrue(columnMetadata.isHidden()); + } + } + assertEquals(getPartitions("test_path").size(), 3); + + MaterializedResult results = computeActual(session, format("SELECT *, \"%s\" FROM test_path", PATH_COLUMN_NAME)); + Map partitionPathMap = new HashMap<>(); + for (int i = 0; i < results.getRowCount(); i++) { + MaterializedRow row = results.getMaterializedRows().get(i); + int col0 = (int) row.getField(0); + int col1 = (int) row.getField(1); + String pathName = (String) row.getField(2); + String parentDirectory = new Path(pathName).getParent().toString(); + + assertTrue(pathName.length() > 0); + assertEquals(col0 % 3, col1); + if (partitionPathMap.containsKey(col1)) { + // the rows in the same partition should be in the same partition directory + assertEquals(partitionPathMap.get(col1), parentDirectory); + } + else { + partitionPathMap.put(col1, parentDirectory); + } + } + assertEquals(partitionPathMap.size(), 3); + + assertUpdate(session, "DROP TABLE test_path"); + assertFalse(getQueryRunner().tableExists(session, "test_path")); + } + + @Test + public void testBucketHiddenColumn() + { + @Language("SQL") String createTable = "CREATE TABLE test_bucket_hidden_column " + + "WITH (" + + "bucketed_by = ARRAY['col0']," + + "bucket_count = 2" + + ") AS " + + "SELECT * FROM (VALUES " + + "(0, 11), (1, 12), (2, 13), " + + "(3, 14), (4, 15), (5, 16), " + + "(6, 17), (7, 18), (8, 19)" + + " ) t (col0, col1) "; + assertUpdate(createTable, 9); + assertTrue(getQueryRunner().tableExists(getSession(), "test_bucket_hidden_column")); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, 
"test_bucket_hidden_column"); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("col0")); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 2); + + List columnNames = ImmutableList.of("col0", "col1", PATH_COLUMN_NAME, BUCKET_COLUMN_NAME); + List columnMetadatas = tableMetadata.getColumns(); + assertEquals(columnMetadatas.size(), columnNames.size()); + for (int i = 0; i < columnMetadatas.size(); i++) { + ColumnMetadata columnMetadata = columnMetadatas.get(i); + assertEquals(columnMetadata.getName(), columnNames.get(i)); + if (columnMetadata.getName().equals(BUCKET_COLUMN_NAME)) { + // $bucket_number should be hidden column + assertTrue(columnMetadata.isHidden()); + } + } + assertEquals(getBucketCount("test_bucket_hidden_column"), 2); + + MaterializedResult results = computeActual(format("SELECT *, \"%1$s\" FROM test_bucket_hidden_column WHERE \"%1$s\" = 1", + BUCKET_COLUMN_NAME)); + for (int i = 0; i < results.getRowCount(); i++) { + MaterializedRow row = results.getMaterializedRows().get(i); + int col0 = (int) row.getField(0); + int col1 = (int) row.getField(1); + int bucket = (int) row.getField(2); + + assertEquals(col1, col0 + 11); + assertTrue(col1 % 2 == 0); + + // Because Hive's hash function for integer n is h(n) = n. + assertEquals(bucket, col0 % 2); + } + assertEquals(results.getRowCount(), 4); + + assertUpdate("DROP TABLE test_bucket_hidden_column"); + assertFalse(getQueryRunner().tableExists(getSession(), "test_bucket_hidden_column")); + } + + @Test + public void testDeleteAndInsert() + { + Session session = getSession(); + + // Partition 1 is untouched + // Partition 2 is altered (dropped and then added back) + // Partition 3 is added + // Partition 4 is dropped + + assertUpdate( + session, + "CREATE TABLE tmp_delete_insert WITH (partitioned_by=array ['z']) AS " + + "SELECT * from (VALUES (CAST (101 AS BIGINT), CAST (1 AS BIGINT)), (201, 2), (202, 2), (401, 4), (402, 4), (403, 4)) t(a, z)", + 6); + + List expectedBefore = MaterializedResult.resultBuilder(session, BIGINT, BIGINT) + .row(101L, 1L) + .row(201L, 2L) + .row(202L, 2L) + .row(401L, 4L) + .row(402L, 4L) + .row(403L, 4L) + .build() + .getMaterializedRows(); + List expectedAfter = MaterializedResult.resultBuilder(session, BIGINT, BIGINT) + .row(101L, 1L) + .row(203L, 2L) + .row(204L, 2L) + .row(205L, 2L) + .row(301L, 2L) + .row(302L, 3L) + .build() + .getMaterializedRows(); + + try { + transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getAccessControl()) + .execute(session, transactionSession -> { + assertUpdate(transactionSession, "DELETE FROM tmp_delete_insert WHERE z >= 2"); + assertUpdate(transactionSession, "INSERT INTO tmp_delete_insert VALUES (203, 2), (204, 2), (205, 2), (301, 2), (302, 3)", 5); + MaterializedResult actualFromAnotherTransaction = computeActual(session, "SELECT * FROM tmp_delete_insert"); + assertEqualsIgnoreOrder(actualFromAnotherTransaction, expectedBefore); + MaterializedResult actualFromCurrentTransaction = computeActual(transactionSession, "SELECT * FROM tmp_delete_insert"); + assertEqualsIgnoreOrder(actualFromCurrentTransaction, expectedAfter); + rollback(); + }); + } + catch (RollbackException e) { + // ignore + } + + MaterializedResult actualAfterRollback = computeActual(session, "SELECT * FROM tmp_delete_insert"); + assertEqualsIgnoreOrder(actualAfterRollback, expectedBefore); + + transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getAccessControl()) + 
.execute(session, transactionSession -> { + assertUpdate(transactionSession, "DELETE FROM tmp_delete_insert WHERE z >= 2"); + assertUpdate(transactionSession, "INSERT INTO tmp_delete_insert VALUES (203, 2), (204, 2), (205, 2), (301, 2), (302, 3)", 5); + MaterializedResult actualOutOfTransaction = computeActual(session, "SELECT * FROM tmp_delete_insert"); + assertEqualsIgnoreOrder(actualOutOfTransaction, expectedBefore); + MaterializedResult actualInTransaction = computeActual(transactionSession, "SELECT * FROM tmp_delete_insert"); + assertEqualsIgnoreOrder(actualInTransaction, expectedAfter); + }); + + MaterializedResult actualAfterTransaction = computeActual(session, "SELECT * FROM tmp_delete_insert"); + assertEqualsIgnoreOrder(actualAfterTransaction, expectedAfter); + } + + @Test + public void testCreateAndInsert() + { + Session session = getSession(); + + List expected = MaterializedResult.resultBuilder(session, BIGINT, BIGINT) + .row(101L, 1L) + .row(201L, 2L) + .row(202L, 2L) + .row(301L, 3L) + .row(302L, 3L) + .build() + .getMaterializedRows(); + + transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getAccessControl()) + .execute(session, transactionSession -> { + assertUpdate( + transactionSession, + "CREATE TABLE tmp_create_insert WITH (partitioned_by=array ['z']) AS " + + "SELECT * from (VALUES (CAST (101 AS BIGINT), CAST (1 AS BIGINT)), (201, 2), (202, 2)) t(a, z)", + 3); + assertUpdate(transactionSession, "INSERT INTO tmp_create_insert VALUES (301, 3), (302, 3)", 2); + MaterializedResult actualFromCurrentTransaction = computeActual(transactionSession, "SELECT * FROM tmp_create_insert"); + assertEqualsIgnoreOrder(actualFromCurrentTransaction, expected); + }); + + MaterializedResult actualAfterTransaction = computeActual(session, "SELECT * FROM tmp_create_insert"); + assertEqualsIgnoreOrder(actualAfterTransaction, expected); + } + + @Test + public void testAddColumn() + { + assertUpdate("CREATE TABLE test_add_column (a bigint COMMENT 'test comment AAA')"); + assertUpdate("ALTER TABLE test_add_column ADD COLUMN b bigint COMMENT 'test comment BBB'"); + assertQueryFails("ALTER TABLE test_add_column ADD COLUMN a varchar", ".* Column 'a' already exists"); + assertQueryFails("ALTER TABLE test_add_column ADD COLUMN c bad_type", ".* Unknown type 'bad_type' for column 'c'"); + assertQuery("SHOW COLUMNS FROM test_add_column", "VALUES ('a', 'bigint', '', 'test comment AAA'), ('b', 'bigint', '', 'test comment BBB')"); + assertUpdate("DROP TABLE test_add_column"); + } + + @Test + public void testRenameColumn() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_rename_column\n" + + "WITH (\n" + + " partitioned_by = ARRAY ['orderstatus']\n" + + ")\n" + + "AS\n" + + "SELECT orderkey, orderstatus FROM orders"; + + assertUpdate(createTable, "SELECT count(*) FROM orders"); + assertUpdate("ALTER TABLE test_rename_column RENAME COLUMN orderkey TO new_orderkey"); + assertQuery("SELECT new_orderkey, orderstatus FROM test_rename_column", "SELECT NULL, orderstatus FROM orders where orderstatus != 'dfd'"); + assertQueryFails("ALTER TABLE test_rename_column RENAME COLUMN \"$path\" TO test", ".* Cannot rename hidden column"); + assertQueryFails("ALTER TABLE test_rename_column RENAME COLUMN orderstatus TO new_orderstatus", "Renaming partition columns is not supported"); + assertQuery("SELECT new_orderkey, orderstatus FROM test_rename_column", "SELECT NULL, orderstatus FROM orders"); + assertUpdate("DROP TABLE test_rename_column"); + } + + @Test + public void 
testDropColumn() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_drop_column\n" + + "WITH (\n" + + " partitioned_by = ARRAY ['orderstatus']\n" + + ")\n" + + "AS\n" + + "SELECT custkey, orderkey, orderstatus FROM orders"; + + assertUpdate(createTable, "SELECT count(*) FROM orders"); + assertQuery("SELECT orderkey, orderstatus FROM test_drop_column", "SELECT orderkey, orderstatus FROM orders"); + + assertQueryFails("ALTER TABLE test_drop_column DROP COLUMN \"$path\"", ".* Cannot drop hidden column"); + assertQueryFails("ALTER TABLE test_drop_column DROP COLUMN orderstatus", "Cannot drop partition columns"); + assertUpdate("ALTER TABLE test_drop_column DROP COLUMN orderkey"); + assertQueryFails("ALTER TABLE test_drop_column DROP COLUMN custkey", "Cannot drop the only non-partition column in a table"); + assertQuery("SELECT * FROM test_drop_column", "SELECT custkey, orderstatus FROM orders"); + + assertUpdate("DROP TABLE test_drop_column"); + } + + @Test + public void testDropBucketingColumn() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_drop_bucketing_column\n" + + "WITH (\n" + + " bucket_count = 5, bucketed_by = ARRAY ['orderstatus']\n" + + ")\n" + + "AS\n" + + "SELECT custkey, orderkey, orderstatus FROM orders"; + + assertUpdate(createTable, "SELECT count(*) FROM orders"); + assertQuery("SELECT orderkey, orderstatus FROM test_drop_bucketing_column", "SELECT orderkey, orderstatus FROM orders"); + + assertQueryFails("ALTER TABLE test_drop_bucketing_column DROP COLUMN orderstatus", "Cannot drop bucketing columns"); + assertQuery("SELECT * FROM test_drop_bucketing_column", "SELECT custkey, orderkey, orderstatus FROM orders"); + + assertUpdate("DROP TABLE test_drop_bucketing_column"); + } + + @Test + private void testRenameBucketingColumn() + { + @Language("SQL") String createTable = "" + + "CREATE TABLE test_rename_bucketing_column\n" + + "WITH (\n" + + " bucket_count = 5, bucketed_by = ARRAY ['orderstatus']\n" + + ")\n" + + "AS\n" + + "SELECT custkey, orderkey, orderstatus FROM orders"; + + assertUpdate(createTable, "SELECT count(*) FROM orders"); + assertQuery("SELECT orderkey, orderstatus FROM test_rename_bucketing_column", "SELECT orderkey, orderstatus FROM orders"); + + assertUpdate("ALTER TABLE test_rename_bucketing_column RENAME COLUMN orderstatus TO orderstatus1"); + assertQuery("SELECT orderkey, orderstatus1 FROM test_rename_bucketing_column", "SELECT orderkey, orderstatus FROM orders"); + + assertUpdate("DROP TABLE test_rename_bucketing_column"); + } + + @Test + public void testAvroTypeValidation() + { + assertQueryFails("CREATE TABLE test_avro_types (x map(bigint, bigint)) WITH (format = 'AVRO')", "Column x has a non-varchar map key, which is not supported by Avro"); + assertQueryFails("CREATE TABLE test_avro_types (x tinyint) WITH (format = 'AVRO')", "Column x is tinyint, which is not supported by Avro. Use integer instead."); + assertQueryFails("CREATE TABLE test_avro_types (x smallint) WITH (format = 'AVRO')", "Column x is smallint, which is not supported by Avro. Use integer instead."); + + assertQueryFails("CREATE TABLE test_avro_types WITH (format = 'AVRO') AS SELECT cast(42 AS smallint) z", "Column z is smallint, which is not supported by Avro. 
Use integer instead."); + } + + @Test + public void testOrderByChar() + { + assertUpdate("CREATE TABLE char_order_by (c_char char(2))"); + assertUpdate("INSERT INTO char_order_by (c_char) VALUES" + + "(CAST('a' as CHAR(2)))," + + "(CAST('a\0' as CHAR(2)))," + + "(CAST('a ' as CHAR(2)))", 3); + + MaterializedResult actual = computeActual(getSession(), + "SELECT * FROM char_order_by ORDER BY c_char ASC"); + + assertUpdate("DROP TABLE char_order_by"); + + MaterializedResult expected = resultBuilder(getSession(), createCharType(2)) + .row("a\0") + .row("a ") + .row("a ") + .build(); + + assertEquals(actual, expected); + } + + /** + * Tests correctness of comparison of char(x) and varchar pushed down to a table scan as a TupleDomain + */ + @Test + public void testPredicatePushDownToTableScan() + { + // Test not specific to Hive, but needs a connector supporting table creation + + assertUpdate("CREATE TABLE test_table_with_char (a char(20))"); + try { + assertUpdate("INSERT INTO test_table_with_char (a) VALUES" + + "(cast('aaa' as char(20)))," + + "(cast('bbb' as char(20)))," + + "(cast('bbc' as char(20)))," + + "(cast('bbd' as char(20)))", 4); + + assertQuery( + "SELECT a, a <= 'bbc' FROM test_table_with_char", + "VALUES (cast('aaa' as char(20)), true), " + + "(cast('bbb' as char(20)), true), " + + "(cast('bbc' as char(20)), true), " + + "(cast('bbd' as char(20)), false)"); + + assertQuery( + "SELECT a FROM test_table_with_char WHERE a <= 'bbc'", + "VALUES cast('aaa' as char(20)), " + + "cast('bbb' as char(20)), " + + "cast('bbc' as char(20))"); + } + finally { + assertUpdate("DROP TABLE test_table_with_char"); + } + } + + @Test + public void testPartitionPruning() + { + assertUpdate("CREATE TABLE test_partition_pruning (v bigint, k varchar) WITH (partitioned_by = array['k'])"); + assertUpdate("INSERT INTO test_partition_pruning (v, k) VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'e')", 4); + + try { + String query = "SELECT * FROM test_partition_pruning WHERE k = 'a'"; + assertQuery(query, "VALUES (1, 'a')"); + assertConstraints( + query, + ImmutableSet.of( + new ColumnConstraint( + "k", + VARCHAR.getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of("a"), EXACTLY), + new FormattedMarker(Optional.of("a"), EXACTLY))))))); + + query = "SELECT * FROM test_partition_pruning WHERE k IN ('a', 'b')"; + assertQuery(query, "VALUES (1, 'a'), (2, 'b')"); + assertConstraints( + query, + ImmutableSet.of( + new ColumnConstraint( + "k", + VARCHAR.getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of("a"), EXACTLY), + new FormattedMarker(Optional.of("a"), EXACTLY)), + new FormattedRange( + new FormattedMarker(Optional.of("b"), EXACTLY), + new FormattedMarker(Optional.of("b"), EXACTLY))))))); + + query = "SELECT * FROM test_partition_pruning WHERE k >= 'b'"; + assertQuery(query, "VALUES (2, 'b'), (3, 'c'), (4, 'e')"); + assertConstraints( + query, + ImmutableSet.of( + new ColumnConstraint( + "k", + VARCHAR.getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of("b"), EXACTLY), + new FormattedMarker(Optional.of("b"), EXACTLY)), + new FormattedRange( + new FormattedMarker(Optional.of("c"), EXACTLY), + new FormattedMarker(Optional.of("c"), EXACTLY)), + new FormattedRange( + new FormattedMarker(Optional.of("e"), EXACTLY), + new FormattedMarker(Optional.of("e"), EXACTLY))))))); + + query = "SELECT * FROM 
(" + + " SELECT * " + + " FROM test_partition_pruning " + + " WHERE v IN (1, 2, 4) " + + ") t " + + "WHERE t.k >= 'b'"; + assertQuery(query, "VALUES (2, 'b'), (4, 'e')"); + assertConstraints( + query, + ImmutableSet.of( + new ColumnConstraint( + "k", + VARCHAR.getTypeSignature(), + new FormattedDomain( + false, + ImmutableSet.of( + new FormattedRange( + new FormattedMarker(Optional.of("b"), EXACTLY), + new FormattedMarker(Optional.of("b"), EXACTLY)), + new FormattedRange( + new FormattedMarker(Optional.of("c"), EXACTLY), + new FormattedMarker(Optional.of("c"), EXACTLY)), + new FormattedRange( + new FormattedMarker(Optional.of("e"), EXACTLY), + new FormattedMarker(Optional.of("e"), EXACTLY))))))); + } + finally { + assertUpdate("DROP TABLE test_partition_pruning"); + } + } + + @Test + public void testMismatchedBucketing() + { + try { + assertUpdate( + "CREATE TABLE test_mismatch_bucketing16\n" + + "WITH (bucket_count = 16, bucketed_by = ARRAY['key16']) AS\n" + + "SELECT orderkey key16, comment value16 FROM orders", + 15000); + assertUpdate( + "CREATE TABLE test_mismatch_bucketing32\n" + + "WITH (bucket_count = 32, bucketed_by = ARRAY['key32']) AS\n" + + "SELECT orderkey key32, comment value32 FROM orders", + 15000); + assertUpdate( + "CREATE TABLE test_mismatch_bucketingN AS\n" + + "SELECT orderkey keyN, comment valueN FROM orders", + 15000); + + Session withMismatchOptimization = Session.builder(getSession()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .setSystemProperty(JOIN_REORDERING_STRATEGY, JoinReorderingStrategy.NONE.name()) + .setSystemProperty(JOIN_DISTRIBUTION_TYPE, JoinDistributionType.PARTITIONED.name()) + .setCatalogSessionProperty(catalog, "optimize_mismatched_bucket_count", "true") + .build(); + Session withoutMismatchOptimization = Session.builder(getSession()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .setSystemProperty(JOIN_REORDERING_STRATEGY, JoinReorderingStrategy.NONE.name()) + .setSystemProperty(JOIN_DISTRIBUTION_TYPE, JoinDistributionType.PARTITIONED.name()) + .setCatalogSessionProperty(catalog, "optimize_mismatched_bucket_count", "false") + .build(); + + @Language("SQL") String writeToTableWithMoreBuckets = "CREATE TABLE test_mismatch_bucketing_out32\n" + + "WITH (bucket_count = 32, bucketed_by = ARRAY['key16'])\n" + + "AS\n" + + "SELECT key16, value16, key32, value32, keyN, valueN\n" + + "FROM\n" + + " test_mismatch_bucketing16\n" + + "JOIN\n" + + " test_mismatch_bucketing32\n" + + "ON key16=key32\n" + + "JOIN\n" + + " test_mismatch_bucketingN\n" + + "ON key16=keyN"; + @Language("SQL") String writeToTableWithFewerBuckets = "CREATE TABLE test_mismatch_bucketing_out8\n" + + "WITH (bucket_count = 8, bucketed_by = ARRAY['key16'])\n" + + "AS\n" + + "SELECT key16, value16, key32, value32, keyN, valueN\n" + + "FROM\n" + + " test_mismatch_bucketing16\n" + + "JOIN\n" + + " test_mismatch_bucketing32\n" + + "ON key16=key32\n" + + "JOIN\n" + + " test_mismatch_bucketingN\n" + + "ON key16=keyN"; + + assertUpdate(withoutMismatchOptimization, writeToTableWithMoreBuckets, 15000, assertRemoteExchangesCount(4)); + assertQuery("SELECT * FROM test_mismatch_bucketing_out32", "SELECT orderkey, comment, orderkey, comment, orderkey, comment from orders"); + assertUpdate("DROP TABLE IF EXISTS test_mismatch_bucketing_out32"); + + assertUpdate(withMismatchOptimization, writeToTableWithMoreBuckets, 15000, assertRemoteExchangesCount(2)); + assertQuery("SELECT * FROM 
test_mismatch_bucketing_out32", "SELECT orderkey, comment, orderkey, comment, orderkey, comment from orders"); + + assertUpdate(withMismatchOptimization, writeToTableWithFewerBuckets, 15000, assertRemoteExchangesCount(2)); + assertQuery("SELECT * FROM test_mismatch_bucketing_out8", "SELECT orderkey, comment, orderkey, comment, orderkey, comment from orders"); + } + finally { + assertUpdate("DROP TABLE IF EXISTS test_mismatch_bucketing16"); + assertUpdate("DROP TABLE IF EXISTS test_mismatch_bucketing32"); + assertUpdate("DROP TABLE IF EXISTS test_mismatch_bucketingN"); + assertUpdate("DROP TABLE IF EXISTS test_mismatch_bucketing_out32"); + assertUpdate("DROP TABLE IF EXISTS test_mismatch_bucketing_out8"); + } + } + + @Test + public void testGroupedExecution() + { + try { + assertUpdate( + "CREATE TABLE test_grouped_join1\n" + + "WITH (bucket_count = 13, bucketed_by = ARRAY['key1']) AS\n" + + "SELECT orderkey key1, comment value1 FROM orders", + 15000); + assertUpdate( + "CREATE TABLE test_grouped_join2\n" + + "WITH (bucket_count = 13, bucketed_by = ARRAY['key2']) AS\n" + + "SELECT orderkey key2, comment value2 FROM orders", + 15000); + assertUpdate( + "CREATE TABLE test_grouped_join3\n" + + "WITH (bucket_count = 13, bucketed_by = ARRAY['key3']) AS\n" + + "SELECT orderkey key3, comment value3 FROM orders", + 15000); + assertUpdate( + "CREATE TABLE test_grouped_join4\n" + + "WITH (bucket_count = 13, bucketed_by = ARRAY['key4_bucket']) AS\n" + + "SELECT orderkey key4_bucket, orderkey key4_non_bucket, comment value4 FROM orders", + 15000); + assertUpdate( + "CREATE TABLE test_grouped_joinN AS\n" + + "SELECT orderkey keyN, comment valueN FROM orders", + 15000); + assertUpdate( + "CREATE TABLE test_grouped_joinDual\n" + + "WITH (bucket_count = 13, bucketed_by = ARRAY['keyD']) AS\n" + + "SELECT orderkey keyD, comment valueD FROM orders CROSS JOIN UNNEST(repeat(NULL, 2))", + 30000); + assertUpdate( + "CREATE TABLE test_grouped_window\n" + + "WITH (bucket_count = 5, bucketed_by = ARRAY['key']) AS\n" + + "SELECT custkey key, orderkey value FROM orders WHERE custkey <= 5 ORDER BY orderkey LIMIT 10", + 10); + + // NOT grouped execution; default + Session notColocated = Session.builder(getSession()) + .setSystemProperty(COLOCATED_JOIN, "false") + .setSystemProperty(GROUPED_EXECUTION, "false") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(); + // Co-located JOIN with all groups at once, fixed schedule + Session colocatedAllGroupsAtOnce = Session.builder(getSession()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(GROUPED_EXECUTION, "true") + .setSystemProperty(CONCURRENT_LIFESPANS_PER_NODE, "0") + .setSystemProperty(DYNAMIC_SCHEDULE_FOR_GROUPED_EXECUTION, "false") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(); + // Co-located JOIN, 1 group per worker at a time, fixed schedule + Session colocatedOneGroupAtATime = Session.builder(getSession()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(GROUPED_EXECUTION, "true") + .setSystemProperty(CONCURRENT_LIFESPANS_PER_NODE, "1") + .setSystemProperty(DYNAMIC_SCHEDULE_FOR_GROUPED_EXECUTION, "false") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(); + // Co-located JOIN with all groups at once, dynamic schedule + Session colocatedAllGroupsAtOnceDynamic = Session.builder(getSession()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(GROUPED_EXECUTION, "true") + .setSystemProperty(CONCURRENT_LIFESPANS_PER_NODE, "0") + 
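+ // CONCURRENT_LIFESPANS_PER_NODE = 0 means no per-node limit, matching the "all groups at once" sessions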
.setSystemProperty(DYNAMIC_SCHEDULE_FOR_GROUPED_EXECUTION, "true") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(); + // Co-located JOIN, 1 group per worker at a time, dynamic schedule + Session colocatedOneGroupAtATimeDynamic = Session.builder(getSession()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(GROUPED_EXECUTION, "true") + .setSystemProperty(CONCURRENT_LIFESPANS_PER_NODE, "1") + .setSystemProperty(DYNAMIC_SCHEDULE_FOR_GROUPED_EXECUTION, "true") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(); + // Broadcast JOIN, 1 group per worker at a time + Session broadcastOneGroupAtATime = Session.builder(getSession()) + .setSystemProperty(JOIN_DISTRIBUTION_TYPE, BROADCAST.name()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(GROUPED_EXECUTION, "true") + .setSystemProperty(CONCURRENT_LIFESPANS_PER_NODE, "1") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(); + + // Broadcast JOIN, 1 group per worker at a time, dynamic schedule + Session broadcastOneGroupAtATimeDynamic = Session.builder(getSession()) + .setSystemProperty(JOIN_DISTRIBUTION_TYPE, BROADCAST.name()) + .setSystemProperty(COLOCATED_JOIN, "true") + .setSystemProperty(GROUPED_EXECUTION, "true") + .setSystemProperty(CONCURRENT_LIFESPANS_PER_NODE, "1") + .setSystemProperty(DYNAMIC_SCHEDULE_FOR_GROUPED_EXECUTION, "true") + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(); + + // + // HASH JOIN + // ========= + + @Language("SQL") String joinThreeBucketedTable = + "SELECT key1, value1, key2, value2, key3, value3\n" + + "FROM test_grouped_join1\n" + + "JOIN test_grouped_join2\n" + + "ON key1 = key2\n" + + "JOIN test_grouped_join3\n" + + "ON key2 = key3"; + @Language("SQL") String joinThreeMixedTable = + "SELECT key1, value1, key2, value2, keyN, valueN\n" + + "FROM test_grouped_join1\n" + + "JOIN test_grouped_join2\n" + + "ON key1 = key2\n" + + "JOIN test_grouped_joinN\n" + + "ON key2 = keyN"; + @Language("SQL") String expectedJoinQuery = "SELECT orderkey, comment, orderkey, comment, orderkey, comment from orders"; + @Language("SQL") String leftJoinBucketedTable = + "SELECT key1, value1, key2, value2\n" + + "FROM test_grouped_join1\n" + + "LEFT JOIN (SELECT * FROM test_grouped_join2 WHERE key2 % 2 = 0)\n" + + "ON key1 = key2"; + @Language("SQL") String rightJoinBucketedTable = + "SELECT key1, value1, key2, value2\n" + + "FROM (SELECT * FROM test_grouped_join2 WHERE key2 % 2 = 0)\n" + + "RIGHT JOIN test_grouped_join1\n" + + "ON key1 = key2"; + @Language("SQL") String expectedOuterJoinQuery = "SELECT orderkey, comment, CASE mod(orderkey, 2) WHEN 0 THEN orderkey END, CASE mod(orderkey, 2) WHEN 0 THEN comment END from orders"; + + assertQuery(notColocated, joinThreeBucketedTable, expectedJoinQuery); + assertQuery(notColocated, leftJoinBucketedTable, expectedOuterJoinQuery); + assertQuery(notColocated, rightJoinBucketedTable, expectedOuterJoinQuery); + + assertQuery(colocatedAllGroupsAtOnce, joinThreeBucketedTable, expectedJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedAllGroupsAtOnce, joinThreeMixedTable, expectedJoinQuery, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, joinThreeBucketedTable, expectedJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, joinThreeMixedTable, expectedJoinQuery, assertRemoteExchangesCount(2)); + assertQuery(colocatedAllGroupsAtOnceDynamic, joinThreeBucketedTable, expectedJoinQuery, assertRemoteExchangesCount(1)); + 
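+ // the dynamic-schedule sessions are expected to plan the same number of remote exchanges as their fixed-schedule counterparts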
assertQuery(colocatedAllGroupsAtOnceDynamic, joinThreeMixedTable, expectedJoinQuery, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATimeDynamic, joinThreeBucketedTable, expectedJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATimeDynamic, joinThreeMixedTable, expectedJoinQuery, assertRemoteExchangesCount(2)); + + assertQuery(colocatedAllGroupsAtOnce, leftJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedAllGroupsAtOnce, rightJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, leftJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, rightJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedAllGroupsAtOnceDynamic, leftJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedAllGroupsAtOnceDynamic, rightJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATimeDynamic, leftJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATimeDynamic, rightJoinBucketedTable, expectedOuterJoinQuery, assertRemoteExchangesCount(1)); + + // + // CROSS JOIN and HASH JOIN mixed + // ============================== + + @Language("SQL") String crossJoin = + "SELECT key1, value1, key2, value2, key3, value3\n" + + "FROM test_grouped_join1\n" + + "JOIN test_grouped_join2\n" + + "ON key1 = key2\n" + + "CROSS JOIN (SELECT * FROM test_grouped_join3 WHERE key3 <= 3)"; + @Language("SQL") String expectedCrossJoinQuery = + "SELECT key1, value1, key1, value1, key3, value3\n" + + "FROM\n" + + " (SELECT orderkey key1, comment value1 FROM orders)\n" + + "CROSS JOIN\n" + + " (SELECT orderkey key3, comment value3 FROM orders where orderkey <= 3)"; + assertQuery(notColocated, crossJoin, expectedCrossJoinQuery); + assertQuery(colocatedAllGroupsAtOnce, crossJoin, expectedCrossJoinQuery, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, crossJoin, expectedCrossJoinQuery, assertRemoteExchangesCount(2)); + + // + // Bucketed and unbucketed HASH JOIN mixed + // ======================================= + @Language("SQL") String bucketedAndUnbucketedJoin = + "SELECT key1, value1, keyN, valueN, key2, value2, key3, value3\n" + + "FROM\n" + + " test_grouped_join1\n" + + "JOIN (\n" + + " SELECT *\n" + + " FROM test_grouped_joinN\n" + + " JOIN test_grouped_join2\n" + + " ON keyN = key2\n" + + ")\n" + + "ON key1 = keyN\n" + + "JOIN test_grouped_join3\n" + + "ON key1 = key3"; + @Language("SQL") String expectedBucketedAndUnbucketedJoinQuery = "SELECT orderkey, comment, orderkey, comment, orderkey, comment, orderkey, comment from orders"; + assertQuery(notColocated, bucketedAndUnbucketedJoin, expectedBucketedAndUnbucketedJoinQuery); + assertQuery(colocatedAllGroupsAtOnce, bucketedAndUnbucketedJoin, expectedBucketedAndUnbucketedJoinQuery, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, bucketedAndUnbucketedJoin, expectedBucketedAndUnbucketedJoinQuery, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATimeDynamic, bucketedAndUnbucketedJoin, expectedBucketedAndUnbucketedJoinQuery, assertRemoteExchangesCount(2)); + + // + // UNION ALL / GROUP BY + // ==================== + + @Language("SQL") String groupBySingleBucketed = + "SELECT\n" + + " keyD,\n" + + " count(valueD)\n" + + 
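+ // test_grouped_joinDual contains every orderkey twice, so count(valueD) per keyD is expected to be 2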
"FROM\n" + + " test_grouped_joinDual\n" + + "GROUP BY keyD"; + @Language("SQL") String expectedSingleGroupByQuery = "SELECT orderkey, 2 from orders"; + @Language("SQL") String groupByOfUnionBucketed = + "SELECT\n" + + " key\n" + + ", arbitrary(value1)\n" + + ", arbitrary(value2)\n" + + ", arbitrary(value3)\n" + + "FROM (\n" + + " SELECT key1 key, value1, NULL value2, NULL value3\n" + + " FROM test_grouped_join1\n" + + "UNION ALL\n" + + " SELECT key2 key, NULL value1, value2, NULL value3\n" + + " FROM test_grouped_join2\n" + + " WHERE key2 % 2 = 0\n" + + "UNION ALL\n" + + " SELECT key3 key, NULL value1, NULL value2, value3\n" + + " FROM test_grouped_join3\n" + + " WHERE key3 % 3 = 0\n" + + ")\n" + + "GROUP BY key"; + @Language("SQL") String groupByOfUnionMixed = + "SELECT\n" + + " key\n" + + ", arbitrary(value1)\n" + + ", arbitrary(value2)\n" + + ", arbitrary(valueN)\n" + + "FROM (\n" + + " SELECT key1 key, value1, NULL value2, NULL valueN\n" + + " FROM test_grouped_join1\n" + + "UNION ALL\n" + + " SELECT key2 key, NULL value1, value2, NULL valueN\n" + + " FROM test_grouped_join2\n" + + " WHERE key2 % 2 = 0\n" + + "UNION ALL\n" + + " SELECT keyN key, NULL value1, NULL value2, valueN\n" + + " FROM test_grouped_joinN\n" + + " WHERE keyN % 3 = 0\n" + + ")\n" + + "GROUP BY key"; + @Language("SQL") String expectedGroupByOfUnion = "SELECT orderkey, comment, CASE mod(orderkey, 2) WHEN 0 THEN comment END, CASE mod(orderkey, 3) WHEN 0 THEN comment END from orders"; + // In this case: + // * left side can take advantage of bucketed execution + // * right side does not have the necessary organization to allow its parent to take advantage of bucketed execution + // In this scenario, we give up bucketed execution altogether. This can potentially be improved. + // + // AGG(key) + // | + // UNION ALL + // / \ + // AGG(key) Scan (not bucketed) + // | + // Scan (bucketed on key) + @Language("SQL") String groupByOfUnionOfGroupByMixed = + "SELECT\n" + + " key, sum(cnt) cnt\n" + + "FROM (\n" + + " SELECT keyD key, count(valueD) cnt\n" + + " FROM test_grouped_joinDual\n" + + " GROUP BY keyD\n" + + "UNION ALL\n" + + " SELECT keyN key, 1 cnt\n" + + " FROM test_grouped_joinN\n" + + ")\n" + + "group by key"; + @Language("SQL") String expectedGroupByOfUnionOfGroupBy = "SELECT orderkey, 3 from orders"; + + // Eligible GROUP BYs run in the same fragment regardless of colocated_join flag + assertQuery(colocatedAllGroupsAtOnce, groupBySingleBucketed, expectedSingleGroupByQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, groupBySingleBucketed, expectedSingleGroupByQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATimeDynamic, groupBySingleBucketed, expectedSingleGroupByQuery, assertRemoteExchangesCount(1)); + assertQuery(colocatedAllGroupsAtOnce, groupByOfUnionBucketed, expectedGroupByOfUnion, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, groupByOfUnionBucketed, expectedGroupByOfUnion, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATimeDynamic, groupByOfUnionBucketed, expectedGroupByOfUnion, assertRemoteExchangesCount(1)); + + // cannot be executed in a grouped manner but should still produce correct result + assertQuery(colocatedOneGroupAtATime, groupByOfUnionMixed, expectedGroupByOfUnion, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, groupByOfUnionOfGroupByMixed, expectedGroupByOfUnionOfGroupBy, assertRemoteExchangesCount(2)); + + // + // GROUP BY and JOIN mixed + // ======================== + 
@Language("SQL") String joinGroupedWithGrouped = + "SELECT key1, count1, count2\n" + + "FROM (\n" + + " SELECT keyD key1, count(valueD) count1\n" + + " FROM test_grouped_joinDual\n" + + " GROUP BY keyD\n" + + ") JOIN (\n" + + " SELECT keyD key2, count(valueD) count2\n" + + " FROM test_grouped_joinDual\n" + + " GROUP BY keyD\n" + + ")\n" + + "ON key1 = key2"; + @Language("SQL") String expectedJoinGroupedWithGrouped = "SELECT orderkey, 2, 2 from orders"; + @Language("SQL") String joinGroupedWithUngrouped = + "SELECT keyD, countD, valueN\n" + + "FROM (\n" + + " SELECT keyD, count(valueD) countD\n" + + " FROM test_grouped_joinDual\n" + + " GROUP BY keyD\n" + + ") JOIN (\n" + + " SELECT keyN, valueN\n" + + " FROM test_grouped_joinN\n" + + ")\n" + + "ON keyD = keyN"; + @Language("SQL") String expectedJoinGroupedWithUngrouped = "SELECT orderkey, 2, comment from orders"; + @Language("SQL") String joinUngroupedWithGrouped = + "SELECT keyN, valueN, countD\n" + + "FROM (\n" + + " SELECT keyN, valueN\n" + + " FROM test_grouped_joinN\n" + + ") JOIN (\n" + + " SELECT keyD, count(valueD) countD\n" + + " FROM test_grouped_joinDual\n" + + " GROUP BY keyD\n" + + ")\n" + + "ON keyN = keyD"; + @Language("SQL") String expectedJoinUngroupedWithGrouped = "SELECT orderkey, comment, 2 from orders"; + @Language("SQL") String groupOnJoinResult = + "SELECT keyD, count(valueD), count(valueN)\n" + + "FROM\n" + + " test_grouped_joinDual\n" + + "JOIN\n" + + " test_grouped_joinN\n" + + "ON keyD=keyN\n" + + "GROUP BY keyD"; + @Language("SQL") String expectedGroupOnJoinResult = "SELECT orderkey, 2, 2 from orders"; + + @Language("SQL") String groupOnUngroupedJoinResult = + "SELECT key4_bucket, count(value4), count(valueN)\n" + + "FROM\n" + + " test_grouped_join4\n" + + "JOIN\n" + + " test_grouped_joinN\n" + + "ON key4_non_bucket=keyN\n" + + "GROUP BY key4_bucket"; + @Language("SQL") String expectedGroupOnUngroupedJoinResult = "SELECT orderkey, count(*), count(*) from orders group by orderkey"; + + // Eligible GROUP BYs run in the same fragment regardless of colocated_join flag + assertQuery(colocatedAllGroupsAtOnce, joinGroupedWithGrouped, expectedJoinGroupedWithGrouped, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, joinGroupedWithGrouped, expectedJoinGroupedWithGrouped, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATimeDynamic, joinGroupedWithGrouped, expectedJoinGroupedWithGrouped, assertRemoteExchangesCount(1)); + assertQuery(colocatedAllGroupsAtOnce, joinGroupedWithUngrouped, expectedJoinGroupedWithUngrouped, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, joinGroupedWithUngrouped, expectedJoinGroupedWithUngrouped, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATimeDynamic, joinGroupedWithUngrouped, expectedJoinGroupedWithUngrouped, assertRemoteExchangesCount(2)); + assertQuery(colocatedAllGroupsAtOnce, groupOnJoinResult, expectedGroupOnJoinResult, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, groupOnJoinResult, expectedGroupOnJoinResult, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATimeDynamic, groupOnJoinResult, expectedGroupOnJoinResult, assertRemoteExchangesCount(2)); + + assertQuery(broadcastOneGroupAtATime, groupOnJoinResult, expectedGroupOnJoinResult, assertRemoteExchangesCount(2)); + assertQuery(broadcastOneGroupAtATime, groupOnUngroupedJoinResult, expectedGroupOnUngroupedJoinResult, assertRemoteExchangesCount(2)); + assertQuery(broadcastOneGroupAtATimeDynamic, 
groupOnUngroupedJoinResult, expectedGroupOnUngroupedJoinResult, assertRemoteExchangesCount(2)); + + // cannot be executed in a grouped manner but should still produce correct result + assertQuery(colocatedOneGroupAtATime, joinUngroupedWithGrouped, expectedJoinUngroupedWithGrouped, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, groupOnUngroupedJoinResult, expectedGroupOnUngroupedJoinResult, assertRemoteExchangesCount(4)); + + // + // Outer JOIN (that involves LookupOuterOperator) + // ============================================== + + // Chain on the probe side to test duplicating OperatorFactory + @Language("SQL") String chainedOuterJoin = + "SELECT key1, value1, key2, value2, key3, value3\n" + + "FROM\n" + + " (SELECT * FROM test_grouped_join1 where mod(key1, 2) = 0)\n" + + "RIGHT JOIN\n" + + " (SELECT * FROM test_grouped_join2 where mod(key2, 3) = 0)\n" + + "ON key1 = key2\n" + + "FULL JOIN\n" + + " (SELECT * FROM test_grouped_join3 where mod(key3, 5) = 0)\n" + + "ON key2 = key3"; + // Probe is grouped execution, but build is not + @Language("SQL") String sharedBuildOuterJoin = + "SELECT key1, value1, keyN, valueN\n" + + "FROM\n" + + " (SELECT key1, arbitrary(value1) value1 FROM test_grouped_join1 where mod(key1, 2) = 0 group by key1)\n" + + "RIGHT JOIN\n" + + " (SELECT * FROM test_grouped_joinN where mod(keyN, 3) = 0)\n" + + "ON key1 = keyN"; + // The preceding test case, which then feeds into another join + @Language("SQL") String chainedSharedBuildOuterJoin = + "SELECT key1, value1, keyN, valueN, key3, value3\n" + + "FROM\n" + + " (SELECT key1, arbitrary(value1) value1 FROM test_grouped_join1 where mod(key1, 2) = 0 group by key1)\n" + + "RIGHT JOIN\n" + + " (SELECT * FROM test_grouped_joinN where mod(keyN, 3) = 0)\n" + + "ON key1 = keyN\n" + + "FULL JOIN\n" + + " (SELECT * FROM test_grouped_join3 where mod(key3, 5) = 0)\n" + + "ON keyN = key3"; + @Language("SQL") String expectedChainedOuterJoinResult = "SELECT\n" + + " CASE WHEN mod(orderkey, 2 * 3) = 0 THEN orderkey END,\n" + + " CASE WHEN mod(orderkey, 2 * 3) = 0 THEN comment END,\n" + + " CASE WHEN mod(orderkey, 3) = 0 THEN orderkey END,\n" + + " CASE WHEN mod(orderkey, 3) = 0 THEN comment END,\n" + + " CASE WHEN mod(orderkey, 5) = 0 THEN orderkey END,\n" + + " CASE WHEN mod(orderkey, 5) = 0 THEN comment END\n" + + "FROM ORDERS\n" + + "WHERE mod(orderkey, 3) = 0 OR mod(orderkey, 5) = 0"; + @Language("SQL") String expectedSharedBuildOuterJoinResult = "SELECT\n" + + " CASE WHEN mod(orderkey, 2) = 0 THEN orderkey END,\n" + + " CASE WHEN mod(orderkey, 2) = 0 THEN comment END,\n" + + " orderkey,\n" + + " comment\n" + + "FROM ORDERS\n" + + "WHERE mod(orderkey, 3) = 0"; + + assertQuery(notColocated, chainedOuterJoin, expectedChainedOuterJoinResult); + assertQuery(colocatedAllGroupsAtOnce, chainedOuterJoin, expectedChainedOuterJoinResult, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, chainedOuterJoin, expectedChainedOuterJoinResult, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATimeDynamic, chainedOuterJoin, expectedChainedOuterJoinResult, assertRemoteExchangesCount(1)); + assertQuery(notColocated, sharedBuildOuterJoin, expectedSharedBuildOuterJoinResult); + assertQuery(colocatedAllGroupsAtOnce, sharedBuildOuterJoin, expectedSharedBuildOuterJoinResult, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, sharedBuildOuterJoin, expectedSharedBuildOuterJoinResult, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATimeDynamic, 
sharedBuildOuterJoin, expectedSharedBuildOuterJoinResult, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATime, chainedSharedBuildOuterJoin, expectedChainedOuterJoinResult, assertRemoteExchangesCount(2)); + assertQuery(colocatedOneGroupAtATimeDynamic, chainedSharedBuildOuterJoin, expectedChainedOuterJoinResult, assertRemoteExchangesCount(2)); + + // + // Window function + // =============== + assertQuery( + colocatedOneGroupAtATime, + "SELECT key, count(*) OVER (PARTITION BY key ORDER BY value) FROM test_grouped_window", + "VALUES\n" + + "(1, 1),\n" + + "(2, 1),\n" + + "(2, 2),\n" + + "(4, 1),\n" + + "(4, 2),\n" + + "(4, 3),\n" + + "(4, 4),\n" + + "(4, 5),\n" + + "(5, 1),\n" + + "(5, 2)", + assertRemoteExchangesCount(1)); + + assertQuery( + colocatedOneGroupAtATime, + "SELECT key, row_number() OVER (PARTITION BY key ORDER BY value) FROM test_grouped_window", + "VALUES\n" + + "(1, 1),\n" + + "(2, 1),\n" + + "(2, 2),\n" + + "(4, 1),\n" + + "(4, 2),\n" + + "(4, 3),\n" + + "(4, 4),\n" + + "(4, 5),\n" + + "(5, 1),\n" + + "(5, 2)", + assertRemoteExchangesCount(1)); + + assertQuery( + colocatedOneGroupAtATime, + "SELECT key, n FROM (SELECT key, row_number() OVER (PARTITION BY key ORDER BY value) AS n FROM test_grouped_window) WHERE n <= 2", + "VALUES\n" + + "(1, 1),\n" + + "(2, 1),\n" + + "(2, 2),\n" + + "(4, 1),\n" + + "(4, 2),\n" + + "(5, 1),\n" + + "(5, 2)", + assertRemoteExchangesCount(1)); + + // + // Filter out all or majority of splits + // ==================================== + @Language("SQL") String noSplits = + "SELECT key1, arbitrary(value1)\n" + + "FROM test_grouped_join1\n" + + "WHERE \"$bucket\" < 0\n" + + "GROUP BY key1"; + @Language("SQL") String joinMismatchedBuckets = + "SELECT key1, value1, key2, value2\n" + + "FROM (\n" + + " SELECT *\n" + + " FROM test_grouped_join1\n" + + " WHERE \"$bucket\"=1\n" + + ")\n" + + "FULL OUTER JOIN (\n" + + " SELECT *\n" + + " FROM test_grouped_join2\n" + + " WHERE \"$bucket\"=11\n" + + ")\n" + + "ON key1=key2"; + @Language("SQL") String expectedNoSplits = "SELECT 1, 'a' WHERE FALSE"; + @Language("SQL") String expectedJoinMismatchedBuckets = "SELECT\n" + + " CASE WHEN mod(orderkey, 13) = 1 THEN orderkey END,\n" + + " CASE WHEN mod(orderkey, 13) = 1 THEN comment END,\n" + + " CASE WHEN mod(orderkey, 13) = 11 THEN orderkey END,\n" + + " CASE WHEN mod(orderkey, 13) = 11 THEN comment END\n" + + "FROM ORDERS\n" + + "WHERE mod(orderkey, 13) IN (1, 11)"; + + assertQuery(notColocated, noSplits, expectedNoSplits); + assertQuery(colocatedAllGroupsAtOnce, noSplits, expectedNoSplits, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, noSplits, expectedNoSplits, assertRemoteExchangesCount(1)); + assertQuery(notColocated, joinMismatchedBuckets, expectedJoinMismatchedBuckets); + assertQuery(colocatedAllGroupsAtOnce, joinMismatchedBuckets, expectedJoinMismatchedBuckets, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime, joinMismatchedBuckets, expectedJoinMismatchedBuckets, assertRemoteExchangesCount(1)); + + Session notColocated1 = Session.builder(notColocated) + .setCatalogSessionProperty(notColocated.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .build(); + Session colocatedAllGroupsAtOnce1 = Session.builder(colocatedAllGroupsAtOnce) + .setCatalogSessionProperty(colocatedAllGroupsAtOnce.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .build(); + Session colocatedOneGroupAtATime1 = Session.builder(colocatedOneGroupAtATime) + 
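+ // the *1 sessions repeat the no-split and mismatched-bucket checks with orc_predicate_pushdown_enabled turned on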
.setCatalogSessionProperty(colocatedOneGroupAtATime.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .build(); + assertQuery(notColocated1, noSplits, expectedNoSplits); + + assertQuery(colocatedAllGroupsAtOnce1, noSplits, expectedNoSplits, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime1, noSplits, expectedNoSplits, assertRemoteExchangesCount(1)); + assertQuery(notColocated1, joinMismatchedBuckets, expectedJoinMismatchedBuckets); + assertQuery(colocatedAllGroupsAtOnce1, joinMismatchedBuckets, expectedJoinMismatchedBuckets, assertRemoteExchangesCount(1)); + assertQuery(colocatedOneGroupAtATime1, joinMismatchedBuckets, expectedJoinMismatchedBuckets, assertRemoteExchangesCount(1)); + } + finally { + assertUpdate("DROP TABLE IF EXISTS test_grouped_join1"); + assertUpdate("DROP TABLE IF EXISTS test_grouped_join2"); + assertUpdate("DROP TABLE IF EXISTS test_grouped_join3"); + assertUpdate("DROP TABLE IF EXISTS test_grouped_join4"); + assertUpdate("DROP TABLE IF EXISTS test_grouped_joinN"); + assertUpdate("DROP TABLE IF EXISTS test_grouped_joinDual"); + assertUpdate("DROP TABLE IF EXISTS test_grouped_window"); + } + } + + private Consumer assertRemoteExchangesCount(int expectedRemoteExchangesCount) + { + return plan -> + { + int actualRemoteExchangesCount = searchFrom(plan.getRoot()) + .where(node -> node instanceof ExchangeNode && ((ExchangeNode) node).getScope() == ExchangeNode.Scope.REMOTE) + .findAll() + .size(); + if (actualRemoteExchangesCount != expectedRemoteExchangesCount) { + Session session = getSession(); + Metadata metadata = ((DistributedQueryRunner) getQueryRunner()).getCoordinator().getMetadata(); + String formattedPlan = textLogicalPlan(plan.getRoot(), plan.getTypes(), metadata, StatsAndCosts.empty(), session, 0, false); + throw new AssertionError(format( + "Expected [\n%s\n] remote exchanges but found [\n%s\n] remote exchanges. Actual plan is [\n\n%s\n]", + expectedRemoteExchangesCount, + actualRemoteExchangesCount, + formattedPlan)); + } + }; + } + + // Presto: Check if there is a partitioned exchange node in logic plan + private Consumer assertRemotePartitionedExchange(String partitionColumn) + { + return plan -> + { + int partitionedExchangeCount = searchFrom(plan.getRoot()) + .where(node -> node instanceof ExchangeNode && ((ExchangeNode) node).getScope() == ExchangeNode.Scope.REMOTE + && ((ExchangeNode) node).getPartitioningScheme().getPartitioning().getHandle().toString().equals("HASH") + && ((ExchangeNode) node).getPartitioningScheme().getPartitioning().getArguments().get(0).getColumn().getName().equals(partitionColumn)) + .findAll() + .size(); + + if (partitionedExchangeCount != 1) { + throw new AssertionError(format( + "Found [\n%s\n] remote partitioned exchanges.", + partitionedExchangeCount)); + } + }; + } + + @Test + public void testRcTextCharDecoding() + { + assertUpdate("CREATE TABLE test_table_with_char_rc WITH (format = 'RCTEXT') AS SELECT CAST('khaki' AS CHAR(7)) char_column", 1); + try { + assertQuery( + "SELECT * FROM test_table_with_char_rc WHERE char_column = 'khaki '", + "VALUES (CAST('khaki' AS CHAR(7)))"); + } + finally { + assertUpdate("DROP TABLE test_table_with_char_rc"); + } + } + + @Test + public void testInvalidPartitionValue() + { + assertUpdate("CREATE TABLE invalid_partition_value (a int, b varchar) WITH (partitioned_by = ARRAY['b'])"); + assertQueryFails( + "INSERT INTO invalid_partition_value VALUES (4, 'test' || chr(13))", + "\\QHive partition keys can only contain printable ASCII characters (0x20 - 0x7E). 
Invalid value: 74 65 73 74 0D\\E"); + assertUpdate("DROP TABLE invalid_partition_value"); + + assertQueryFails( + "CREATE TABLE invalid_partition_value (a, b) WITH (partitioned_by = ARRAY['b']) AS SELECT 4, chr(9731)", + "\\QHive partition keys can only contain printable ASCII characters (0x20 - 0x7E). Invalid value: E2 98 83\\E"); + } + + @Test + public void testShowColumnMetadata() + { + String tableName = "test_show_column_table"; + + @Language("SQL") String createTable = "CREATE TABLE " + tableName + " (a bigint, b varchar, c double)"; + + Session testSession = testSessionBuilder() + .setIdentity(new Identity("test_access_owner", Optional.empty())) + .setCatalog(getSession().getCatalog().get()) + .setSchema(getSession().getSchema().get()) + .build(); + + assertUpdate(createTable); + + // verify showing columns over a table requires SELECT privileges for the table + assertAccessAllowed("SHOW COLUMNS FROM " + tableName); + assertAccessDenied(testSession, + "SHOW COLUMNS FROM " + tableName, + "Cannot show columns of table .*." + tableName + ".*", + privilege(tableName, SELECT_COLUMN)); + + @Language("SQL") String getColumnsSql = "" + + "SELECT lower(column_name) " + + "FROM information_schema.columns " + + "WHERE table_name = '" + tableName + "'"; + assertEquals(computeActual(getColumnsSql).getOnlyColumnAsSet(), ImmutableSet.of("a", "b", "c")); + + // verify with no SELECT privileges on table, querying information_schema will return empty columns + executeExclusively(() -> { + try { + getQueryRunner().getAccessControl().deny(privilege(tableName, SELECT_COLUMN)); + assertQueryReturnsEmptyResult(testSession, getColumnsSql); + } + finally { + getQueryRunner().getAccessControl().reset(); + } + }); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testCurrentUserInView() + { + checkState(getSession().getCatalog().isPresent(), "catalog is not set"); + checkState(getSession().getSchema().isPresent(), "schema is not set"); + String testAccountsUnqualifiedName = "test_accounts"; + String testAccountsViewUnqualifiedName = "test_accounts_view"; + String testAccountsViewFullyQualifiedName = format("%s.%s.%s", getSession().getCatalog().get(), getSession().getSchema().get(), testAccountsViewUnqualifiedName); + assertUpdate(format("CREATE TABLE %s AS SELECT user_name, account_name" + + " FROM (VALUES ('user1', 'account1'), ('user2', 'account2'))" + + " t (user_name, account_name)", testAccountsUnqualifiedName), 2); + assertUpdate(format("CREATE VIEW %s AS SELECT account_name FROM test_accounts WHERE user_name = CURRENT_USER", testAccountsViewUnqualifiedName)); + assertUpdate(format("GRANT SELECT ON %s TO user1", testAccountsViewFullyQualifiedName)); + assertUpdate(format("GRANT SELECT ON %s TO user2", testAccountsViewFullyQualifiedName)); + + Session user1 = testSessionBuilder() + .setCatalog(getSession().getCatalog().get()) + .setSchema(getSession().getSchema().get()) + .setIdentity(new Identity("user1", getSession().getIdentity().getPrincipal())) + .build(); + + Session user2 = testSessionBuilder() + .setCatalog(getSession().getCatalog().get()) + .setSchema(getSession().getSchema().get()) + .setIdentity(new Identity("user2", getSession().getIdentity().getPrincipal())) + .build(); + + assertQuery(user1, "SELECT account_name FROM test_accounts_view", "VALUES 'account1'"); + assertQuery(user2, "SELECT account_name FROM test_accounts_view", "VALUES 'account2'"); + assertUpdate("DROP VIEW test_accounts_view"); + assertUpdate("DROP TABLE test_accounts"); + } + + @Test + public void 
testCollectColumnStatisticsOnCreateTable() + { + String tableName = "test_collect_column_statistics_on_create_table"; + assertUpdate(format("" + + "CREATE TABLE %s " + + "WITH ( " + + " partitioned_by = ARRAY['p_varchar'] " + + ") " + + "AS " + + "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar " + + "FROM ( " + + " VALUES " + + " (null, null, null, null, null, null, 'p1'), " + + " (null, null, null, null, null, null, 'p1'), " + + " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), CAST('bcd1' AS VARBINARY), 'p1')," + + " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), CAST('bcd2' AS VARBINARY), 'p1')," + + " (null, null, null, null, null, null, 'p2'), " + + " (null, null, null, null, null, null, 'p2'), " + + " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), CAST('dcb1' AS VARBINARY), 'p2'), " + + " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), CAST('dcb2' AS VARBINARY), 'p2') " + + ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar)", tableName), 8); + + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); + + // non existing partition + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 0E0, 0E0, null, null, null), " + + "('c_bigint', null, 0E0, 0E0, null, null, null), " + + "('c_double', null, 0E0, 0E0, null, null, null), " + + "('c_timestamp', null, 0E0, 0E0, null, null, null), " + + "('c_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "('c_varbinary', null, 0E0, 0E0, null, null, null), " + + "('p_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "(null, null, null, null, 0E0, null, null)"); + + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testCollectColumnStatisticsOnInsert() + { + String tableName = "test_collect_column_statistics_on_insert"; + assertUpdate(format("" + + "CREATE TABLE %s ( " + + " c_boolean BOOLEAN, " + + " c_bigint BIGINT, " + + " c_double DOUBLE, " + + " c_timestamp TIMESTAMP, " + + " c_varchar VARCHAR, " + + " c_varbinary VARBINARY, " + + " p_varchar VARCHAR " + + ") " + + "WITH ( " + + " partitioned_by = ARRAY['p_varchar'] " + + ")", tableName)); + + assertUpdate(format("" + + "INSERT INTO %s " + 
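+ // inserts the same rows as testCollectColumnStatisticsOnCreateTable; the per-partition statistics are expected to match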
+ "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar " + + "FROM ( " + + " VALUES " + + " (null, null, null, null, null, null, 'p1'), " + + " (null, null, null, null, null, null, 'p1'), " + + " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), CAST('bcd1' AS VARBINARY), 'p1')," + + " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), CAST('bcd2' AS VARBINARY), 'p1')," + + " (null, null, null, null, null, null, 'p2'), " + + " (null, null, null, null, null, null, 'p2'), " + + " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), CAST('dcb1' AS VARBINARY), 'p2'), " + + " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), CAST('dcb2' AS VARBINARY), 'p2') " + + ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar)", tableName), 8); + + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '0', '1'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2'), " + + "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null), " + + "('c_varchar', 8.0E0, 2.0E0, 0.5E0, null, null, null), " + + "('c_varbinary', 8.0E0, null, 0.5E0, null, null, null), " + + "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 4.0E0, null, null)"); + + // non existing partition + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 0E0, 0E0, null, null, null), " + + "('c_bigint', null, 0E0, 0E0, null, null, null), " + + "('c_double', null, 0E0, 0E0, null, null, null), " + + "('c_timestamp', null, 0E0, 0E0, null, null, null), " + + "('c_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "('c_varbinary', null, 0E0, 0E0, null, null, null), " + + "('p_varchar', 0E0, 0E0, 0E0, null, null, null), " + + "(null, null, null, null, 0E0, null, null)"); + + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testAnalyzePropertiesSystemTable() + { + assertQuery( + "SELECT * FROM system.metadata.analyze_properties WHERE catalog_name = 'hive'", + "SELECT 'hive', 'partitions', '', 'array(array(varchar))', 'Partitions to be analyzed'"); + } + + @Test + public void testAnalyzeEmptyTable() + { + String tableName = "test_analyze_empty_table"; + assertUpdate(format("CREATE TABLE %s (c_bigint BIGINT, c_varchar VARCHAR(2))", tableName)); + assertUpdate("ANALYZE " + tableName, 0); + } + + @Test + public void testInvalidAnalyzePartitionedTable() + { + String tableName = "test_invalid_analyze_partitioned_table"; + + // Test table does not exist + assertQueryFails("ANALYZE " + tableName, format(".*Table 'hive.tpch.%s' does not exist.*", 
tableName)); + + createPartitionedTableForAnalyzeTest(tableName); + + // Test invalid property + assertQueryFails(format("ANALYZE %s WITH (error = 1)", tableName), ".*'hive' does not support analyze property 'error'.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = 1)", tableName), ".*\\QCannot convert [1] to array(array(varchar))\\E.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = NULL)", tableName), ".*Invalid null value for analyze property.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[NULL])", tableName), ".*Invalid null value in analyze partitions property.*"); + + // Test non-existed partition + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p4', '10']])", tableName), ".*Partition no longer exists.*"); + + // Test partition schema mismatch + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p4']])", tableName), "Partition value count does not match partition column count"); + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p4', '10', 'error']])", tableName), "Partition value count does not match partition column count"); + + // Drop the partitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testInvalidAnalyzeUnpartitionedTable() + { + String tableName = "test_invalid_analyze_unpartitioned_table"; + + // Test table does not exist + assertQueryFails("ANALYZE " + tableName, ".*Table.*does not exist.*"); + + createUnpartitionedTableForAnalyzeTest(tableName); + + // Test partition properties on unpartitioned table + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[])", tableName), "Partition list provided but table is not partitioned"); + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p1']])", tableName), "Partition list provided but table is not partitioned"); + + // Drop the partitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testAnalyzePartitionedTable() + { + String tableName = "test_analyze_partitioned_table"; + createPartitionedTableForAnalyzeTest(tableName); + + // No column stats before running analyze + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); + + // No column stats after running an empty analyze + assertUpdate(format("ANALYZE %s WITH (partitions = ARRAY[])", tableName), 0); + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); + + // Run analyze on 3 partitions including a null partition and a 
duplicate partition + assertUpdate(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p1', '7'], ARRAY['p2', '7'], ARRAY['p2', '7'], ARRAY[NULL, NULL]])", tableName), 12); + + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '0', '1'), " + + "('c_double', null, 2.0, 0.5, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '1', '2'), " + + "('c_double', null, 2.0, 0.5, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar IS NULL AND p_bigint IS NULL)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 1.0, 0.0, null, null, null), " + + "('c_bigint', null, 4.0, 0.0, null, '4', '7'), " + + "('c_double', null, 4.0, 0.0, null, '4.7', '7.7'), " + + "('c_timestamp', null, 4.0, 0.0, null, null, null), " + + "('c_varchar', 16.0, 4.0, 0.0, null, null, null), " + + "('c_varbinary', 8.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 1.0, null, null, null), " + + "('p_bigint', null, 0.0, 1.0, null, null, null), " + + "(null, null, null, null, 4.0, null, null)"); + + // Partition [p3, 8], [e1, 9], [e2, 9] have no column stats + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3' AND p_bigint = 8)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '8', '8'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e1' AND p_bigint = 9)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e2' AND p_bigint = 9)", tableName), + "SELECT * 
FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + + // Run analyze on the whole table + assertUpdate("ANALYZE " + tableName, 16); + + // All partitions except empty partitions have column stats + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '0', '1'), " + + "('c_double', null, 2.0, 0.5, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '1', '2'), " + + "('c_double', null, 2.0, 0.5, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar IS NULL AND p_bigint IS NULL)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 1.0, 0.0, null, null, null), " + + "('c_bigint', null, 4.0, 0.0, null, '4', '7'), " + + "('c_double', null, 4.0, 0.0, null, '4.7', '7.7'), " + + "('c_timestamp', null, 4.0, 0.0, null, null, null), " + + "('c_varchar', 16.0, 4.0, 0.0, null, null, null), " + + "('c_varbinary', 8.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 1.0, null, null, null), " + + "('p_bigint', null, 0.0, 1.0, null, null, null), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3' AND p_bigint = 8)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '2', '3'), " + + "('c_double', null, 2.0, 0.5, null, '3.4', '4.4'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '8', '8'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e1' AND p_bigint = 9)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 0.0, 0.0, null, null, null), " + + "('c_bigint', null, 0.0, 0.0, null, null, null), " + + "('c_double', null, 0.0, 0.0, null, null, null), " + + "('c_timestamp', null, 0.0, 0.0, null, null, 
null), " + + "('c_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('c_varbinary', 0.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e2' AND p_bigint = 9)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 0.0, 0.0, null, null, null), " + + "('c_bigint', null, 0.0, 0.0, null, null, null), " + + "('c_double', null, 0.0, 0.0, null, null, null), " + + "('c_timestamp', null, 0.0, 0.0, null, null, null), " + + "('c_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('c_varbinary', 0.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + + // Drop the partitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testAnalyzeUnpartitionedTable() + { + String tableName = "test_analyze_unpartitioned_table"; + createUnpartitionedTableForAnalyzeTest(tableName); + + // No column stats before running analyze + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', null, null, null, null, null, null), " + + "('p_bigint', null, null, null, null, null, null), " + + "(null, null, null, null, null, null, null)"); + + // Run analyze on the whole table + assertUpdate("ANALYZE " + tableName, 16); + + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.375, null, null, null), " + + "('c_bigint', null, 8.0, 0.375, null, '0', '7'), " + + "('c_double', null, 10.0, 0.375, null, '1.2', '7.7'), " + + "('c_timestamp', null, 10.0, 0.375, null, null, null), " + + "('c_varchar', 40.0, 10.0, 0.375, null, null, null), " + + "('c_varbinary', 20.0, null, 0.375, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); + + // Drop the unpartitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + protected void createPartitionedTableForAnalyzeTest(String tableName) + { + createTableForAnalyzeTest(tableName, true); + } + + protected void createUnpartitionedTableForAnalyzeTest(String tableName) + { + createTableForAnalyzeTest(tableName, false); + } + + private void createTableForAnalyzeTest(String tableName, boolean partitioned) + { + Session defaultSession = getSession(); + + // Disable column statistics collection when creating the table + Session disableColumnStatsSession = Session.builder(defaultSession) + .setCatalogSessionProperty(defaultSession.getCatalog().get(), "collect_column_statistics_on_write", "false") + .build(); + + assertUpdate( + disableColumnStatsSession, + "" + + "CREATE TABLE " + + tableName + + (partitioned ? 
" WITH (partitioned_by = ARRAY['p_varchar', 'p_bigint'])\n" : " ") + + "AS " + + "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar, p_bigint " + + "FROM ( " + + " VALUES " + + // p_varchar = 'p1', p_bigint = BIGINT '7' + " (null, null, null, null, null, null, 'p1', BIGINT '7'), " + + " (null, null, null, null, null, null, 'p1', BIGINT '7'), " + + " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', 'abc1', X'bcd1', 'p1', BIGINT '7'), " + + " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', 'abc2', X'bcd2', 'p1', BIGINT '7'), " + + // p_varchar = 'p2', p_bigint = BIGINT '7' + " (null, null, null, null, null, null, 'p2', BIGINT '7'), " + + " (null, null, null, null, null, null, 'p2', BIGINT '7'), " + + " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', 'cba1', X'dcb1', 'p2', BIGINT '7'), " + + " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', 'cba2', X'dcb2', 'p2', BIGINT '7'), " + + // p_varchar = 'p3', p_bigint = BIGINT '8' + " (null, null, null, null, null, null, 'p3', BIGINT '8'), " + + " (null, null, null, null, null, null, 'p3', BIGINT '8'), " + + " (true, BIGINT '3', DOUBLE '4.4', TIMESTAMP '2012-10-10 01:00', 'bca1', X'cdb1', 'p3', BIGINT '8'), " + + " (false, BIGINT '2', DOUBLE '3.4', TIMESTAMP '2012-10-10 00:00', 'bca2', X'cdb2', 'p3', BIGINT '8'), " + + // p_varchar = NULL, p_bigint = NULL + " (false, BIGINT '7', DOUBLE '7.7', TIMESTAMP '1977-07-07 07:07', 'efa1', X'efa1', NULL, NULL), " + + " (false, BIGINT '6', DOUBLE '6.7', TIMESTAMP '1977-07-07 07:06', 'efa2', X'efa2', NULL, NULL), " + + " (false, BIGINT '5', DOUBLE '5.7', TIMESTAMP '1977-07-07 07:05', 'efa3', X'efa3', NULL, NULL), " + + " (false, BIGINT '4', DOUBLE '4.7', TIMESTAMP '1977-07-07 07:04', 'efa4', X'efa4', NULL, NULL) " + + ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar, p_bigint)", 16); + + if (partitioned) { + // Create empty partitions + assertUpdate(disableColumnStatsSession, String.format("CALL system.create_empty_partition('%s', '%s', ARRAY['p_varchar', 'p_bigint'], ARRAY['%s', '%s'])", TPCH_SCHEMA, tableName, "e1", "9")); + assertUpdate(disableColumnStatsSession, String.format("CALL system.create_empty_partition('%s', '%s', ARRAY['p_varchar', 'p_bigint'], ARRAY['%s', '%s'])", TPCH_SCHEMA, tableName, "e2", "9")); + } + } + + @Test + public void testInsertMultipleColumnsFromSameChannel() + { + String tableName = "test_insert_multiple_columns_same_channel"; + assertUpdate(format("" + + "CREATE TABLE %s ( " + + " c_bigint_1 BIGINT, " + + " c_bigint_2 BIGINT, " + + " p_varchar_1 VARCHAR, " + + " p_varchar_2 VARCHAR " + + ") " + + "WITH ( " + + " partitioned_by = ARRAY['p_varchar_1', 'p_varchar_2'] " + + ")", tableName)); + + assertUpdate(format("" + + "INSERT INTO %s " + + "SELECT 1 c_bigint_1, 1 c_bigint_2, '2' p_varchar_1, '2' p_varchar_2 ", tableName), 1); + + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar_1 = '2' AND p_varchar_2 = '2')", tableName), + "SELECT * FROM VALUES " + + "('c_bigint_1', null, 1.0E0, 0.0E0, null, '1', '1'), " + + "('c_bigint_2', null, 1.0E0, 0.0E0, null, '1', '1'), " + + "('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 1.0E0, null, null)"); + + assertUpdate(format("" + + "INSERT INTO %s (c_bigint_1, c_bigint_2, p_varchar_1, p_varchar_2) " + + "SELECT orderkey, orderkey, orderstatus, orderstatus " + + "FROM orders " + + "WHERE 
orderstatus='O' AND orderkey = 15008", tableName), 1); + + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar_1 = 'O' AND p_varchar_2 = 'O')", tableName), + "SELECT * FROM VALUES " + + "('c_bigint_1', null, 1.0E0, 0.0E0, null, '15008', '15008'), " + + "('c_bigint_2', null, 1.0E0, 0.0E0, null, '15008', '15008'), " + + "('p_varchar_1', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "('p_varchar_2', 1.0E0, 1.0E0, 0.0E0, null, null, null), " + + "(null, null, null, null, 1.0E0, null, null)"); + + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testCreateAvroTableWithSchemaUrl() + throws Exception + { + String tableName = "test_create_avro_table_with_schema_url"; + File schemaFile = createAvroSchemaFile(); + + String createTableSql = getAvroCreateTableSql(tableName, schemaFile.getAbsolutePath()); + String expectedShowCreateTable = getAvroCreateTableSql(tableName, schemaFile.toURI().toString()); + + assertUpdate(createTableSql); + + try { + MaterializedResult actual = computeActual(format("SHOW CREATE TABLE %s", tableName)); + assertShowCreateTableOutput(actual.getOnlyValue(), expectedShowCreateTable); + } + finally { + assertUpdate(format("DROP TABLE %s", tableName)); + verify(schemaFile.delete(), "cannot delete temporary file: %s", schemaFile); + } + } + + @Test + public void testAlterAvroTableWithSchemaUrl() + throws Exception + { + testAlterAvroTableWithSchemaUrl(true, true, true); + } + + protected void testAlterAvroTableWithSchemaUrl(boolean renameColumn, boolean addColumn, boolean dropColumn) + throws Exception + { + String tableName = "test_alter_avro_table_with_schema_url"; + File schemaFile = createAvroSchemaFile(); + + assertUpdate(getAvroCreateTableSql(tableName, schemaFile.getAbsolutePath())); + + try { + if (renameColumn) { + assertQueryFails(format("ALTER TABLE %s RENAME COLUMN dummy_col TO new_dummy_col", tableName), "ALTER TABLE not supported when Avro schema url is set"); + } + if (addColumn) { + assertQueryFails(format("ALTER TABLE %s ADD COLUMN new_dummy_col VARCHAR", tableName), "ALTER TABLE not supported when Avro schema url is set"); + } + if (dropColumn) { + assertQueryFails(format("ALTER TABLE %s DROP COLUMN dummy_col", tableName), "ALTER TABLE not supported when Avro schema url is set"); + } + } + finally { + assertUpdate(format("DROP TABLE %s", tableName)); + verify(schemaFile.delete(), "cannot delete temporary file: %s", schemaFile); + } + } + + private String getAvroCreateTableSql(String tableName, String schemaFile) + { + return format("CREATE TABLE %s.%s.%s (\n" + + " dummy_col varchar,\n" + + " another_dummy_col varchar\n" + + ")\n" + + "WITH (\n" + + " avro_schema_url = '%s',\n" + + " format = 'AVRO'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get(), + tableName, + schemaFile); + } + + private static File createAvroSchemaFile() + throws Exception + { + File schemaFile = File.createTempFile("avro_single_column-", ".avsc"); + String schema = "{\n" + + " \"namespace\": \"io.prestosql.test\",\n" + + " \"name\": \"single_column\",\n" + + " \"type\": \"record\",\n" + + " \"fields\": [\n" + + " { \"name\":\"string_col\", \"type\":\"string\" }\n" + + "]}"; + asCharSink(schemaFile, UTF_8).write(schema); + return schemaFile; + } + + @Test + public void testCreateOrcTableWithSchemaUrl() + throws Exception + { + @Language("SQL") String createTableSql = format("" + + "CREATE TABLE %s.%s.test_orc (\n" + + " dummy_col varchar\n" + + ")\n" + + "WITH (\n" + + " avro_schema_url = 'dummy.avsc',\n" + + " 
format = 'ORC'\n" + + ")", + getSession().getCatalog().get(), + getSession().getSchema().get()); + + assertQueryFails(createTableSql, "Cannot specify avro_schema_url table property for storage format: ORC"); + } + + @Test + public void testCtasFailsWithAvroSchemaUrl() + throws Exception + { + @Language("SQL") String ctasSqlWithoutData = "CREATE TABLE create_avro\n" + + "WITH (avro_schema_url = 'dummy_schema')\n" + + "AS SELECT 'dummy_value' as dummy_col WITH NO DATA"; + + assertQueryFails(ctasSqlWithoutData, "CREATE TABLE AS not supported when Avro schema url is set"); + + @Language("SQL") String ctasSql = "CREATE TABLE create_avro\n" + + "WITH (avro_schema_url = 'dummy_schema')\n" + + "AS SELECT * FROM (VALUES('a')) t (a)"; + + assertQueryFails(ctasSql, "CREATE TABLE AS not supported when Avro schema url is set"); + } + + @Test + public void testBucketedTablesFailWithAvroSchemaUrl() + throws Exception + { + @Language("SQL") String createSql = "CREATE TABLE create_avro (dummy VARCHAR)\n" + + "WITH (avro_schema_url = 'dummy_schema',\n" + + " bucket_count = 2, bucketed_by=ARRAY['dummy'])"; + + assertQueryFails(createSql, "Bucketing/Partitioning columns not supported when Avro schema url is set"); + } + + @Test + public void testPartitionedTablesFailWithAvroSchemaUrl() + throws Exception + { + @Language("SQL") String createSql = "CREATE TABLE create_avro (dummy VARCHAR)\n" + + "WITH (avro_schema_url = 'dummy_schema',\n" + + " partitioned_by=ARRAY['dummy'])"; + + assertQueryFails(createSql, "Bucketing/Partitioning columns not supported when Avro schema url is set"); + } + + @Test + public void testPrunePartitionFailure() + { + assertUpdate("CREATE TABLE test_prune_failure\n" + + "WITH (partitioned_by = ARRAY['p']) AS\n" + + "SELECT 123 x, 'abc' p", 1); + + assertQueryReturnsEmptyResult("" + + "SELECT * FROM test_prune_failure\n" + + "WHERE x < 0 AND cast(p AS int) > 0"); + + assertUpdate("DROP TABLE test_prune_failure"); + } + + @Test + public void testTemporaryStagingDirectorySessionProperties() + { + String tableName = "test_temporary_staging_directory_session_properties"; + assertUpdate(format("CREATE TABLE %s(i int)", tableName)); + + Session session = Session.builder(getSession()) + .setCatalogSessionProperty("hive", "temporary_staging_directory_enabled", "false") + .build(); + + HiveInsertTableHandle hiveInsertTableHandle = getHiveInsertTableHandle(session, tableName); + assertEquals(hiveInsertTableHandle.getLocationHandle().getWritePath(), hiveInsertTableHandle.getLocationHandle().getTargetPath()); + + session = Session.builder(getSession()) + .setCatalogSessionProperty("hive", "temporary_staging_directory_enabled", "true") + .build(); + + hiveInsertTableHandle = getHiveInsertTableHandle(session, tableName); + assertNotEquals(hiveInsertTableHandle.getLocationHandle().getWritePath(), hiveInsertTableHandle.getLocationHandle().getTargetPath()); + // Since staging directory is getting created inside table path + assertTrue(hiveInsertTableHandle.getLocationHandle().getWritePath().toString().startsWith(hiveInsertTableHandle.getLocationHandle().getTargetPath().toString())); + + assertUpdate("DROP TABLE " + tableName); + } + + private HiveInsertTableHandle getHiveInsertTableHandle(Session session, String tableName) + { + getQueryRunner().getMetadata().cleanupQuery(session); + + Metadata metadata = ((DistributedQueryRunner) getQueryRunner()).getCoordinator().getMetadata(); + return transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getAccessControl()) + .execute(session, 
transactionSession -> { + QualifiedObjectName objectName = new QualifiedObjectName(catalog, TPCH_SCHEMA, tableName); + Optional handle = metadata.getTableHandle(transactionSession, objectName); + InsertTableHandle insertTableHandle = metadata.beginInsert(transactionSession, handle.get(), false); + HiveInsertTableHandle hiveInsertTableHandle = (HiveInsertTableHandle) insertTableHandle.getConnectorHandle(); + + metadata.finishInsert(transactionSession, insertTableHandle, ImmutableList.of(), ImmutableList.of()); + return hiveInsertTableHandle; + }); + } + + @Test + public void testSelectWithNoColumns() + { + testWithAllStorageFormats(this::testSelectWithNoColumns); + } + + private void testSelectWithNoColumns(Session session, HiveStorageFormat storageFormat) + { + String tableName = "test_select_with_no_columns"; + @Language("SQL") String createTable = format( + "CREATE TABLE %s (col0) WITH (format = '%s') AS VALUES 5, 6, 7", + tableName, + storageFormat); + assertUpdate(session, createTable, 3); + assertTrue(getQueryRunner().tableExists(getSession(), tableName)); + + assertQuery("SELECT 1 FROM " + tableName, "VALUES 1, 1, 1"); + assertQuery("SELECT count(*) FROM " + tableName, "SELECT 3"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testUnsupportedCsvTable() + { + assertQueryFails( + "CREATE TABLE create_unsupported_csv(i INT, bound VARCHAR(10), unbound VARCHAR, dummy VARCHAR) WITH (format = 'CSV')", + "\\QHive CSV storage format only supports VARCHAR (unbounded). Unsupported columns: i integer, bound varchar(10)\\E"); + } + + private Session getParallelWriteSession() + { + return Session.builder(getSession()) + .setSystemProperty("task_writer_count", "4") + .build(); + } + + private void assertOneNotNullResult(@Language("SQL") String query) + { + MaterializedResult results = getQueryRunner().execute(getSession(), query).toTestTypes(); + assertEquals(results.getRowCount(), 1); + assertEquals(results.getMaterializedRows().get(0).getFieldCount(), 1); + assertNotNull(results.getMaterializedRows().get(0).getField(0)); + } + + private Type canonicalizeType(Type type) + { + HiveType hiveType = HiveType.toHiveType(typeTranslator, type); + return TYPE_MANAGER.getType(hiveType.getTypeSignature()); + } + + private String canonicalizeTypeName(String type) + { + TypeSignature typeSignature = TypeSignature.parseTypeSignature(type); + return canonicalizeType(TYPE_MANAGER.getType(typeSignature)).toString(); + } + + private void assertColumnType(TableMetadata tableMetadata, String columnName, Type expectedType) + { + assertEquals(tableMetadata.getColumn(columnName).getType(), canonicalizeType(expectedType)); + } + + private void assertConstraints(@Language("SQL") String query, Set expected) + { + MaterializedResult result = computeActual("EXPLAIN (TYPE IO, FORMAT JSON) " + query); + Set constraints = jsonCodec(IoPlan.class).fromJson((String) getOnlyElement(result.getOnlyColumnAsSet())) + .getInputTableColumnInfos().stream() + .findFirst().get() + .getColumnConstraints(); + + assertEquals(constraints, expected); + } + + private void verifyPartition(boolean hasPartition, TableMetadata tableMetadata, List partitionKeys) + { + Object partitionByProperty = tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY); + if (hasPartition) { + assertEquals(partitionByProperty, partitionKeys); + for (ColumnMetadata columnMetadata : tableMetadata.getColumns()) { + boolean partitionKey = partitionKeys.contains(columnMetadata.getName()); + 
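+                // the extra info reported for each column should reflect whether it is a partition key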
assertEquals(columnMetadata.getExtraInfo(), columnExtraInfo(partitionKey)); + } + } + else { + assertNull(partitionByProperty); + } + } + + private void rollback() + { + throw new RollbackException(); + } + + private static class RollbackException + extends RuntimeException + { + } + + private static ConnectorSession getConnectorSession(Session session) + { + return session.toConnectorSession(new CatalogName(session.getCatalog().get())); + } + + @Test + public void testEmptyBucketedTable() + { + // go through all storage formats to make sure the empty buckets are correctly created + testWithAllStorageFormats(this::testEmptyBucketedTable); + } + + private void testEmptyBucketedTable(Session session, HiveStorageFormat storageFormat) + { + testEmptyBucketedTable(session, storageFormat, true); + testEmptyBucketedTable(session, storageFormat, false); + } + + private void testEmptyBucketedTable(Session session, HiveStorageFormat storageFormat, boolean createEmpty) + { + String tableName = "test_empty_bucketed_table"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "(bucket_key VARCHAR, col_1 VARCHAR, col2 VARCHAR) " + + "WITH (" + + "format = '" + storageFormat + "', " + + "bucketed_by = ARRAY[ 'bucket_key' ], " + + "bucket_count = 11 " + + ") "; + + assertUpdate(createTable); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + assertNull(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY)); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("bucket_key")); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11); + + assertEquals(computeActual("SELECT * from " + tableName).getRowCount(), 0); + + // make sure that we will get one file per bucket regardless of writer count configured + Session parallelWriter = Session.builder(getParallelWriteSession()) + .setCatalogSessionProperty(catalog, "create_empty_bucket_files", String.valueOf(createEmpty)) + .build(); + assertUpdate(parallelWriter, "INSERT INTO " + tableName + " VALUES ('a0', 'b0', 'c0')", 1); + assertUpdate(parallelWriter, "INSERT INTO " + tableName + " VALUES ('a1', 'b1', 'c1')", 1); + + assertQuery("SELECT * from " + tableName, "VALUES ('a0', 'b0', 'c0'), ('a1', 'b1', 'c1')"); + + assertUpdate(session, "DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + @Test + public void testBucketedTable() + { + // go through all storage formats to make sure the empty buckets are correctly created + testWithAllStorageFormats(this::testBucketedTable); + } + + private void testBucketedTable(Session session, HiveStorageFormat storageFormat) + { + testBucketedTable(session, storageFormat, true); + testBucketedTable(session, storageFormat, false); + } + + private void testBucketedTable(Session session, HiveStorageFormat storageFormat, boolean createEmpty) + { + String tableName = "test_bucketed_table"; + + @Language("SQL") String createTable = "" + + "CREATE TABLE " + tableName + " " + + "WITH (" + + "format = '" + storageFormat + "', " + + "bucketed_by = ARRAY[ 'bucket_key' ], " + + "bucket_count = 11 " + + ") " + + "AS " + + "SELECT * " + + "FROM (" + + "VALUES " + + " (VARCHAR 'a', VARCHAR 'b', VARCHAR 'c'), " + + " ('aa', 'bb', 'cc'), " + + " ('aaa', 'bbb', 'ccc')" + + ") t (bucket_key, col_1, col_2)"; + + // make sure 
that we will get one file per bucket regardless of writer count configured + Session parallelWriter = Session.builder(getParallelWriteSession()) + .setCatalogSessionProperty(catalog, "create_empty_bucket_files", String.valueOf(createEmpty)) + .build(); + assertUpdate(parallelWriter, createTable, 3); + + TableMetadata tableMetadata = getTableMetadata(catalog, TPCH_SCHEMA, tableName); + assertEquals(tableMetadata.getMetadata().getProperties().get(STORAGE_FORMAT_PROPERTY), storageFormat); + + assertNull(tableMetadata.getMetadata().getProperties().get(PARTITIONED_BY_PROPERTY)); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKETED_BY_PROPERTY), ImmutableList.of("bucket_key")); + assertEquals(tableMetadata.getMetadata().getProperties().get(BUCKET_COUNT_PROPERTY), 11); + + assertQuery("SELECT * from " + tableName, "VALUES ('a', 'b', 'c'), ('aa', 'bb', 'cc'), ('aaa', 'bbb', 'ccc')"); + + assertUpdate(parallelWriter, "INSERT INTO " + tableName + " VALUES ('a0', 'b0', 'c0')", 1); + assertUpdate(parallelWriter, "INSERT INTO " + tableName + " VALUES ('a1', 'b1', 'c1')", 1); + + assertQuery("SELECT * from " + tableName, "VALUES ('a', 'b', 'c'), ('aa', 'bb', 'cc'), ('aaa', 'bbb', 'ccc'), ('a0', 'b0', 'c0'), ('a1', 'b1', 'c1')"); + + assertUpdate(session, "DROP TABLE " + tableName); + assertFalse(getQueryRunner().tableExists(session, tableName)); + } + + @Test + public void testInsertTwiceToSamePartitionedBucket() + { + String tableName = "test_insert_twice_to_same_partitioned_bucket"; + createPartitionedBucketedTable(tableName, HiveStorageFormat.RCBINARY); + + String insert = "INSERT INTO " + tableName + + " VALUES (1, 1, 'first_comment', 'F'), (2, 2, 'second_comment', 'G')"; + assertUpdate(insert, 2); + assertUpdate(insert, 2); + + assertQuery( + "SELECT custkey, custkey2, comment, orderstatus FROM " + tableName + " ORDER BY custkey", + "VALUES (1, 1, 'first_comment', 'F'), (1, 1, 'first_comment', 'F'), (2, 2, 'second_comment', 'G'), (2, 2, 'second_comment', 'G')"); + assertQuery( + "SELECT custkey, custkey2, comment, orderstatus FROM " + tableName + " WHERE custkey = 1 and custkey2 = 1", + "VALUES (1, 1, 'first_comment', 'F'), (1, 1, 'first_comment', 'F')"); + assertUpdate("DROP TABLE " + tableName); + } + + private void testWithAllStorageFormats(BiConsumer test) + { + for (TestingHiveStorageFormat storageFormat : getAllTestingHiveStorageFormat()) { + testWithStorageFormat(storageFormat, test); + } + } + + private static void testWithStorageFormat(TestingHiveStorageFormat storageFormat, BiConsumer test) + { + requireNonNull(storageFormat, "storageFormat is null"); + requireNonNull(test, "test is null"); + Session session = storageFormat.getSession(); + try { + test.accept(session, storageFormat.getFormat()); + } + catch (Exception | AssertionError e) { + fail(format("Failure for format %s with properties %s", storageFormat.getFormat(), session.getConnectorProperties()), e); + } + } + + private List getAllTestingHiveStorageFormat() + { + Session session = getSession(); + ImmutableList.Builder formats = ImmutableList.builder(); + for (HiveStorageFormat hiveStorageFormat : HiveStorageFormat.values()) { + if (hiveStorageFormat == HiveStorageFormat.CSV) { + // CSV supports only unbounded VARCHAR type + continue; + } + formats.add(new TestingHiveStorageFormat(session, hiveStorageFormat)); + } + return formats.build(); + } + + /* + * Expected output is the original CREATE TABLE query + * While actual output has additional data like location and external table properties + * Verify that all 
the lines of expected result are present in actual result + */ + private void assertShowCreateTableOutput(Object actual, String expected) + { + List expectedLines = Stream.of( + expected.split("\n")) + .map(line -> line.lastIndexOf(',') == (line.length() - 1) ? line.substring(0, line.length() - 1) : line) + .collect(Collectors.toList()); + + List absentLines = expectedLines.stream().filter(line -> !actual.toString().contains(line)).collect(Collectors.toList()); + + assertTrue(absentLines.isEmpty(), format("Expected %s\nFound %s\nMissing lines in output %s", expected, actual, absentLines)); + } + + private static class TestingHiveStorageFormat + { + private final Session session; + private final HiveStorageFormat format; + + TestingHiveStorageFormat(Session session, HiveStorageFormat format) + { + this.session = requireNonNull(session, "session is null"); + this.format = requireNonNull(format, "format is null"); + } + + public Session getSession() + { + return session; + } + + public HiveStorageFormat getFormat() + { + return format; + } + } + + @Test + public void testAutoVacuum() + { + assertUpdate(autoVacuumSession, "CREATE TABLE auto_vacuum_test_table1 (a int) with (format='orc', transactional=true)"); + + TableMetadata tableMetadata = getTableMetadata(autoVacuumSession.getCatalog().get(), autoVacuumSession.getSchema().get(), + "auto_vacuum_test_table1"); + + for (int i = 0; i <= 10; i++) { + String query = format("INSERT INTO auto_vacuum_test_table1 VALUES(%d), (%d)", i, i * 2); + assertUpdate(autoVacuumSession, query, 2); + } + + String tablePath = String.valueOf(tableMetadata.getMetadata().getProperties().get("location")); + + checkBaseDirectoryExists(tablePath, true); + + assertUpdate(autoVacuumSession, "DROP TABLE auto_vacuum_test_table1"); + } + + @Test + public void testAutoVacuumOnPartitionTable() + { + assertUpdate(autoVacuumSession, "CREATE TABLE auto_vacuum_test_table2 (a int, b int)" + + " with (format='orc', transactional=true, partitioned_by=Array['b'])"); + + TableMetadata tableMetadata = getTableMetadata(autoVacuumSession.getCatalog().get(), autoVacuumSession.getSchema().get(), + "auto_vacuum_test_table2"); + + for (int i = 0; i <= 10; i++) { + String query = format("INSERT INTO auto_vacuum_test_table2 VALUES(%d, 1), (%d, 2)", i, i * 2); + assertUpdate(autoVacuumSession, query, 2); + } + + String tablePath = String.valueOf(tableMetadata.getMetadata().getProperties().get("location")); + + checkBaseDirectoryExists(tablePath + "/b=1", true); + checkBaseDirectoryExists(tablePath + "/b=2", false); + + assertUpdate(autoVacuumSession, "DROP TABLE auto_vacuum_test_table2"); + } + + private void checkBaseDirectoryExists(String path, boolean delayRequired) + { + try { + // Since auto-vacuum runs asynchronously + if (delayRequired) { + TimeUnit.SECONDS.sleep(80); + } + } + catch (InterruptedException e) { + // Ignore + } + if (path.startsWith("file:")) { + path = path.replace("file:", ""); + } + String[] actualDirectoryList = new File(path).list(new FilenameFilter() + { + @Override + public boolean accept(File file, String s) + { + return s.startsWith("base"); + } + }); + + assertEquals(actualDirectoryList.length, 1); + } + + @Test + public void testUnsupportedFunctions() + { + testWithAllStorageFormats(this::testUnsupportedFunctions); + } + + private void testUnsupportedFunctions(Session session, HiveStorageFormat storageFormat) + { + Session session1 = Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + 
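+                // session1 only enables ORC predicate pushdown; session2 below additionally disables disjunct (OR) predicate pushdown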
.build(); + Session session2 = Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .setCatalogSessionProperty(session.getCatalog().get(), "orc_disjunct_predicate_pushdown_enabled", "false") + .build(); + try { + assertUpdate(session1, "CREATE TABLE test_unsupport (a int, b int) with (transactional=true, format='orc')"); + assertUpdate(session1, "INSERT INTO test_unsupport VALUES (1, 2),(11,22)", 2); + assertEquals(computeActual(session1, "SELECT * from test_unsupport").getRowCount(), 02L); + assertUpdate(session1, "UPDATE test_unsupport set a=111 where a=1", 01L); + assertUpdate(session1, "DELETE from test_unsupport where a=111", 01L); + + assertUpdate(session1, "CREATE TABLE map_test with (format='orc') AS SELECT MAP(ARRAY[0,1], ARRAY[2,NULL]) AS col", 1); + assertQuery(session1, "SELECT col[0] FROM map_test", "SELECT 2"); + assertUpdate(session1, "CREATE TABLE array_test AS SELECT ARRAY[1, 2, NULL] AS col", 1); + assertQuery(session1, "SELECT col[2] FROM array_test", "SELECT 2"); + assertUpdate(session1, "CREATE TABLE alldtype (id1 int, id4 double, id5 float, id6 decimal(5,2), id7 varchar(10), id8 char(10)) with (format='orc')"); + assertUpdate(session1, "INSERT Into alldtype values(1,4.5,5.6,6.7,'rajeev','male')", 1); + assertQuery(session1, "select * from alldtype where id1=1", "SELECT 1,4.5,5.6,6.7,'rajeev','male'"); + assertQuery(session1, "select count(1) from alldtype where id4=4.5", "SELECT 1"); + assertUpdate(session1, "CREATE TABLE part2key (id1 int, id2 int, id3 int) with (format='orc', partitioned_by=ARRAY['id2','id3'])"); + assertUpdate(session1, "INSERT Into part2key values(1,2,3)", 1); + assertQuery(session1, "select * from part2key where id2=2 and id3=3", "SELECT 1,2,3"); + assertQuery(session2, "select * from part2key where id2=2 and id3=3", "SELECT 1,2,3"); + assertUpdate(session1, "CREATE TABLE multiin (id1 int, id2 int, id3 varchar(10)) with (format='orc')"); + assertUpdate(session1, "INSERT Into multiin values(1,2,'abc'), (11,22,'xyz'), (111,222,'abcd'), (1111,2222,'zxy')", 4); + assertQuery(session1, "select * from multiin where id3 in ('abc', 'xyz', 'abcd', 'zxy') order by id1", "SELECT * from (values(1,2,'abc'), (11,22,'xyz'), (111,222,'abcd'), (1111,2222,'zxy'))"); + assertQuery(session1, "select * from multiin where id3 in ('abc', 'yzx', 'adbc', 'abcde')", "SELECT 1,2,'abc'"); + assertUpdate(session1, "create table inperftest(id1 float, id2 double, id3 decimal(19,2), id4 int) with (format='orc')"); + assertUpdate(session1, "insert into inperftest values(1.2,2.2,3.2,4), (11.22, 22.22, 33.22, 44),(111.33,222.33, 333.33, 444)", 3); + assertQuery(session1, "select * from inperftest where id1 in (1.2,5.3,11.22, 111.33)", "select * from (values(1.2, 2.2 , 3.20, 4), (11.22, 22.22, 33.22, 44), (111.33,222.33, 333.33, 444))"); + assertQuery(session1, "select * from inperftest where id1 in (1.2,5.3)", "select * from (values(1.2, 2.2 , 3.20, 4))"); + assertQuery(session1, "select * from inperftest where id2 in (2.2,5.3,22.22, 222.33)", "select * from (values(1.2, 2.2 , 3.20, 4), (11.22, 22.22, 33.22, 44), (111.33,222.33, 333.33, 444))"); + assertQuery(session1, "select * from inperftest where id2 in (2.2,5.3)", "select * from (values(1.2, 2.2 , 3.20, 4))"); + assertQuery(session1, "select * from inperftest where id3 in (3.2,5.3,33.22, 333.331, 333.33)", "select * from (values(1.2, 2.2 , 3.20, 4), (11.22, 22.22, 33.22, 44), (111.33,222.33, 333.33, 444))"); + assertQuery(session1, "select * from 
inperftest where id3 in (3.2,5.3, 33.21)", "select * from (values(1.2, 2.2 , 3.20, 4))"); + assertQuery(session1, "select * from inperftest where id3 in (3.2,5.3, 33.221)", "select * from (values(1.2, 2.2 , 3.20, 4))"); + } + finally { + assertUpdate("DROP TABLE IF EXISTS test_unsupport"); + assertUpdate("DROP TABLE IF EXISTS part2key"); + assertUpdate("DROP TABLE IF EXISTS alldtype"); + assertUpdate("DROP TABLE IF EXISTS map_test"); + assertUpdate("DROP TABLE IF EXISTS array_test"); + assertUpdate("DROP TABLE IF EXISTS multiin"); + assertUpdate("DROP TABLE IF EXISTS inperftest"); + } + } + + @Test + public void testNonEqualDynamicFilter() + { + Session session = getSession(); + Session sessionTest = Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .setCatalogSessionProperty(session.getCatalog().get(), "dynamic_filtering_filter_rows_threshold", "20000") + .setSystemProperty("dynamic_filtering_max_per_driver_value_count", "100000") + .setSystemProperty("enable_dynamic_filtering", "true") + .setSystemProperty("optimize_dynamic_filter_generation", "false") + .build(); + + Session sessionCtrl = Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().get(), "orc_predicate_pushdown_enabled", "false") + .setSystemProperty("enable_dynamic_filtering", "false") + .build(); + + String query = "SELECT COUNT(*) FROM " + + "(SELECT orderkey FROM lineitem WHERE orderkey < 1000) a " + + "JOIN " + + "(SELECT orderkey FROM orders) b " + + "ON NOT (a.orderkey <= b.orderkey)"; + + MaterializedResult expected = computeActual(query); + MaterializedResult expected2 = computeActual(sessionCtrl, query); + MaterializedResult resultDynamicFilter = computeActual(sessionTest, query); + + assertEquals(expected.getMaterializedRows(), resultDynamicFilter.getMaterializedRows()); + assertEquals(expected2.getMaterializedRows(), resultDynamicFilter.getMaterializedRows()); + + System.out.println(">>>>>>>>> result " + resultDynamicFilter); + } + + @Test + public void testPushdownWithNullRows() + { + Session session = getSession(); + Session session1 = Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .build(); + Session session2 = Session.builder(session) + .setCatalogSessionProperty(session.getCatalog().get(), "orc_predicate_pushdown_enabled", "true") + .setCatalogSessionProperty(session.getCatalog().get(), "orc_disjunct_predicate_pushdown_enabled", "false") + .build(); + + String[] types = {"double", "decimal(7,2)", "decimal(38,7)", "integer", "bigint", "string", "boolean"}; + for (String type : types) { + testPushdownNullForType(session1, session2, type); + testPushdownGetAllNULLsForType(session1, session2, type); + } + } + + private void testPushdownNullForType(Session sessionWithOr, Session sessionWithoutOR, String type) + { + try { + assertUpdate(sessionWithOr, "CREATE TABLE test_predicate_or_NULL (a " + type + ", b " + type + ", c int) with (transactional=false, format='orc')"); + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL VALUES " + + "(cast(0 as " + type + "), cast(0 as " + type + "),0)," + + "(cast(1 as " + type + "), NULL, 1)," + + "(NULL,cast(2 as " + type + "), 2)," + + "(NULL,NULL,3)," + + "(cast(4 as " + type + "), cast(4 as " + type + "),4)", 5); + + List queries = new ArrayList<>(); + queries.add("SELECT * FROM test_predicate_or_NULL WHERE " + + "c BETWEEN 0 AND 5 AND (a BETWEEN cast(0 as " + type + ") AND cast(5 as " + 
type + ") or b BETWEEN cast(0 as " + type + ") AND cast(5 as " + type + ")) " + + "ORDER BY a,b,c"); + queries.add("SELECT * FROM test_predicate_or_NULL WHERE " + + "c BETWEEN 0 AND 5 " + + "AND (" + + "a BETWEEN cast(0 as " + type + ") and cast(5 as " + type + ") " + + "OR b BETWEEN cast(0 as " + type + ") and cast(5 as " + type + ") " + + "OR a IS NULL) " + + "ORDER BY a,b,c"); + queries.add("SELECT * FROM test_predicate_or_NULL WHERE " + + "c BETWEEN 0 AND 5 " + + "AND (" + + "a BETWEEN cast(0 as " + type + ") and cast(5 as " + type + ") " + + "OR b BETWEEN cast(0 as " + type + ") and cast(5 as " + type + ") " + + "OR a IS NULL " + + "OR b IS NULL" + + ") ORDER BY a,b,c"); + queries.add("SELECT * FROM test_predicate_or_NULL WHERE " + + "c BETWEEN 0 AND 5 " + + "AND (" + + "a BETWEEN cast(0 as " + type + ") and cast(5 as " + type + ") " + + "OR b BETWEEN cast(0 as " + type + ") and cast(1 as " + type + ") " + + "OR a IS NOT NULL " + + "OR a BETWEEN cast(3 as " + type + ") and cast(5 as " + type + ") " + + ") ORDER BY a,b,c"); + + MaterializedResult expected; + MaterializedResult resultPushdownOr; + MaterializedResult resultPushdown; + for (String query : queries) { + expected = computeActual(query); + resultPushdownOr = computeActual(sessionWithOr, query); + resultPushdown = computeActual(sessionWithoutOR, query); + + assertEquals(expected.getMaterializedRows(), resultPushdown.getMaterializedRows()); + assertEquals(expected.getMaterializedRows(), resultPushdownOr.getMaterializedRows()); + System.out.println("Type(" + type + ")\n-------------\n" + resultPushdown.getMaterializedRows()); + } + } + finally { + assertUpdate("DROP TABLE IF EXISTS test_predicate_or_NULL"); + } + } + + private void testPushdownGetAllNULLsForType(Session sessionWithOr, Session sessionWithoutOR, String type) + { + try { + assertUpdate(sessionWithOr, "CREATE TABLE test_predicate_or_NULL_tmp (a " + type + ", b " + type + ", c int) with (transactional=false, format='orc')"); + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp VALUES " + + "(cast(0 as " + type + "),NULL,0)," + + "(NULL,NULL,1)," + + "(cast(2 as " + type + "),NULL,2)," + + "(NULL,NULL,3)," + + "(cast(4 as " + type + "),NULL,4)," + + "(NULL,NULL,NULL)", + 6); + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 6); /* 12 rows */ + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 12); /* 24 rows */ + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 24); /* 48 rows */ + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 48); /* 96 rows */ + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 96); /* 192 rows */ + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 192); /* 384 rows */ + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 384); /* 768 rows */ + assertUpdate(sessionWithOr, "INSERT INTO test_predicate_or_NULL_tmp SELECT * from test_predicate_or_NULL_tmp", 768); /* 1536 rows */ + + assertUpdate(sessionWithOr, "CREATE TABLE test_predicate_or_NULL WITH (transactional=false, format='orc')" + + " AS SELECT * FROM test_predicate_or_NULL_tmp ORDER BY a, b, c", + 1536); + + List queries = new ArrayList<>(); + 
queries.add("SELECT a FROM test_predicate_or_NULL WHERE c = 2 ORDER BY 1"); + queries.add("SELECT a FROM test_predicate_or_NULL WHERE c >= 1 and c <= 3 ORDER BY 1"); + queries.add("SELECT a FROM test_predicate_or_NULL WHERE c IN (1,3,5) ORDER BY 1"); + queries.add("SELECT a FROM test_predicate_or_NULL WHERE c IN (0,2,4) ORDER BY 1"); + queries.add("SELECT a FROM test_predicate_or_NULL WHERE c IS NULL ORDER BY 1"); + + queries.add("SELECT a FROM test_predicate_or_NULL WHERE b IS NULL ORDER BY 1"); + queries.add("SELECT a,c FROM test_predicate_or_NULL WHERE b IS NULL ORDER BY 1,2"); + + queries.add("SELECT b FROM test_predicate_or_NULL WHERE c = 2 ORDER BY 1"); + queries.add("SELECT b FROM test_predicate_or_NULL WHERE c >= 1 and c <= 3 ORDER BY 1"); + queries.add("SELECT b FROM test_predicate_or_NULL WHERE c IN (1,3,5) ORDER BY 1"); + queries.add("SELECT B FROM test_predicate_or_NULL WHERE c IN (0,2,4) ORDER BY 1"); + queries.add("SELECT b FROM test_predicate_or_NULL WHERE c IS NULL ORDER BY 1"); + + queries.add("SELECT a,b FROM test_predicate_or_NULL WHERE c = 2 ORDER BY 1,2"); + queries.add("SELECT b,a FROM test_predicate_or_NULL WHERE c >= 1 and c <= 3 ORDER BY 1,2"); + queries.add("SELECT a,b FROM test_predicate_or_NULL WHERE c IN (1,3,5) ORDER BY 1,2"); + queries.add("SELECT b,a FROM test_predicate_or_NULL WHERE c IN (0,2,4) ORDER BY 1,2"); + queries.add("SELECT a,b,a FROM test_predicate_or_NULL WHERE c IS NULL ORDER BY 1,2"); + + MaterializedResult expected; + MaterializedResult resultPushdownOr; + MaterializedResult resultPushdown; + for (String query : queries) { + expected = computeActual(query); + resultPushdownOr = computeActual(sessionWithOr, query); + resultPushdown = computeActual(sessionWithoutOR, query); + System.out.println("Query [ " + query + " ]"); + + assertEquals(expected.getMaterializedRows(), resultPushdown.getMaterializedRows()); + assertEquals(expected.getMaterializedRows(), resultPushdownOr.getMaterializedRows()); + System.out.println("Type(" + type + ")\n-------------\n" + resultPushdown.getMaterializedRows().size()); + } + } + finally { + assertUpdate("DROP TABLE IF EXISTS test_predicate_or_NULL_tmp"); + assertUpdate("DROP TABLE IF EXISTS test_predicate_or_NULL"); + } + } + + @Test + public void testUpdateAndDeleteForBooleanColumn() + { + assertUpdate("DROP TABLE IF EXISTS tab_bkt_009"); + assertUpdate(autoVacuumSession, "CREATE TABLE tab_bkt_009 (aa tinyint, bb smallint, cc int, " + + "dd bigint, ee boolean, ff real, gg double, hh varchar(10), ii varbinary, jj timestamp, kk decimal,ll decimal(10, 8),mm date,nn char(6)) " + + "with (bucket_count=2, bucketed_by=array ['dd'], format='orc', transactional=true)"); + + assertUpdate("insert into tab_bkt_009 values (tinyint'21', smallint'31', 810, 11111, boolean'0', 111.111," + + "111111111.111111, 'hello_111', varbinary'/', timestamp'2019-09-11 01:00:00'," + + "51, 11.11, date '2019-09-11', 'work_1')", 1); + assertUpdate("insert into tab_bkt_009 values (tinyint'22', smallint'32', 820, 22222, boolean'0', 222.222," + + "222222222.222222, 'hello_222', varbinary'/', timestamp'2019-09-14 02:00:00', 52, 22.22," + + "date '2019-09-14', 'work_2')", 1); + assertUpdate("update tab_bkt_009 set bb=smallint'10' where ee=boolean'0'", 2); + + assertUpdate("insert into tab_bkt_009 values (tinyint'23', smallint'33', 830, 999930, boolean'0', 3.3, " + + "3.03,'hello_3', varbinary'/', timestamp'2019-09-13 15:00:03', 53, 30.33, date '2019-09-13', 'work_3')", 1); + assertUpdate("insert into tab_bkt_009 values (tinyint'24', smallint'34', 
840, 999940, boolean'1', 4.4, " + + "4.04,'hello_4', varbinary'/', timestamp'2019-09-14 15:00:04', 54, 40.34, date '2019-09-14', 'work_4')", 1); + assertUpdate("insert into tab_bkt_009 values (tinyint'26', smallint'36', 860, 999960, boolean'0', 6.6, " + + "6.06,'hello_6', varbinary'/', timestamp'2019-09-16 15:00:06', 56, 60.36, date '2019-09-16', 'work_6')", 1); + assertUpdate("delete from tab_bkt_009 where mm=date'2019-09-14'", 2); + assertUpdate("delete from tab_bkt_009 where mm=date'2019-09-16'", 1); + + assertUpdate(String.format("DROP TABLE tab_bkt_009")); + } + + @Test + public void testVacuumForBooleanColumn() + { + assertUpdate("DROP TABLE IF EXISTS tab_bkt_010"); + assertUpdate(autoVacuumSession, "CREATE TABLE tab_bkt_010 (aa tinyint, bb smallint, cc int, " + + "dd bigint, ee boolean, ff real, gg double, hh varchar(10), ii varbinary, jj timestamp, kk decimal,ll decimal(10, 8),mm date,nn char(6)) " + + "with (bucket_count=2, bucketed_by=array ['dd'], format='orc', transactional=true)"); + + assertUpdate("insert into tab_bkt_010 values (tinyint'23', smallint'33', 830, 999930, boolean'0', 3.3, 3.03," + + "'hello_3', varbinary'/', timestamp'2019-09-13 15:00:03', 53, 30.33, date '2019-09-13', 'work_3')", 1); + assertUpdate("insert into tab_bkt_010 values (tinyint'24', smallint'34', 840, 999940, boolean'1', 4.4, 4.04," + + "'hello_4', varbinary'/', timestamp'2019-09-14 15:00:04', 54, 40.34, date '2019-09-14', 'work_4')", 1); + assertUpdate("insert into tab_bkt_010 values (tinyint'23', smallint'33', 830, 999930, boolean'0', 3.3, 3.03," + + "'hello_3', varbinary'/', timestamp'2019-09-13 15:00:03', 53, 30.33, date '2019-09-13', 'work_3')", 1); + assertUpdate("insert into tab_bkt_010 values (tinyint'24', smallint'34', 840, 999940, boolean'1', 4.4, 4.04," + + "'hello_4', varbinary'/', timestamp'2019-09-14 15:00:04', 54, 40.34, date '2019-09-14', 'work_4')", 1); + assertUpdate(String.format("VACUUM TABLE tab_bkt_010 AND WAIT"), 4); + assertUpdate("delete from tab_bkt_010 where mm=date'2019-09-14'", 2); + + assertUpdate(String.format("DROP TABLE tab_bkt_010")); + } + + @Test + public void testCteReuse() + { + MaterializedResult result = getQueryRunner().execute("with customer_total_return " + + " as (select lineitem.orderkey, sum(totalprice) as finalprice " + + " from lineitem, " + + "orders " + + " where lineitem.orderkey=orders.orderkey " + + " group by lineitem.orderkey) " + + "select ctr1.orderkey " + + "from customer_total_return ctr1, orders " + + "where ctr1.finalprice < (select Avg(finalprice) * 1.2 " + + "from customer_total_return ctr2 " + + "where ctr2.orderkey=ctr1.orderkey) " + + "and ctr1.orderkey=orders.orderkey limit 100"); + assertEquals(result.getRowCount(), 100); + + result = getQueryRunner().execute("with ss as (select * from orders), sd as (select * from ss) " + + " select * from ss,sd where ss.orderkey = sd.orderkey"); + assertEquals(result.getRowCount(), 15000); + } + + private void setUpNodes() + { + ImmutableList.Builder nodeBuilder = ImmutableList.builder(); + nodeBuilder.add(new InternalNode("other1", URI.create("http://10.0.0.1:11"), io.prestosql.client.NodeVersion.UNKNOWN, false)); + nodeBuilder.add(new InternalNode("other2", URI.create("http://10.0.0.1:12"), io.prestosql.client.NodeVersion.UNKNOWN, false)); + nodeBuilder.add(new InternalNode("other3", URI.create("http://10.0.0.1:13"), NodeVersion.UNKNOWN, false)); + ImmutableList nodes = nodeBuilder.build(); + nodeManager.addNode(CONNECTOR_ID, nodes); + } + + @Test + public void 
testRuseExchangeGroupSplitsMatchingBetweenProducerConsumer() + { + setUpNodes(); + NodeTaskMap nodeTaskMap = new NodeTaskMap(new FinalizerService()); + StageId stageId = new StageId(new QueryId("query"), 0); + UUID uuid = UUID.randomUUID(); + + PlanFragment testFragmentProducer = createTableScanPlanFragment("build", ReuseExchangeOperator.STRATEGY.REUSE_STRATEGY_PRODUCER, uuid, 1); + + PlanNodeId tableScanNodeId = new PlanNodeId("plan_id"); + StageExecutionPlan producerStageExecutionPlan = new StageExecutionPlan( + testFragmentProducer, + ImmutableMap.of(tableScanNodeId, new ConnectorAwareSplitSource(CONNECTOR_ID, createFixedSplitSource(0, TestingSplit::createRemoteSplit))), + ImmutableList.of(), + ImmutableMap.of(tableScanNodeId, new TableInfo(new QualifiedObjectName("test", TEST_SCHEMA, "test"), TupleDomain.all()))); + + SqlStageExecution producerStage = createSqlStageExecution( + stageId, + new TestSqlTaskManager.MockLocationFactory().createStageLocation(stageId), + producerStageExecutionPlan.getFragment(), + producerStageExecutionPlan.getTables(), + new MockRemoteTaskFactory(remoteTaskExecutor, remoteTaskScheduledExecutor), + TEST_SESSION_REUSE, + true, + nodeTaskMap, + remoteTaskExecutor, + new NoOpFailureDetector(), + new SplitSchedulerStats(), + new DynamicFilterService(new LocalStateStoreProvider( + new SeedStoreManager(new FileSystemClientManager()))), + new QuerySnapshotManager(stageId.getQueryId(), NOOP_SNAPSHOT_UTILS, TEST_SESSION)); + + Set splits = createAndGetSplits(10); + Multimap producerAssignment = nodeSelector.computeAssignments(splits, ImmutableList.copyOf(taskMap.values()), Optional.of(producerStage)).getAssignments(); + PlanFragment testFragmentConsumer = createTableScanPlanFragment("build", ReuseExchangeOperator.STRATEGY.REUSE_STRATEGY_CONSUMER, uuid, 1); + StageExecutionPlan consumerStageExecutionPlan = new StageExecutionPlan( + testFragmentConsumer, + ImmutableMap.of(tableScanNodeId, new ConnectorAwareSplitSource(CONNECTOR_ID, createFixedSplitSource(0, TestingSplit::createRemoteSplit))), + ImmutableList.of(), + ImmutableMap.of(tableScanNodeId, new TableInfo(new QualifiedObjectName("test", TEST_SCHEMA, "test"), TupleDomain.all()))); + + SqlStageExecution stage = createSqlStageExecution( + stageId, + new TestSqlTaskManager.MockLocationFactory().createStageLocation(stageId), + consumerStageExecutionPlan.getFragment(), + consumerStageExecutionPlan.getTables(), + new MockRemoteTaskFactory(remoteTaskExecutor, remoteTaskScheduledExecutor), + TEST_SESSION_REUSE, + true, + nodeTaskMap, + remoteTaskExecutor, + new NoOpFailureDetector(), + new SplitSchedulerStats(), + new DynamicFilterService(new LocalStateStoreProvider( + new SeedStoreManager(new FileSystemClientManager()))), + new QuerySnapshotManager(stageId.getQueryId(), NOOP_SNAPSHOT_UTILS, TEST_SESSION)); + Multimap consumerAssignment = nodeSelector.computeAssignments(splits, ImmutableList.copyOf(taskMap.values()), Optional.of(stage)).getAssignments(); + + assertEquals(consumerAssignment.size(), consumerAssignment.size()); + for (InternalNode node : consumerAssignment.keySet()) { + List splitList = new ArrayList<>(); + List splitList2 = new ArrayList<>(); + boolean b = producerAssignment.containsEntry(node, consumerAssignment.get(node)); + Collection producerSplits = producerAssignment.get(node); + Collection consumerSplits = producerAssignment.get(node); + producerSplits.forEach(s -> splitList.add(s)); + List splitList1 = splitList.get(0).getSplits(); + consumerSplits.forEach(s -> splitList2.add(s)); + int i = 0; + for 
(Split split3 : splitList1) { + SplitKey splitKey1 = new SplitKey(split3, TEST_CATALOG, TEST_SCHEMA, TEST_TABLE); + SplitKey splitKey2 = new SplitKey(splitList1.get(i), TEST_CATALOG, TEST_SCHEMA, TEST_TABLE); + boolean f = splitKey1.equals(splitKey2); + assertEquals(true, f); + i++; + } + } + } + + @Test + public void testRuseExchangeSplitsGroupNotMatchingBetweenProducerConsumer() + { + setUpNodes(); + NodeTaskMap nodeTaskMap = new NodeTaskMap(new FinalizerService()); + StageId stageId = new StageId(new QueryId("query"), 0); + UUID uuid = UUID.randomUUID(); + + PlanFragment testFragmentProducer = createTableScanPlanFragment("build", ReuseExchangeOperator.STRATEGY.REUSE_STRATEGY_PRODUCER, uuid, 1); + + PlanNodeId tableScanNodeId = new PlanNodeId("plan_id"); + StageExecutionPlan producerStageExecutionPlan = new StageExecutionPlan( + testFragmentProducer, + ImmutableMap.of(tableScanNodeId, new ConnectorAwareSplitSource(CONNECTOR_ID, createFixedSplitSource(0, TestingSplit::createRemoteSplit))), + ImmutableList.of(), + ImmutableMap.of(tableScanNodeId, new TableInfo(new QualifiedObjectName("test", TEST_SCHEMA, "test"), TupleDomain.all()))); + + SqlStageExecution producerStage = createSqlStageExecution( + stageId, + new TestSqlTaskManager.MockLocationFactory().createStageLocation(stageId), + producerStageExecutionPlan.getFragment(), + producerStageExecutionPlan.getTables(), + new MockRemoteTaskFactory(remoteTaskExecutor, remoteTaskScheduledExecutor), + TEST_SESSION_REUSE, + true, + nodeTaskMap, + remoteTaskExecutor, + new NoOpFailureDetector(), + new SplitSchedulerStats(), + new DynamicFilterService(new LocalStateStoreProvider( + new SeedStoreManager(new FileSystemClientManager()))), + new QuerySnapshotManager(stageId.getQueryId(), NOOP_SNAPSHOT_UTILS, TEST_SESSION)); + + Set producerSplits = createAndGetSplits(10); + Multimap producerAssignment = nodeSelector.computeAssignments(producerSplits, ImmutableList.copyOf(taskMap.values()), Optional.of(producerStage)).getAssignments(); + PlanFragment testFragmentConsumer = createTableScanPlanFragment("build", ReuseExchangeOperator.STRATEGY.REUSE_STRATEGY_CONSUMER, uuid, 1); + StageExecutionPlan consumerStageExecutionPlan = new StageExecutionPlan( + testFragmentConsumer, + ImmutableMap.of(tableScanNodeId, new ConnectorAwareSplitSource(CONNECTOR_ID, createFixedSplitSource(0, TestingSplit::createRemoteSplit))), + ImmutableList.of(), + ImmutableMap.of(tableScanNodeId, new TableInfo(new QualifiedObjectName("test", TEST_SCHEMA, "test"), TupleDomain.all()))); + + SqlStageExecution stage = createSqlStageExecution( + stageId, + new TestSqlTaskManager.MockLocationFactory().createStageLocation(stageId), + consumerStageExecutionPlan.getFragment(), + consumerStageExecutionPlan.getTables(), + new MockRemoteTaskFactory(remoteTaskExecutor, remoteTaskScheduledExecutor), + TEST_SESSION_REUSE, + true, + nodeTaskMap, + remoteTaskExecutor, + new NoOpFailureDetector(), + new SplitSchedulerStats(), + new DynamicFilterService(new LocalStateStoreProvider( + new SeedStoreManager(new FileSystemClientManager()))), + new QuerySnapshotManager(stageId.getQueryId(), NOOP_SNAPSHOT_UTILS, TEST_SESSION)); + Set consumerSplits = createAndGetSplits(50); + + try { + Multimap consumerAssignment = nodeSelector.computeAssignments(consumerSplits, ImmutableList.copyOf(taskMap.values()), Optional.of(stage)).getAssignments(); + } + catch (PrestoException e) { + assertEquals("Producer & consumer splits are not same", e.getMessage()); + return; + } + assertEquals(false, true); + } + + private void 
initSortBasedAggregation() + { + synchronized (TestHiveIntegrationSmokeTest.this) { + if (null == testSessionSort) { + this.testSessionSort = Session.builder(getSession()) + .setSystemProperty("sort_based_aggregation_enabled", "true") + .build(); + + this.testSessionSortPrcntDrv50 = Session.builder(getSession()) + .setSystemProperty("sort_based_aggregation_enabled", "true") + .setSystemProperty("prcnt_drivers_for_partial_aggr", "33") + .build(); + + this.testSessionSortPrcntDrv25 = Session.builder(getSession()) + .setSystemProperty("sort_based_aggregation_enabled", "true") + .setSystemProperty("prcnt_drivers_for_partial_aggr", "25") + .build(); + + this.testSessionSortPrcntDrv40 = Session.builder(getSession()) + .setSystemProperty("sort_based_aggregation_enabled", "true") + .setSystemProperty("prcnt_drivers_for_partial_aggr", "25") + .build(); + } + } + } + + @Test + public void sortAggSingleSort() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable"); + assertUpdate("drop table if exists sorttable"); + computeActual("create table unsorttable (orderkey int, year int) WITH (transactional = true , " + + "format = 'ORC', partitioned_by = ARRAY[ 'year' ] )"); + assertUpdate("insert into unsorttable values (1,2011)", 1); + assertUpdate("insert into unsorttable values (2,2012)", 1); + assertUpdate("insert into unsorttable values (2,2012)", 1); + assertUpdate("insert into unsorttable values (2,2012)", 1); + assertUpdate("insert into unsorttable values (3,2013)", 1); + assertUpdate("insert into unsorttable values (3,2013)", 1); + assertUpdate("insert into unsorttable values (3,2014)", 1); + + computeActual("create table sorttable with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=1, sorted_by = ARRAY['year']) as select * from unsorttable order by year"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year)," + + "year from sorttable group by year order by year"); + + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year)," + + "year from sorttable group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year), year from sorttable group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable"); + assertUpdate("DROP TABLE unsorttable"); + } + + @Test + public void sortAggSingleSortNoAggregation() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable1"); + assertUpdate("drop table if exists sorttable1"); + computeActual("create table unsorttable1 (orderkey int, year bigint) WITH (transactional = true , " + + "format = 'ORC', partitioned_by = ARRAY[ 'year'] )"); + assertUpdate("insert into unsorttable1 values (1,2011)", 1); + assertUpdate("insert into unsorttable1 values (2,2012)", 1); + assertUpdate("insert into unsorttable1 values (2,2012)", 1); + assertUpdate("insert into unsorttable1 values (2,2012)", 1); + assertUpdate("insert into unsorttable1 values (3,2013)", 1); + assertUpdate("insert into unsorttable1 values (3,2013)", 1); + assertUpdate("insert into unsorttable1 values (3,2014)", 1); + + computeActual("create table sorttable1 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=1, sorted_by = ARRAY['year']) as select * from unsorttable1 order by 
year"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select year from sorttable1 group by year order by year"); + MaterializedResult hashResult = computeActual("select year from sorttable1 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select year from sorttable1 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable1"); + assertUpdate("DROP TABLE unsorttable1"); + } + + @Test + public void sortAggBigint() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable2"); + assertUpdate("drop table if exists sorttable2"); + computeActual("create table unsorttable2 (orderkey int, year bigint) WITH (transactional = true , " + + "format = 'ORC', partitioned_by = ARRAY[ 'year'] )"); + assertUpdate("insert into unsorttable2 values (1,null)", 1); + assertUpdate("insert into unsorttable2 values (2,null)", 1); + assertUpdate("insert into unsorttable2 values (2,2012)", 1); + assertUpdate("insert into unsorttable2 values (2,2012)", 1); + assertUpdate("insert into unsorttable2 values (3,2013)", 1); + assertUpdate("insert into unsorttable2 values (3,2013)", 1); + assertUpdate("insert into unsorttable2 values (3,2014)", 1); + + computeActual("create table sorttable2 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=10, sorted_by = ARRAY['year']) as select * from unsorttable2 order by year"); + + assertUpdate("set session sort_based_aggregation_enabled=true"); + MaterializedResult sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year), year from sorttable2 group by year order by year"); + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year), year from sorttable2 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year), year from sorttable2 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable2"); + assertUpdate("DROP TABLE unsorttable2"); + } + + @Test + public void sortAggMultipleSort() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable3"); + assertUpdate("drop table if exists sorttable3"); + computeActual("create table unsorttable3 (number int, orderkey double, year double) WITH (transactional = true , " + + "format = 'ORC')"); + assertUpdate("insert into unsorttable3 values (1,11,2011)", 1); + assertUpdate("insert into unsorttable3 values (2,22,2012)", 1); + assertUpdate("insert into unsorttable3 values (3,33,2012)", 1); + assertUpdate("insert into unsorttable3 values (4,33,2012)", 1); + assertUpdate("insert into unsorttable3 values (4,44,2012)", 1); + assertUpdate("insert into unsorttable3 values (5,55,2013)", 1); + assertUpdate("insert into unsorttable3 values (6,66,2013)", 1); + assertUpdate("insert into unsorttable3 values (7,77,2014)", 1); + + computeActual("create table sorttable3 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey', 'year'], bucket_count=2, sorted_by = ARRAY['orderkey', 'year']) as select * from unsorttable3 order by orderkey,year"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select 
avg(orderkey), count(year)," + + "year from sorttable3 group by orderkey,year order by orderkey,year"); + + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year)," + + "year from sorttable3 group by orderkey,year order by orderkey,year"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year)," + + "year from sorttable3 group by year order by year"); + + hashResult = computeActual("select avg(orderkey), count(year)," + + "year from sorttable3 group by year order by year"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year), year from sorttable3 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable3"); + assertUpdate("DROP TABLE unsorttable3"); + } + + @Test + public void sortAggDateType() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable4"); + assertUpdate("drop table if exists sorttable4"); + computeActual("create table unsorttable4 (number int, orderkey decimal(10,4), year date) WITH (transactional = true , " + + "format = 'ORC')"); + assertUpdate("insert into unsorttable4 values (1,11.1,date '2011-07-20')", 1); + assertUpdate("insert into unsorttable4 values (2,22.2,date '2012-07-20')", 1); + assertUpdate("insert into unsorttable4 values (3,33.3,date '2013-07-20')", 1); + assertUpdate("insert into unsorttable4 values (4,33.3,date '2013-07-20')", 1); + assertUpdate("insert into unsorttable4 values (5,55.5,date '2013-07-20')", 1); + assertUpdate("insert into unsorttable4 values (6,66.6,date '2014-07-20')", 1); + assertUpdate("insert into unsorttable4 values (7,77.7,date '2015-07-20')", 1); + + computeActual("create table sorttable4 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=2, sorted_by = ARRAY['year']) as select * from unsorttable4 order by year"); + String query = "select avg(orderkey), count(year), year from sorttable4 group by year order by year"; + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable4"); + assertUpdate("DROP TABLE unsorttable4"); + } + + @Test + public void sortAggVarchar() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable5"); + assertUpdate("drop table if exists sorttable5"); + computeActual("create table unsorttable5 (number int, orderkey decimal(10,4), year varchar) WITH (transactional = true , " + + "format = 'ORC')"); + assertUpdate("insert into unsorttable5 values (1,11.1, '2011-07-20')", 1); + assertUpdate("insert into unsorttable5 values (2,22.2, '2012-07-20')", 1); + assertUpdate("insert into unsorttable5 values (3,33.3, '2013-07-20')", 1); + assertUpdate("insert into unsorttable5 values (4,33.3, '2013-07-20')", 1); + assertUpdate("insert into unsorttable5 values (5,55.5, '2013-07-20')", 1); + 
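The sort-aggregation tests in this file all repeat the same sort-vs-hash comparison. A minimal sketch of how that repetition could be factored out (illustrative only, not part of this patch; the helper name assertSortAggMatchesHash is invented, everything else is taken from the surrounding test code):

    // Illustrative only, not part of this patch: run one query twice, once with a
    // sort-based-aggregation session and once with the default hash-based session,
    // and require identical materialized rows.
    private void assertSortAggMatchesHash(Session sortSession, String query)
    {
        MaterializedResult sortResult = computeActual(sortSession, query);
        MaterializedResult hashResult = computeActual(query);
        assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows());
    }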
assertUpdate("insert into unsorttable5 values (6,66.6, '2014-07-20')", 1); + assertUpdate("insert into unsorttable5 values (7,77.7, '2015-07-20')", 1); + + computeActual("create table sorttable5 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=2, sorted_by = ARRAY['year']) as select * from unsorttable5 order by year"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year)," + + "year from sorttable5 group by year order by year"); + + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year)," + + "year from sorttable5 group by year order by year"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year), year from sorttable5 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable5"); + assertUpdate("DROP TABLE unsorttable5"); + } + + @Test + public void sortAggSmallint() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable6"); + assertUpdate("drop table if exists sorttable6"); + computeActual("create table unsorttable6 (orderkey int, year smallint) WITH (transactional = true , " + + "format = 'ORC', partitioned_by = ARRAY[ 'year' ] )"); + assertUpdate("insert into unsorttable6 values (1,smallint '2011')", 1); + assertUpdate("insert into unsorttable6 values (2,smallint '2012')", 1); + assertUpdate("insert into unsorttable6 values (2,smallint '2012')", 1); + assertUpdate("insert into unsorttable6 values (2,smallint '2012')", 1); + assertUpdate("insert into unsorttable6 values (3,smallint '2013')", 1); + assertUpdate("insert into unsorttable6 values (3,smallint '2014')", 1); + assertUpdate("insert into unsorttable6 values (3,smallint '2015')", 1); + + computeActual("create table sorttable6 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=1, sorted_by = ARRAY['year']) as select * from unsorttable6 order by year"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year)," + + "year from sorttable6 group by year order by year"); + + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year)," + + "year from sorttable6 group by year order by year"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year), year from sorttable6 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable6"); + assertUpdate("DROP TABLE unsorttable6"); + } + + @Test + public void sortAggBoolean() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable7"); + assertUpdate("drop table if exists sorttable7"); + computeActual("create table unsorttable7 (orderkey int, year int, iscurrentemployee boolean ) WITH (transactional = true , " + + "format = 'ORC')"); + assertUpdate("insert into unsorttable7 values (1,2011, true)", 1); + assertUpdate("insert into unsorttable7 values (2,2012, true)", 1); + assertUpdate("insert into unsorttable7 values (2,2012, true)", 1); + assertUpdate("insert into unsorttable7 values (2,2012, false)", 1); + assertUpdate("insert into unsorttable7 values (3,2013, false)", 1); + 
assertUpdate("insert into unsorttable7 values (3,2013, true)", 1); + assertUpdate("insert into unsorttable7 values (3,2014, false)", 1); + + computeActual("create table sorttable7 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['iscurrentemployee'], bucket_count=1, sorted_by = ARRAY['iscurrentemployee'])" + + " as select * from unsorttable7 order by iscurrentemployee"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year)," + + "iscurrentemployee from sorttable7 group by iscurrentemployee order by iscurrentemployee"); + + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year)," + + "iscurrentemployee from sorttable7 group by iscurrentemployee order by iscurrentemployee"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year), iscurrentemployee from sorttable7 group by iscurrentemployee order by iscurrentemployee"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable7"); + assertUpdate("DROP TABLE unsorttable7"); + } + + @Test + public void sortAggSplitWithMultiplePagesBigint() + { + initSortBasedAggregation(); + // this Test case we will insert many rows , so that single split will yield many pages, groub & sort by bigint + + assertUpdate("drop table if exists unsorttable8"); + assertUpdate("drop table if exists sorttable8"); + computeActual("create table unsorttable8 (orderkey int, year bigint) WITH (transactional = false , " + + "format = 'ORC')"); + + String str = generateNumberOfRowsForTwoColumns(2500, 10); + + assertUpdate("insert into unsorttable8 values " + str, 2510); + + computeActual("create table sorttable8 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=1, sorted_by = ARRAY['year']) as select * from unsorttable8 order by year"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year)," + + "year from sorttable8 group by year order by year"); + + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year)," + + "year from sorttable8 group by year order by year"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year)," + + "year from sorttable8 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv25, "select avg(orderkey), count(year)," + + "year from sorttable8 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable8"); + assertUpdate("DROP TABLE unsorttable8"); + } + + @Test + public void sortAggSplitWithMultiplePagesBigintMultiSort() + { + // this Test case we will insert many rows , so that single split will yield many pages, groub & sort by bigint + assertUpdate("drop table if exists unsorttable9"); + assertUpdate("drop table if exists sorttable9"); + computeActual("create table unsorttable9 (code int, orderkey int, year bigint) WITH (transactional = false , " + + "format = 'ORC')"); + + String str = generateNumberOfRowsForThreeColumns(2500); + + assertUpdate("insert into unsorttable9 values " + str, 5000); + + computeActual("create table 
sorttable9 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=10, sorted_by = ARRAY['year', 'orderkey']) as select * from unsorttable9"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select avg(orderkey), count(year)," + + "year from sorttable9 group by year order by year"); + + MaterializedResult hashResult = computeActual("select avg(orderkey), count(year)," + + "year from sorttable9 group by year order by year"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select avg(orderkey), count(year), year from sorttable9 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv25, "select avg(orderkey), count(year), year from sorttable9 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable9"); + assertUpdate("DROP TABLE unsorttable9"); + } + + @Test + public void sortAggSplitWithMultiplePagesInt() + { + initSortBasedAggregation(); + // this Test case we will insert many rows , so that single split will yield many pages, groub & sort by int + assertUpdate("drop table if exists unsorttable10"); + assertUpdate("drop table if exists sorttable10"); + computeActual("create table unsorttable10 (orderkey int, year int) WITH (transactional = false , " + + "format = 'ORC')"); + + String str = generateNumberOfRowsForTwoColumns(2500, 5); + + assertUpdate("insert into unsorttable10 values " + str, 2505); + + computeActual("create table sorttable10 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year'], bucket_count=1, sorted_by = ARRAY['year']) as select * from unsorttable10 order by year"); + + MaterializedResult sortResult = computeActual(testSessionSort, "select count(orderkey), count(year)," + + "year from sorttable10 group by year order by year"); + + MaterializedResult hashResult = computeActual("select count(orderkey), count(year), " + + "year from sorttable10 group by year order by year"); + + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv25, "select count(orderkey), count(year)," + + "year from sorttable10 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select count(orderkey), count(year)," + + "year from sorttable10 group by year order by year"); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + assertUpdate("DROP TABLE sorttable10"); + assertUpdate("DROP TABLE unsorttable10"); + } + + @Test + public void sortAggNullAndZero() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists unsorttable11"); + assertUpdate("drop table if exists sorttable11"); + computeActual("create table unsorttable11 (number int, orderkey int, year int) WITH (transactional = true , " + + "format = 'ORC')"); + assertUpdate("insert into unsorttable11 values (1, null, null)", 1); + assertUpdate("insert into unsorttable11 values (1, null, null)", 1); + assertUpdate("insert into unsorttable11 values (2, 0, null)", 1); + assertUpdate("insert into unsorttable11 values (2, 0, null)", 1); + assertUpdate("insert into unsorttable11 values (4, null, 0)", 1); + assertUpdate("insert 
into unsorttable11 values (5, null, 0)", 1); + assertUpdate("insert into unsorttable11 values (6, 0, 33)", 1); + assertUpdate("insert into unsorttable11 values (7, 0, 33)", 1); + assertUpdate("insert into unsorttable11 values (8, 33, 0)", 1); + assertUpdate("insert into unsorttable11 values (9, 33, 0)", 1); + assertUpdate("insert into unsorttable11 values (10, 33, null)", 1); + assertUpdate("insert into unsorttable11 values (11, 33, null)", 1); + assertUpdate("insert into unsorttable11 values (12, 33, null)", 1); + assertUpdate("insert into unsorttable11 values (13, null, 33)", 1); + assertUpdate("insert into unsorttable11 values (13, null, 33)", 1); + assertUpdate("insert into unsorttable11 values (12, null, 33)", 1); + assertUpdate("insert into unsorttable11 values (14, 33, 66)", 1); + assertUpdate("insert into unsorttable11 values (15, 55, 77 )", 1); + assertUpdate("insert into unsorttable11 values (16, 66, 88)", 1); + assertUpdate("insert into unsorttable11 values (17, 77, 99)", 1); + + computeActual("create table sorttable11 with(transactional = false, " + + "format = 'ORC', bucketed_by=array['year', 'orderkey'], bucket_count=1, sorted_by = ARRAY['year', 'orderkey']) as select * from unsorttable11 order by year"); + String query = "select sum (number), avg(orderkey ), count(year)," + + "year from sorttable11 group by year order by year"; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + query = "select sum (number), avg(CASE WHEN orderkey IS NULL THEN 0 ELSE orderkey END), count(CASE WHEN year IS NULL THEN 0 ELSE year END)," + + "year from sorttable11 group by year, orderkey order by year, orderkey"; + + sortResult = computeActual(testSessionSort, query); + hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE sorttable11"); + assertUpdate("DROP TABLE unsorttable11"); + } + + private String generateNumberOfRowsForTwoColumns(int numberOfRows, int numberOfNullRows) + { + String str = ""; + String str1; + for (int i = 0; i < numberOfRows; i++) { + str1 = " ( " + (i + 200) + " , " + i + " ) "; + str = str.concat(str1); + if (i != numberOfRows - 1) { + str = str.concat(","); + } + } + + if (numberOfNullRows != 0) { + str = str.concat(","); + } + for (int i = 0; i < numberOfNullRows; i++) { + str1 = " ( " + (i + 200) + " , null ) "; + str = str.concat(str1); + if (i != numberOfNullRows - 1) { + str = str.concat(","); + } + } + return str; + } + + private String generateNumberOfRowsForThreeColumns(int numberOfRows) + { + String str = ""; + String str1; + for (int i = 0; i < numberOfRows; i++) { + //str1 = " ( " + (i + 200) + " , " + i + " ) "; + str1 = " ( " + (i + 300) + " , " + (i + 200) + " , " + i + " ), "; + str = str.concat(str1); + str1 = " ( " + (i + 600) + " , " + (i + 500) + " , " + i + " ) "; + str = str.concat(str1); + if (i != numberOfRows - 1) { + str = str.concat(","); + } + } + return str; + } + + private Set createAndGetSplits(long start) + { + HiveConfig config = new HiveConfig(); + 
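    // Descriptive note on the fixture below (editorial, restating what the code does):
    // the loop fabricates three HiveSplitWrappers, each bundling three HiveSplits for
    // the test schema/table with offsets derived from `start`, and wraps them as
    // task-wide Splits so the reuse-exchange tests above can feed them to
    // nodeSelector.computeAssignments(...).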
config.setHiveStorageFormat(HiveStorageFormat.ORC); + config.setHiveCompressionCodec(NONE); + Properties splitProperties = new Properties(); + splitProperties.setProperty(FILE_INPUT_FORMAT, config.getHiveStorageFormat().getInputFormat()); + splitProperties.setProperty(SERIALIZATION_LIB, config.getHiveStorageFormat().getSerDe()); + splitProperties.setProperty("columns", Joiner.on(',').join(TestHivePageSink.getColumnHandles().stream().map(HiveColumnHandle::getName).collect(toList()))); + splitProperties.setProperty("columns.types", Joiner.on(',').join(TestHivePageSink.getColumnHandles().stream().map(HiveColumnHandle::getHiveType).map(hiveType -> hiveType.getHiveTypeName().toString()).collect(toList()))); + List connectorSplits1 = new ArrayList<>(); + + for (long j = start; j < start + 30; j += 10) { + List hiveSplitList = new ArrayList<>(); + for (int i = 0; i < 3; i++) { + HiveSplit hiveSplit = new HiveSplit( + TEST_SCHEMA, + TEST_TABLE, + "", + "file:///", + i + j, + 100 + i + j, + 100 + i + j, + 0, + splitProperties, + ImmutableList.of(), + ImmutableList.of(), + OptionalInt.empty(), + false, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + Optional.empty(), + false, + ImmutableMap.of()); + hiveSplitList.add(hiveSplit); + } + + HiveSplitWrapper split2 = HiveSplitWrapper.wrap(hiveSplitList, OptionalInt.empty()); + connectorSplits1.add(split2); + } + + ImmutableList.Builder result = ImmutableList.builder(); + for (ConnectorSplit connectorSplit : connectorSplits1) { + result.add(new Split(CONNECTOR_ID, connectorSplit, Lifespan.taskWide())); + } + List splitList = result.build(); + Set set = splitList.stream().collect(Collectors.toSet()); + return set; + } + + @Test + public void sortAggBasicAggreTestOnTpch() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists sortLineitem"); + assertUpdate("drop table if exists orders_orderkey_totalprice"); + + computeActual("create table sortLineitem with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey', 'partkey'], bucket_count=4, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem"); + + String query = "select count(partkey), count(orderkey), orderkey from sortLineitem group by orderkey, partkey order by orderkey, partkey"; + MaterializedResult sortResult = computeActual(testSessionSort, query); + + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.toString(), hashResult.toString()); + + computeActual("create table orders_orderkey_totalprice with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey', 'totalprice'], bucket_count=4, sorted_by = ARRAY['orderkey', 'totalprice'])" + + " as select * from tpch.tiny.orders"); + + sortResult = computeActual(testSessionSortPrcntDrv50, "select count(totalprice), count(orderkey)," + + "orderkey from orders_orderkey_totalprice group by orderkey, totalprice order by orderkey, totalprice"); + + hashResult = computeActual("select count(totalprice), count(orderkey)," + + "orderkey from orders_orderkey_totalprice group by orderkey, totalprice order by orderkey, totalprice"); + assertEquals(sortResult.toString(), hashResult.toString()); + + assertUpdate("DROP TABLE sortLineitem"); + assertUpdate("DROP TABLE orders_orderkey_totalprice"); + } + + @Test + public void sortAggInnerJoin() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists lineitemSortBy_orderkey_inner"); + assertUpdate("drop table if exists ordersSortBy_orderkey_inner"); + computeActual("create 
table lineitemSortBy_orderkey_inner with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey'], bucket_count=1, sorted_by = ARRAY['orderkey'])" + + " as select * from tpch.tiny.lineitem"); + + computeActual("create table ordersSortBy_orderkey_inner with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey'], bucket_count=1, sorted_by = ARRAY['orderkey'])" + + " as select * from tpch.tiny.orders"); + assertUpdate("set session sort_based_aggregation_enabled=true"); + String query = "select avg(lineitemSortBy_orderkey_inner.orderkey),lineitemSortBy_orderkey_inner.orderkey from lineitemSortBy_orderkey_inner " + + "INNER JOIN ordersSortBy_orderkey_inner ON lineitemSortBy_orderkey_inner.orderkey = ordersSortBy_orderkey_inner.orderkey " + + "group by lineitemSortBy_orderkey_inner.orderkey " + + "order by lineitemSortBy_orderkey_inner.orderkey"; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.toString(), hashResult.toString()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.toString(), hashResult.toString()); + sortResult = computeActual(testSessionSortPrcntDrv25, query); + assertEquals(sortResult.toString(), hashResult.toString()); + + assertUpdate("DROP TABLE lineitemSortBy_orderkey_inner"); + assertUpdate("DROP TABLE ordersSortBy_orderkey_inner"); + } + + @Test + public void sortAggLeftJoin() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists lineitemSortBy_orderkey_left"); + assertUpdate("drop table if exists ordersSortBy_orderkey_left"); + computeActual("create table lineitemSortBy_orderkey_left with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey'], bucket_count=1, sorted_by = ARRAY['orderkey'])" + + " as select * from tpch.tiny.lineitem"); + + computeActual("create table ordersSortBy_orderkey_left with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey'], bucket_count=1, sorted_by = ARRAY['orderkey'])" + + " as select * from tpch.tiny.orders"); + assertUpdate("set session sort_based_aggregation_enabled=true"); + String query = "select avg(lineitemSortBy_orderkey_left.orderkey),lineitemSortBy_orderkey_left.orderkey from lineitemSortBy_orderkey_left " + + "LEFT JOIN " + + "ordersSortBy_orderkey_left ON lineitemSortBy_orderkey_left.orderkey = ordersSortBy_orderkey_left.orderkey " + + "group by lineitemSortBy_orderkey_left.orderkey " + + "order by lineitemSortBy_orderkey_left.orderkey"; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv25, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE lineitemSortBy_orderkey_left"); + assertUpdate("DROP TABLE ordersSortBy_orderkey_left"); + } + + @Test + public void sortAggRightJoin() + { + initSortBasedAggregation(); + assertUpdate("drop table if exists lineitemSortBy_orderkey_right"); + + computeActual("create table 
lineitemSortBy_orderkey_right with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey'], bucket_count=1, sorted_by = ARRAY['orderkey'])" + + " as select * from tpch.tiny.lineitem"); + assertUpdate("set session sort_based_aggregation_enabled=true"); + String query = "select count(lineitemSortBy_orderkey_right.orderkey), lineitemSortBy_orderkey_right.orderkey from lineitemSortBy_orderkey_right " + + "RIGHT JOIN " + + "tpch.tiny.orders ON lineitemSortBy_orderkey_right.orderkey = tpch.tiny.orders.orderkey " + + "group by " + + "lineitemSortBy_orderkey_right.orderkey " + + "order by " + + "lineitemSortBy_orderkey_right.orderkey"; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv25, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE lineitemSortBy_orderkey_right"); + } + + @Test + public void sortAggInnerLeftJoin() + { + initSortBasedAggregation(); + computeActual("create table lineitem_orderkey_partkey_InnerLeftJoin with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey', 'partkey'], bucket_count=1, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem"); + + computeActual("create table shortlineitem_InnerLeftJoin with(transactional = false, format = 'ORC') as select * from tpch.tiny.lineitem limit 10000"); + String query = "select avg(lineitem_orderkey_partkey_InnerLeftJoin.orderkey), lineitem_orderkey_partkey_InnerLeftJoin.orderkey " + + "from " + + "lineitem_orderkey_partkey_InnerLeftJoin " + + "INNER JOIN " + + "shortlineitem_InnerLeftJoin ON lineitem_orderkey_partkey_InnerLeftJoin.orderkey = shortlineitem_InnerLeftJoin.orderkey " + + "Left JOIN " + + " tpch.tiny.orders ON lineitem_orderkey_partkey_InnerLeftJoin.orderkey = tpch.tiny.orders.orderkey " + + "group by lineitem_orderkey_partkey_InnerLeftJoin.orderkey, lineitem_orderkey_partkey_InnerLeftJoin.partkey " + + "order by lineitem_orderkey_partkey_InnerLeftJoin.orderkey, lineitem_orderkey_partkey_InnerLeftJoin.partkey"; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE lineitem_orderkey_partkey_InnerLeftJoin"); + assertUpdate("DROP TABLE shortlineitem_InnerLeftJoin"); + } + + @Test + public void sortAggInnerRightJoin() + { + initSortBasedAggregation(); + computeActual("create table lineitem_orderkey_partkey_innerRight with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey', 'partkey'], bucket_count=4, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from 
tpch.tiny.lineitem"); + + computeActual("create table shortlineitem_InnerRightJoin with(transactional = false, format = 'ORC') as select * from tpch.tiny.lineitem limit 10000"); + String query = "select avg(lineitem_orderkey_partkey_innerRight.orderkey), lineitem_orderkey_partkey_innerRight.orderkey " + + "from lineitem_orderkey_partkey_innerRight " + + "INNER JOIN " + + "shortlineitem_InnerRightJoin ON lineitem_orderkey_partkey_innerRight.orderkey = shortlineitem_InnerRightJoin.orderkey " + + "RIGHT JOIN " + + "tpch.tiny.orders ON lineitem_orderkey_partkey_innerRight.orderkey = tpch.tiny.orders.orderkey " + + "group by " + + "lineitem_orderkey_partkey_innerRight.orderkey, lineitem_orderkey_partkey_innerRight.partkey " + + "order by " + + "lineitem_orderkey_partkey_innerRight.orderkey, lineitem_orderkey_partkey_innerRight.partkey"; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + + assertUpdate("DROP TABLE lineitem_orderkey_partkey_innerRight"); + assertUpdate("DROP TABLE shortlineitem_InnerRightJoin"); + } + + @Test + public void testCachedPlanTableValidation() + { + assertUpdate("CREATE TABLE table_plan_cache_001 (id int)"); + assertUpdate("INSERT INTO table_plan_cache_001 VALUES(1)", 1); + assertUpdate("INSERT INTO table_plan_cache_001 VALUES(2)", 1); + assertUpdate("INSERT INTO table_plan_cache_001 VALUES(3)", 1); + MaterializedResult result = computeActual("SELECT * from table_plan_cache_001 where id = 1"); + assertEquals(result.getRowCount(), 1); + assertUpdate("DROP TABLE table_plan_cache_001"); + assertUpdate("CREATE TABLE table_plan_cache_001 (id int) with (transactional=true)"); + assertUpdate("INSERT INTO table_plan_cache_001 VALUES(1)", 1); + assertUpdate("INSERT INTO table_plan_cache_001 VALUES(2)", 1); + assertUpdate("INSERT INTO table_plan_cache_001 VALUES(3)", 1); + result = computeActual("SELECT * from table_plan_cache_001 where id = 1"); + assertEquals(result.getRowCount(), 1); + assertUpdate("DROP TABLE table_plan_cache_001"); + } + + @Test + public void sortAggPartitionBucketCount1() + { + initSortBasedAggregation(); + computeActual("create table lineitem_orderkey_partkey_partition with(transactional = false, " + + "format = 'ORC', partitioned_by = ARRAY['comment'], bucketed_by=array['orderkey'], bucket_count=1, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem limit 100"); + + String query = "select avg(lineitem_orderkey_partkey_partition.orderkey), lineitem_orderkey_partkey_partition.orderkey " + + "from lineitem_orderkey_partkey_partition " + + "group by " + + "comment, orderkey, partkey " + + "order by " + + "comment, orderkey, partkey "; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + 
assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + assertUpdate("DROP TABLE lineitem_orderkey_partkey_partition"); + } + + @Test + public void sortAggPartitionBucketCount2() + { + initSortBasedAggregation(); + computeActual("create table lineitem_orderkey_partkey_partition2 with(transactional = false, " + + "format = 'ORC', partitioned_by = ARRAY['comment'], bucketed_by=array['orderkey', 'partkey'], bucket_count=2, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem limit 100"); + + String query = "select avg(lineitem_orderkey_partkey_partition2.orderkey), lineitem_orderkey_partkey_partition2.orderkey " + + "from lineitem_orderkey_partkey_partition2 " + + "group by comment, orderkey, partkey " + + "order by comment, orderkey, partkey "; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + assertUpdate("DROP TABLE lineitem_orderkey_partkey_partition2"); + } + + @Test + public void sortAggPartitionBucketCount1With2BucketColumns() + { + initSortBasedAggregation(); + computeActual("create table lineitem_orderkey_partkey_partition3 with(transactional = false, " + + "format = 'ORC', partitioned_by = ARRAY['comment'], bucketed_by=array['orderkey', 'partkey'], bucket_count=1, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem limit 100"); + + String query = "select avg(lineitem_orderkey_partkey_partition3.orderkey), lineitem_orderkey_partkey_partition3.orderkey " + + "from lineitem_orderkey_partkey_partition3 " + + "group by comment, orderkey, partkey " + + "order by comment, orderkey, partkey "; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + assertUpdate("DROP TABLE lineitem_orderkey_partkey_partition3"); + } + + @Test + public void sortAggPartition2BucketCount1With2BucketColumns() + { + initSortBasedAggregation(); + computeActual("create table lineitem_orderkey_partkey_partition4 with(transactional = false, " + + "format = 'ORC', partitioned_by = ARRAY['shipmode', 'comment'], bucketed_by=array['orderkey', 'partkey'], bucket_count=1, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem limit 100"); + + String query = "select avg(lineitem_orderkey_partkey_partition4.orderkey), lineitem_orderkey_partkey_partition4.orderkey " + + "from lineitem_orderkey_partkey_partition4 " + + "group by shipmode, comment, orderkey, partkey " + + "order by shipmode, comment, orderkey, partkey "; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), 
hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + assertUpdate("DROP TABLE lineitem_orderkey_partkey_partition4"); + } + + @Test + public void sortAggSemiJoin() + { + initSortBasedAggregation(); + + computeActual("create table lineitem_orderkey_partkey_SemiJoin with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey', 'partkey'], bucket_count=1, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem"); + + computeActual("create table lineitem_semiJoin with(transactional = false, format = 'ORC')" + + " as select * from tpch.tiny.lineitem where partkey is not null"); + + String query = "select avg(lineitem_orderkey_partkey_SemiJoin.orderkey), lineitem_orderkey_partkey_SemiJoin.orderkey " + + " from lineitem_orderkey_partkey_SemiJoin " + + " where orderkey " + + " in (select orderkey from lineitem_semiJoin) " + + " group by orderkey, partkey " + + " order by orderkey, partkey "; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + assertUpdate("DROP TABLE lineitem_orderkey_partkey_SemiJoin"); + assertUpdate("DROP TABLE lineitem_semiJoin"); + } + + @Test + public void SortAggreDistinct() + { + initSortBasedAggregation(); + computeActual("create table lineitem_orderkey_partkey_Distinct with(transactional = false, " + + "format = 'ORC', bucketed_by=array['orderkey', 'partkey'], bucket_count=1, sorted_by = ARRAY['orderkey', 'partkey'])" + + " as select * from tpch.tiny.lineitem"); + + String query = "select sum(distinct(partkey)), orderkey from lineitem_orderkey_partkey_Distinct " + + "group by orderkey order by orderkey"; + + MaterializedResult sortResult = computeActual(testSessionSort, query); + MaterializedResult hashResult = computeActual(query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv50, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + sortResult = computeActual(testSessionSortPrcntDrv40, query); + assertEquals(sortResult.getMaterializedRows(), hashResult.getMaterializedRows()); + assertUpdate("DROP TABLE lineitem_orderkey_partkey_Distinct"); + } + + @Test + public void testFailRefreshMetaCache() + { + assertQueryFails("REFRESH META CACHE FOR abc", "Catalog does not exist:abc"); + assertQueryFails("REFRESH META CACHE FOR abc.def", "Catalog does not exist:abc.def"); + } + + @Test + public void testCachedPlanForTablesWithSameName() + { + String table = "tab2"; + String schema = "default"; + assertUpdate(String.format("CREATE SCHEMA IF NOT EXISTS %s", schema)); + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int, c int) with (partitioned_by = ARRAY['c'])", schema, table)); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 1, 1)", schema, table), 1); + 
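    // Descriptive note (editorial, inferred from the test name and flow): the same table
    // name is first created partitioned on 'c', queried, then dropped and recreated
    // without partitioning; querying it again verifies that a plan cached for the first
    // definition is not reused for the second definition with the same name.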
assertQuery(String.format("SELECT * FROM %s.%s", schema, table), "VALUES (1, 1, 1)"); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + assertUpdate(String.format("CREATE TABLE %s.%s (a int, b int, c int)", schema, table)); + assertUpdate(String.format("INSERT INTO %s.%s VALUES (1, 1, 1)", schema, table), 1); + assertQuery(String.format("SELECT * FROM %s.%s", schema, table), "VALUES (1, 1, 1)"); + assertUpdate(String.format("DROP TABLE %s.%s", schema, table)); + } + + @Test + public void testAcidFormatColumnNameConflict() + { + assertUpdate(String.format("CREATE TABLE test_acid_columnname_conflict (originalTransaction int, currentTransaction int," + + " rowId int, bucket int, row int )" + + "with (transactional=true, format='orc')")); + + assertUpdate(String.format("INSERT INTO test_acid_columnname_conflict VALUES (1, 2, 3, 4, 5)"), 1); + + assertQuery("SELECT * FROM test_acid_columnname_conflict", "VALUES (1, 2, 3, 4, 5)"); + + assertUpdate(String.format("DROP TABLE test_acid_columnname_conflict")); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveLocationService.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveLocationService.java new file mode 100644 index 00000000..dd65898d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveLocationService.java @@ -0,0 +1,108 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.TestBackgroundHiveSplitLoader.TestingHdfsEnvironment; +import io.prestosql.spi.PrestoException; +import org.apache.hadoop.fs.Path; +import org.testng.annotations.Test; + +import java.util.Optional; + +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_EXISTING_DIRECTORY; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_NEW_DIRECTORY; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.STAGE_AND_MOVE_TO_TARGET_DIRECTORY; +import static org.testng.Assert.assertEquals; + +public class TestHiveLocationService +{ + @Test + public void testGetTableWriteInfoAppend() + { + assertThat(locationHandle(STAGE_AND_MOVE_TO_TARGET_DIRECTORY), false) + .producesWriteInfo(new LocationService.WriteInfo( + new Path("/target"), + new Path("/write"), + STAGE_AND_MOVE_TO_TARGET_DIRECTORY)); + + assertThat(locationHandle(DIRECT_TO_TARGET_EXISTING_DIRECTORY, "/target", "/target"), false) + .producesWriteInfo(new LocationService.WriteInfo( + new Path("/target"), + new Path("/target"), + DIRECT_TO_TARGET_EXISTING_DIRECTORY)); + + assertThat(locationHandle(DIRECT_TO_TARGET_NEW_DIRECTORY, "/target", "/target"), false) + .producesWriteInfo(new LocationService.WriteInfo( + new Path("/target"), + new Path("/target"), + DIRECT_TO_TARGET_NEW_DIRECTORY)); + } + + @Test + public void testGetTableWriteInfoOverwriteSuccess() + { + assertThat(locationHandle(STAGE_AND_MOVE_TO_TARGET_DIRECTORY), true) + .producesWriteInfo(new LocationService.WriteInfo( + new Path("/target"), + new Path("/write"), + STAGE_AND_MOVE_TO_TARGET_DIRECTORY)); + } + + @Test(expectedExceptions = PrestoException.class, expectedExceptionsMessageRegExp = "Overwriting unpartitioned table not supported when writing directly to target directory") + public void testGetTableWriteInfoOverwriteFailDirectNew() + { + assertThat(locationHandle(DIRECT_TO_TARGET_NEW_DIRECTORY, "/target", "/target"), true); + } + + @Test(expectedExceptions = PrestoException.class, expectedExceptionsMessageRegExp = "Overwriting unpartitioned table not supported when writing directly to target directory") + public void testGetTableWriteInfoOverwriteFailDirectExisting() + { + assertThat(locationHandle(DIRECT_TO_TARGET_EXISTING_DIRECTORY, "/target", "/target"), true); + } + + private static Assertion assertThat(LocationHandle locationHandle, boolean overwrite) + { + return new Assertion(locationHandle, overwrite); + } + + public static class Assertion + { + private final LocationService.WriteInfo actual; + + public Assertion(LocationHandle locationHandle, boolean overwrite) + { + HdfsEnvironment hdfsEnvironment = new TestingHdfsEnvironment(ImmutableList.of()); + LocationService service = new HiveLocationService(hdfsEnvironment); + this.actual = service.getTableWriteInfo(locationHandle, overwrite); + } + + public void producesWriteInfo(LocationService.WriteInfo expected) + { + assertEquals(actual.getWritePath(), expected.getWritePath()); + assertEquals(actual.getTargetPath(), expected.getTargetPath()); + assertEquals(actual.getWriteMode(), expected.getWriteMode()); + } + } + + private static LocationHandle locationHandle(LocationHandle.WriteMode writeMode) + { + return locationHandle(writeMode, "/target", "/write"); + } + + private static LocationHandle locationHandle(LocationHandle.WriteMode writeMode, String targetPath, String writePath) + { + return new LocationHandle(new Path(targetPath), new 
Path(writePath), true, writeMode, Optional.empty()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveMetadata.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveMetadata.java new file mode 100644 index 00000000..8d7c5158 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveMetadata.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slices; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.predicate.NullableValue; +import io.prestosql.spi.type.TypeSignature; +import io.prestosql.spi.type.VarcharType; +import org.testng.annotations.Test; + +import java.util.Optional; + +import static io.prestosql.plugin.hive.HiveMetadata.createPredicate; + +public class TestHiveMetadata +{ + private static final HiveColumnHandle TEST_COLUMN_HANDLE = new HiveColumnHandle( + "test", + HiveType.HIVE_STRING, + TypeSignature.parseTypeSignature("varchar"), + 0, + HiveColumnHandle.ColumnType.PARTITION_KEY, + Optional.empty()); + + @Test(timeOut = 5000) + public void testCreatePredicate() + { + ImmutableList.Builder partitions = ImmutableList.builder(); + + for (int i = 0; i < 5_000; i++) { + partitions.add(new HivePartition( + new SchemaTableName("test", "test"), + Integer.toString(i), + ImmutableMap.of(TEST_COLUMN_HANDLE, NullableValue.of(VarcharType.VARCHAR, Slices.utf8Slice(Integer.toString(i)))))); + } + + createPredicate(ImmutableList.of(TEST_COLUMN_HANDLE), partitions.build()); + } + + @Test + public void testCreateOnlyNullsPredicate() + { + ImmutableList.Builder partitions = ImmutableList.builder(); + + for (int i = 0; i < 5; i++) { + partitions.add(new HivePartition( + new SchemaTableName("test", "test"), + Integer.toString(i), + ImmutableMap.of(TEST_COLUMN_HANDLE, NullableValue.asNull(VarcharType.VARCHAR)))); + } + + createPredicate(ImmutableList.of(TEST_COLUMN_HANDLE), partitions.build()); + } + + @Test + public void testCreateMixedPredicate() + { + ImmutableList.Builder partitions = ImmutableList.builder(); + + for (int i = 0; i < 5; i++) { + partitions.add(new HivePartition( + new SchemaTableName("test", "test"), + Integer.toString(i), + ImmutableMap.of(TEST_COLUMN_HANDLE, NullableValue.of(VarcharType.VARCHAR, Slices.utf8Slice(Integer.toString(i)))))); + } + + partitions.add(new HivePartition( + new SchemaTableName("test", "test"), + "null", + ImmutableMap.of(TEST_COLUMN_HANDLE, NullableValue.asNull(VarcharType.VARCHAR)))); + + createPredicate(ImmutableList.of(TEST_COLUMN_HANDLE), partitions.build()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSink.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSink.java new file mode 100644 index 00000000..932e0a5f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSink.java @@ -0,0 +1,431 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.util.concurrent.ListeningExecutorService; +import io.airlift.json.JsonCodec; +import io.airlift.slice.Slices; +import io.airlift.tpch.LineItem; +import io.airlift.tpch.LineItemColumn; +import io.airlift.tpch.LineItemGenerator; +import io.airlift.tpch.TpchColumnType; +import io.airlift.tpch.TpchColumnTypes; +import io.prestosql.GroupByHashPageIndexerFactory; +import io.prestosql.plugin.hive.authentication.GenericExceptionAction; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.PageIndexer; +import io.prestosql.spi.PageIndexerFactory; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.IntArrayBlock; +import io.prestosql.spi.connector.ConnectorPageSink; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.sql.gen.JoinCompiler; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.testing.TestingNodeManager; +import org.apache.hadoop.fs.Path; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.stream.Stream; + +import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.airlift.concurrent.MoreFutures.getFutureValue; +import static io.airlift.testing.Assertions.assertGreaterThan; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveCompressionCodec.NONE; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveSelectiveFactories; +import static io.prestosql.plugin.hive.HiveType.HIVE_DATE; +import static 
io.prestosql.plugin.hive.HiveType.HIVE_DOUBLE; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_NEW_DIRECTORY; +import static io.prestosql.plugin.hive.metastore.file.FileHiveMetastore.createTestingFileHiveMetastore; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.testing.assertions.Assert.assertEquals; +import static java.lang.Math.round; +import static java.lang.String.format; +import static java.nio.file.Files.createTempDirectory; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.mockito.Matchers.anyInt; +import static org.mockito.Matchers.anyObject; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertTrue; + +public class TestHivePageSink +{ + private static final int NUM_ROWS = 1000; + private static final String CLIENT_ID = "client_id"; + private static final String SCHEMA_NAME = "test"; + private static final String TABLE_NAME = "test"; + + @Test + public void testAllFormats() + throws Exception + { + HiveConfig config = new HiveConfig(); + File tempDir = createTempDirectory(getClass().getName()).toFile(); + try { + HiveMetastore metastore = createTestingFileHiveMetastore(new File(tempDir, "metastore")); + for (HiveStorageFormat format : HiveStorageFormat.values()) { + if (format == HiveStorageFormat.CSV) { + // CSV supports only unbounded VARCHAR type, which is not provided by lineitem + continue; + } + config.setHiveStorageFormat(format); + config.setHiveCompressionCodec(NONE); + long uncompressedLength = writeTestFile(config, metastore, makeFileName(tempDir, config)); + assertGreaterThan(uncompressedLength, 0L); + + for (HiveCompressionCodec codec : HiveCompressionCodec.values()) { + if (codec == NONE) { + continue; + } + config.setHiveCompressionCodec(codec); + long length = writeTestFile(config, metastore, makeFileName(tempDir, config)); + assertTrue(uncompressedLength > length, format("%s with %s compressed to %s which is not less than %s", format, codec, length, uncompressedLength)); + } + } + } + finally { + deleteRecursively(tempDir.toPath(), ALLOW_INSECURE); + } + } + + private static String makeFileName(File tempDir, HiveConfig config) + { + return tempDir.getAbsolutePath() + "/" + config.getHiveStorageFormat().name() + "." 
+ config.getHiveCompressionCodec().name(); + } + + private static long writeTestFile(HiveConfig config, HiveMetastore metastore, String outputPath) + { + HiveTransactionHandle transaction = new HiveTransactionHandle(); + HiveWriterStats stats = new HiveWriterStats(); + ConnectorPageSink pageSink = createPageSink(transaction, config, metastore, new Path("file:///" + outputPath), stats); + List columns = getTestColumns(); + List columnTypes = columns.stream() + .map(LineItemColumn::getType) + .map(TestHivePageSink::getHiveType) + .map(hiveType -> hiveType.getType(HiveTestUtils.TYPE_MANAGER)) + .collect(toList()); + + PageBuilder pageBuilder = new PageBuilder(columnTypes); + int rows = 0; + for (LineItem lineItem : new LineItemGenerator(0.01, 1, 1)) { + rows++; + if (rows >= NUM_ROWS) { + break; + } + pageBuilder.declarePosition(); + for (int i = 0; i < columns.size(); i++) { + LineItemColumn column = columns.get(i); + BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(i); + switch (column.getType().getBase()) { + case IDENTIFIER: + BIGINT.writeLong(blockBuilder, column.getIdentifier(lineItem)); + break; + case INTEGER: + INTEGER.writeLong(blockBuilder, column.getInteger(lineItem)); + break; + case DATE: + DATE.writeLong(blockBuilder, column.getDate(lineItem)); + break; + case DOUBLE: + DOUBLE.writeDouble(blockBuilder, column.getDouble(lineItem)); + break; + case VARCHAR: + createUnboundedVarcharType().writeSlice(blockBuilder, Slices.utf8Slice(column.getString(lineItem))); + break; + default: + throw new IllegalArgumentException("Unsupported type " + column.getType()); + } + } + } + Page page = pageBuilder.build(); + pageSink.appendPage(page); + getFutureValue(pageSink.finish()); + + File outputDir = new File(outputPath); + List files = ImmutableList.copyOf(outputDir.listFiles((dir, name) -> !name.endsWith(".crc"))); + File outputFile = getOnlyElement(files); + long length = outputFile.length(); + + ConnectorPageSource pageSource = createPageSource(transaction, config, outputFile); + + List pages = new ArrayList<>(); + while (!pageSource.isFinished()) { + Page nextPage = pageSource.getNextPage(); + if (nextPage != null) { + pages.add(nextPage.getLoadedPage()); + } + } + MaterializedResult expectedResults = toMaterializedResult(getSession(config), columnTypes, ImmutableList.of(page)); + MaterializedResult results = toMaterializedResult(getSession(config), columnTypes, pages); + assertEquals(results, expectedResults); + assertEquals(round(stats.getInputPageSizeInBytes().getAllTime().getMax()), page.getRetainedSizeInBytes()); + return length; + } + + public static MaterializedResult toMaterializedResult(ConnectorSession session, List types, List pages) + { + // materialize pages + MaterializedResult.Builder resultBuilder = MaterializedResult.resultBuilder(session, types); + for (Page outputPage : pages) { + resultBuilder.page(outputPage); + } + return resultBuilder.build(); + } + + private static ConnectorPageSource createPageSource(HiveTransactionHandle transaction, HiveConfig config, File outputFile) + { + Properties splitProperties = new Properties(); + splitProperties.setProperty(FILE_INPUT_FORMAT, config.getHiveStorageFormat().getInputFormat()); + splitProperties.setProperty(SERIALIZATION_LIB, config.getHiveStorageFormat().getSerDe()); + splitProperties.setProperty("columns", Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getName).collect(toList()))); + splitProperties.setProperty("columns.types", 
Joiner.on(',').join(getColumnHandles().stream().map(HiveColumnHandle::getHiveType).map(hiveType -> hiveType.getHiveTypeName().toString()).collect(toList()))); + HiveSplitWrapper split = HiveSplitWrapper.wrap(new HiveSplit( + SCHEMA_NAME, + TABLE_NAME, + "", + "file:///" + outputFile.getAbsolutePath(), + 0, + outputFile.length(), + outputFile.length(), + 0, + splitProperties, + ImmutableList.of(), + ImmutableList.of(), + OptionalInt.empty(), + false, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + Optional.empty(), + false, + ImmutableMap.of())); + ConnectorTableHandle table = new HiveTableHandle(SCHEMA_NAME, TABLE_NAME, ImmutableMap.of(), ImmutableList.of(), Optional.empty()); + HivePageSourceProvider provider = new HivePageSourceProvider(config, HiveTestUtils.createTestHdfsEnvironment(config), HiveTestUtils.getDefaultHiveRecordCursorProvider(config), HiveTestUtils.getDefaultHiveDataStreamFactories(config), HiveTestUtils.TYPE_MANAGER, HiveTestUtils.getNoOpIndexCache(), getDefaultHiveSelectiveFactories(config)); + return provider.createPageSource(transaction, getSession(config), split, table, ImmutableList.copyOf(getColumnHandles())); + } + + private static ConnectorPageSink createPageSink(HiveTransactionHandle transaction, HiveConfig config, HiveMetastore metastore, Path outputPath, HiveWriterStats stats) + { + ConnectorSession session = getSession(config); + HiveIdentity identity = new HiveIdentity(session); + LocationHandle locationHandle = new LocationHandle(outputPath, outputPath, false, DIRECT_TO_TARGET_NEW_DIRECTORY, Optional.empty()); + HiveOutputTableHandle handle = new HiveOutputTableHandle( + SCHEMA_NAME, + TABLE_NAME, + getColumnHandles(), + new HivePageSinkMetadata(new SchemaTableName(SCHEMA_NAME, TABLE_NAME), metastore.getTable(identity, SCHEMA_NAME, TABLE_NAME), ImmutableMap.of()), + locationHandle, + config.getHiveStorageFormat(), + config.getHiveStorageFormat(), + ImmutableList.of(), + Optional.empty(), + "test", + ImmutableMap.of()); + JsonCodec partitionUpdateCodec = JsonCodec.jsonCodec(PartitionUpdate.class); + HdfsEnvironment hdfsEnvironment = HiveTestUtils.createTestHdfsEnvironment(config); + HivePageSinkProvider provider = new HivePageSinkProvider( + HiveTestUtils.getDefaultHiveFileWriterFactories(config), + hdfsEnvironment, + HiveTestUtils.PAGE_SORTER, + metastore, + new GroupByHashPageIndexerFactory(new JoinCompiler(createTestMetadataManager())), + HiveTestUtils.TYPE_MANAGER, + config, + new HiveLocationService(hdfsEnvironment), + partitionUpdateCodec, + new TestingNodeManager("fake-environment"), + new HiveEventClient(), + new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig()), + stats, + HiveTestUtils.getDefaultOrcFileWriterFactory(config)); + return provider.createPageSink(transaction, getSession(config), handle); + } + + private static TestingConnectorSession getSession(HiveConfig config) + { + return new TestingConnectorSession(new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + } + + public static List getColumnHandles() + { + ImmutableList.Builder handles = ImmutableList.builder(); + List columns = getTestColumns(); + for (int i = 0; i < columns.size(); i++) { + LineItemColumn column = columns.get(i); + HiveType hiveType = getHiveType(column.getType()); + handles.add(new HiveColumnHandle(column.getColumnName(), hiveType, hiveType.getTypeSignature(), i, REGULAR, Optional.empty())); + } + return handles.build(); + } + + private static 
List getTestColumns() + { + return Stream.of(LineItemColumn.values()) + // Not all the formats support DATE + .filter(column -> !column.getType().equals(TpchColumnTypes.DATE)) + .collect(toList()); + } + + private static HiveType getHiveType(TpchColumnType type) + { + switch (type.getBase()) { + case IDENTIFIER: + return HIVE_LONG; + case INTEGER: + return HIVE_INT; + case DATE: + return HIVE_DATE; + case DOUBLE: + return HIVE_DOUBLE; + case VARCHAR: + return HIVE_STRING; + default: + throw new UnsupportedOperationException(); + } + } + + // Used to test snapshot. Input pages has 1 row and 1 column. Partition is based on this column. + private HivePageSink prepareHivePageSink() + throws IOException + { + // Mock all relevant dependencies + HiveWriterFactory writerFactory = mock(HiveWriterFactory.class); + HiveColumnHandle hiveColumnHandle = mock(HiveColumnHandle.class); + HdfsEnvironment hdfsEnvironment = mock(HdfsEnvironment.class); + PageIndexerFactory pageIndexerFactory = mock(PageIndexerFactory.class); + PageIndexer pageIndexer = mock(PageIndexer.class); + JsonCodec jsonCodec = mock(JsonCodec.class); + ConnectorSession connectorSession = mock(ConnectorSession.class); + + // Mocked necessary but uninteresting methods + when(connectorSession.isSnapshotEnabled()).thenReturn(true); + when(connectorSession.getTaskId()).thenReturn(OptionalInt.of(1)); + when(pageIndexerFactory.createPageIndexer(anyObject())).thenReturn(pageIndexer); + when(jsonCodec.toJsonBytes(anyObject())).thenReturn(new byte[0]); + when(writerFactory.isTxnTable()).thenReturn(false); + HiveWriter hiveWriter = mock(HiveWriter.class); + when(hiveWriter.getVerificationTask()).thenReturn(Optional.empty()); + when(writerFactory.createWriter(anyObject(), anyObject(), anyObject())).thenReturn(hiveWriter); + when(writerFactory.createWriterForSnapshotMerge(anyObject(), anyObject(), anyObject())).thenReturn(hiveWriter); + when(writerFactory.getPartitionName(anyObject(), anyInt())).thenReturn(Optional.empty()); + when(hiveColumnHandle.isPartitionKey()).thenReturn(true); + + // When hdfsEnvironment.doAs() is called, simply invoke the passed in action + when(hdfsEnvironment.doAs(anyObject(), (GenericExceptionAction) anyObject())).thenAnswer(invocation -> + ((GenericExceptionAction) invocation.getArguments()[1]).run()); + doAnswer(invocation -> { + ((Runnable) invocation.getArguments()[1]).run(); + return null; + }).when(hdfsEnvironment).doAs(anyObject(), (Runnable) anyObject()); + + // The only entry in the page is a integer. We use it to determine partition index. + // That is, page1 with value 0 is in partition 0; page2 with value 1 is in partition 1. + // Some functions' return values depend on the number of partitions. + // Store that as an array entry below, so that other mocked methods can use it. 
+ int[] maxIndex = new int[1]; + when(pageIndexer.indexPage(anyObject())) + .thenAnswer(invocation -> { + maxIndex[0] = (int) ((Page) invocation.getArguments()[0]).getBlock(0).get(0); + return new int[] {maxIndex[0]}; + }); + when(pageIndexer.getMaxIndex()).thenAnswer(invocation -> maxIndex[0]); + doAnswer(invocation -> { + assertEquals(((List) invocation.getArguments()[0]).size(), maxIndex[0] + 1); + return null; + }).when(writerFactory).mergeSubFiles(anyObject()); + + return new HivePageSink( + writerFactory, + Collections.singletonList(hiveColumnHandle), + Optional.empty(), + pageIndexerFactory, + mock(TypeManager.class), + hdfsEnvironment, + 10, + mock(ListeningExecutorService.class), + jsonCodec, + connectorSession, + HiveACIDWriteType.INSERT, + mock(HiveWritableTableHandle.class)); + } + + @Test + public void testSnapshotFinish() + throws IOException + { + HivePageSink hivePageSink = prepareHivePageSink(); + Page page1 = new Page(new IntArrayBlock(1, Optional.empty(), new int[]{0})); + Page page2 = new Page(new IntArrayBlock(1, Optional.empty(), new int[]{1})); + hivePageSink.appendPage(page1); + Object state = hivePageSink.capture(null); + hivePageSink.appendPage(page2); + hivePageSink.capture(null); + hivePageSink.restore(state, null, 2); + hivePageSink.appendPage(page2); + hivePageSink.finish(); + } + + @Test + public void testSnapshotAbort() + throws IOException + { + HivePageSink hivePageSink = prepareHivePageSink(); + + Page page1 = new Page(new IntArrayBlock(1, Optional.empty(), new int[]{0})); + Page page2 = new Page(new IntArrayBlock(1, Optional.empty(), new int[]{1})); + hivePageSink.appendPage(page1); + Object state = hivePageSink.capture(null); + hivePageSink.appendPage(page2); + hivePageSink.capture(null); + hivePageSink.restore(state, null, 2); + hivePageSink.appendPage(page2); + hivePageSink.abort(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSource.java new file mode 100644 index 00000000..134344b0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSource.java @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.LongArrayBlockBuilder; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.dynamicfilter.BloomFilterDynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HivePageSource.filter; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.spi.type.StandardTypes.INTEGER; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static org.testng.Assert.assertEquals; + +public class TestHivePageSource +{ + @DataProvider(name = "data") + public static Object[][] primeNumbers() + { + return new Object[][] { + {0, 10000, 10, "Both columns filter same rows"}, + {0, 10010, 0, "Both columns filter different rows"}, + {0, 10005, 5, "Both column filter partial of same rows"}, + {1024, 0, 0, "Both columns filter all rows"}}; + } + + @Test(dataProvider = "data") + public void testFilterRows(int columnOffset1, int columnOffset2, int expectedPositionCount, String message) + { + final Type[] types = new Type[] {BigintType.BIGINT, BigintType.BIGINT}; + final int numValues = 1024; + BlockBuilder builder = new LongArrayBlockBuilder(null, numValues); + for (int i = 0; i < numValues; i++) { + builder.writeLong(i); + } + Block dayBlock = builder.build(); + builder = new LongArrayBlockBuilder(null, numValues); + for (int i = 0; i < numValues; i++) { + builder.writeLong(10000 + i); + } + Block appBlock = builder.build(); + + Page page = new Page(dayBlock, appBlock); + + Map<ColumnHandle, DynamicFilter> dynamicFilter = new HashMap<>(); + ColumnHandle dayColumn = new HiveColumnHandle("pt_d", HIVE_INT, parseTypeSignature(INTEGER), 0, REGULAR, Optional.empty()); + ColumnHandle appColumn = new HiveColumnHandle("app_d", HIVE_INT, parseTypeSignature(INTEGER), 1, REGULAR, Optional.empty()); + + BloomFilter dayFilter = new BloomFilter(1024 * 1024, 0.01); + BloomFilter appFilter = new BloomFilter(1024 * 1024, 0.01); + + for (int i = 0; i < 10; i++) { + dayFilter.add(columnOffset1 + i); + appFilter.add(columnOffset2 + i); + } + dynamicFilter.put(dayColumn, new BloomFilterDynamicFilter("1", dayColumn, dayFilter, DynamicFilter.Type.GLOBAL)); + dynamicFilter.put(appColumn, new BloomFilterDynamicFilter("2", appColumn, appFilter, DynamicFilter.Type.GLOBAL)); + + List<Map<ColumnHandle, DynamicFilter>> dynamicFilters = new ArrayList<>(); + dynamicFilters.add(dynamicFilter); + + List<Map<Integer, ColumnHandle>> eligibleColumns = ImmutableList.of(ImmutableMap.of(0, dayColumn, 1, appColumn)); + + Page filteredPage = filter(dynamicFilters, page, eligibleColumns, types); + + assertEquals(filteredPage.getPositionCount(), expectedPositionCount, message); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSourceProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSourceProvider.java new
file mode 100644 index 00000000..fdc63354 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHivePageSourceProvider.java @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.connector.QualifiedObjectName; +import io.prestosql.spi.function.BuiltInFunctionHandle; +import io.prestosql.spi.function.FunctionKind; +import io.prestosql.spi.function.Signature; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.ValueSet; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.VariableReferenceExpression; +import io.prestosql.spi.type.TypeSignature; +import org.eclipse.jetty.util.URIUtil; +import org.testng.annotations.Test; + +import java.net.URI; +import java.util.Collection; +import java.util.HashSet; +import java.util.Optional; + +import static io.prestosql.plugin.hive.HivePageSourceProvider.modifyDomain; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static org.testng.Assert.assertEquals; + +public class TestHivePageSourceProvider +{ + @Test + public void testEncodePath() + { + URI splitUri0 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405#12345#abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405%2312345%23abcdefgh", splitUri0.getRawPath()); + URI splitUri1 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405%12345%abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405%2512345%25abcdefgh", splitUri1.getRawPath()); + URI splitUri2 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405?12345?abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405%3F12345%3Fabcdefgh", splitUri2.getRawPath()); + URI splitUri3 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405 12345 abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405%2012345%20abcdefgh", splitUri3.getRawPath()); + URI splitUri4 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405^12345^abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405%5E12345%5Eabcdefgh", splitUri4.getRawPath()); + URI splitUri5 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405>12345>abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405%3E12345%3Eabcdefgh", splitUri5.getRawPath()); + URI splitUri6 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405+12345+abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405+12345+abcdefgh", splitUri6.getRawPath()); + URI 
splitUri7 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405-12345-abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405-12345-abcdefgh", splitUri7.getRawPath()); + URI splitUri8 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405*12345*abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405*12345*abcdefgh", splitUri8.getRawPath()); + URI splitUri9 = URI.create(URIUtil.encodePath("hdfs://localhost:9000/user/hive/warehouse/part=part_1/20200405<12345<abcdefgh")); + assertEquals("/user/hive/warehouse/part=part_1/20200405%3C12345%3Cabcdefgh", splitUri9.getRawPath()); + } + + @Test + public void testModifyDomainGreaterThanOrEqual() + { + Collection valueSet = new HashSet<>(); + valueSet.add(Long.valueOf(40)); + VariableReferenceExpression argument1 = new VariableReferenceExpression("arg_1", BIGINT); + VariableReferenceExpression argument2 = new VariableReferenceExpression("arg_2", BIGINT); + QualifiedObjectName objectName = new QualifiedObjectName("presto", "default", "$operator$greater_than_or_equal"); + + BuiltInFunctionHandle functionHandle = new BuiltInFunctionHandle(new Signature(objectName, FunctionKind.SCALAR, ImmutableList.of(), ImmutableList.of(), new TypeSignature("boolean"), ImmutableList.of(new TypeSignature("bigint"), new TypeSignature("bigint")), false)); + CallExpression filter = new CallExpression("GREATER_THAN_OR_EQUAL", functionHandle, BOOLEAN, ImmutableList.of(argument1, argument2)); + Domain domain = Domain.create(ValueSet.copyOf(BIGINT, valueSet), false); + domain = modifyDomain(domain, Optional.of(filter)); + assertEquals(domain.getValues().getRanges().getSpan().getHigh().getValueBlock(), Optional.empty()); + assertEquals(domain.getValues().getRanges().getSpan().getLow().getValue(), Long.valueOf(40)); + } + + @Test + public void testModifyDomainGreaterThan() + { + Collection valueSet = new HashSet<>(); + valueSet.add(Long.valueOf(40)); + VariableReferenceExpression argument1 = new VariableReferenceExpression("arg_1", BIGINT); + VariableReferenceExpression argument2 = new VariableReferenceExpression("arg_2", BIGINT); + QualifiedObjectName objectName = new QualifiedObjectName("presto", "default", "$operator$greater_than"); + + BuiltInFunctionHandle functionHandle = new BuiltInFunctionHandle(new Signature(objectName, FunctionKind.SCALAR, ImmutableList.of(), ImmutableList.of(), new TypeSignature("boolean"), ImmutableList.of(new TypeSignature("bigint"), new TypeSignature("bigint")), false)); + CallExpression filter = new CallExpression("GREATER_THAN", functionHandle, BOOLEAN, ImmutableList.of(argument1, argument2)); + Domain domain = Domain.create(ValueSet.copyOf(BIGINT, valueSet), false); + domain = modifyDomain(domain, Optional.of(filter)); + assertEquals(domain.getValues().getRanges().getSpan().getHigh().getValueBlock(), Optional.empty()); + assertEquals(domain.getValues().getRanges().getSpan().getLow().getValue(), Long.valueOf(40)); + } + + @Test + public void testModifyDomainLessThanOrEqual() + { + Collection valueSet = new HashSet<>(); + valueSet.add(Long.valueOf(40)); + VariableReferenceExpression argument1 = new VariableReferenceExpression("arg_1", BIGINT); + VariableReferenceExpression argument2 = new VariableReferenceExpression("arg_2", BIGINT); + QualifiedObjectName objectName = new QualifiedObjectName("presto", "default", "$operator$less_than_or_equal"); + + BuiltInFunctionHandle functionHandle = new BuiltInFunctionHandle(new Signature(objectName, FunctionKind.SCALAR, ImmutableList.of(), ImmutableList.of(), new TypeSignature("boolean"), ImmutableList.of(new TypeSignature("bigint"), new TypeSignature("bigint")), false)); + CallExpression
filter = new CallExpression("LESS_THAN", functionHandle, BOOLEAN, ImmutableList.of(argument1, argument2)); + Domain domain = Domain.create(ValueSet.copyOf(BIGINT, valueSet), false); + domain = modifyDomain(domain, Optional.of(filter)); + assertEquals(domain.getValues().getRanges().getSpan().getHigh().getValue(), Long.valueOf(40)); + assertEquals(domain.getValues().getRanges().getSpan().getLow().getValueBlock(), Optional.empty()); + } + + @Test + public void testModifyDomainLessThan() + { + Collection valueSet = new HashSet<>(); + valueSet.add(Long.valueOf(40)); + VariableReferenceExpression argument1 = new VariableReferenceExpression("arg_1", BIGINT); + VariableReferenceExpression argument2 = new VariableReferenceExpression("arg_2", BIGINT); + QualifiedObjectName objectName = new QualifiedObjectName("presto", "default", "$operator$less_than"); + + BuiltInFunctionHandle functionHandle = new BuiltInFunctionHandle(new Signature(objectName, FunctionKind.SCALAR, ImmutableList.of(), ImmutableList.of(), new TypeSignature("boolean"), ImmutableList.of(new TypeSignature("bigint"), new TypeSignature("bigint")), false)); + CallExpression filter = new CallExpression("LESS_THAN_OR_EQUAL", functionHandle, BOOLEAN, ImmutableList.of(argument1, argument2)); + Domain domain = Domain.create(ValueSet.copyOf(BIGINT, valueSet), false); + domain = modifyDomain(domain, Optional.of(filter)); + assertEquals(domain.getValues().getRanges().getSpan().getHigh().getValue(), Long.valueOf(40)); + assertEquals(domain.getValues().getRanges().getSpan().getLow().getValueBlock(), Optional.empty()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveRoles.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveRoles.java new file mode 100644 index 00000000..523adda9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveRoles.java @@ -0,0 +1,497 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.prestosql.Session; +import io.prestosql.spi.security.Identity; +import io.prestosql.spi.security.SelectedRole; +import io.prestosql.spi.type.Type; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.tests.AbstractTestQueryFramework; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.tests.QueryAssertions.assertContains; +import static io.prestosql.tests.QueryAssertions.assertEqualsIgnoreOrder; +import static org.testng.Assert.assertEquals; + +@Test(singleThreaded = true) +public class TestHiveRoles + extends AbstractTestQueryFramework +{ + protected TestHiveRoles() + { + super(HiveQueryRunner::createQueryRunner); + } + + @AfterMethod(alwaysRun = true) + public void afterMethod() + { + for (String role : listRoles()) { + executeFromAdmin("DROP ROLE " + role); + } + } + + @Test + public void testCreateRole() + { + executeFromAdmin("CREATE ROLE role1"); + assertEquals(listRoles(), ImmutableSet.of("role1")); + assertEquals(listRoles(), ImmutableSet.of("role1")); + } + + @Test + public void testCreateDuplicateRole() + { + executeFromAdmin("CREATE ROLE duplicate_role"); + assertQueryFails(createAdminSession(), "CREATE ROLE duplicate_role", ".*?Role 'duplicate_role' already exists"); + } + + @Test + public void testCreateRoleWithAdminOption() + { + assertQueryFails(createAdminSession(), "CREATE ROLE role1 WITH ADMIN admin", ".*?Hive Connector does not support WITH ADMIN statement"); + } + + @Test + public void testCreateReservedRole() + { + assertQueryFails(createAdminSession(), "CREATE ROLE all", "Role name cannot be one of the reserved roles \\(case insensitive\\): \\[all, default, none\\]"); + assertQueryFails(createAdminSession(), "CREATE ROLE default", "Role name cannot be one of the reserved roles \\(case insensitive\\): \\[all, default, none\\]"); + assertQueryFails(createAdminSession(), "CREATE ROLE none", "Role name cannot be one of the reserved roles \\(case insensitive\\): \\[all, default, none\\]"); + assertQueryFails(createAdminSession(), "CREATE ROLE None", "Role name cannot be one of the reserved roles \\(case insensitive\\): \\[all, default, none\\]"); + } + + @Test + public void testCreateRoleByNonAdminUser() + { + assertQueryFails(createUserSession("non_admin_user"), "CREATE ROLE role1", "Access Denied: Cannot create role role1"); + } + + @Test + public void testDropRole() + { + executeFromAdmin("CREATE ROLE role1"); + assertEquals(listRoles(), ImmutableSet.of("role1")); + executeFromAdmin("DROP ROLE role1"); + assertEquals(listRoles(), ImmutableSet.of()); + } + + @Test + public void testDropNonExistentRole() + { + assertQueryFails(createAdminSession(), "DROP ROLE non_existent_role", ".*?Role 'non_existent_role' does not exist"); + } + + @Test + public void testDropRoleByNonAdminUser() + { + assertQueryFails(createUserSession("non_admin_user"), "DROP ROLE role1", "Access Denied: Cannot drop role role1"); + } + + @Test + public void testListRolesByNonAdminUser() + { + 
assertQueryFails(createUserSession("non_admin_user"), "SELECT * FROM hive.information_schema.roles", "Access Denied: Cannot select from table information_schema.roles"); + } + + @Test + public void testPublicRoleIsGrantedToAnyone() + { + assertContains(listApplicableRoles("some_user"), applicableRoles("some_user", "USER", "public", "NO")); + } + + @Test + public void testAdminRoleIsGrantedToAdmin() + { + assertContains(listApplicableRoles("admin"), applicableRoles("admin", "USER", "admin", "YES")); + } + + @Test + public void testGrantRoleToUser() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("GRANT role1 TO USER user"); + assertContains(listApplicableRoles("user"), applicableRoles("user", "USER", "role1", "NO")); + } + + @Test + public void testGrantRoleToRole() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("GRANT role1 TO USER user"); + executeFromAdmin("GRANT role2 TO ROLE role1"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "NO", + "role1", "ROLE", "role2", "NO")); + } + + @Test + public void testGrantRoleWithAdminOption() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("GRANT role1 TO USER user WITH ADMIN OPTION"); + executeFromAdmin("GRANT role2 TO ROLE role1 WITH ADMIN OPTION"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "YES", + "role1", "ROLE", "role2", "YES")); + } + + @Test + public void testGrantRoleMultipleTimes() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("GRANT role1 TO USER user"); + executeFromAdmin("GRANT role1 TO USER user"); + executeFromAdmin("GRANT role2 TO ROLE role1"); + executeFromAdmin("GRANT role2 TO ROLE role1"); + executeFromAdmin("GRANT role1 TO USER user WITH ADMIN OPTION"); + executeFromAdmin("GRANT role1 TO USER user WITH ADMIN OPTION"); + executeFromAdmin("GRANT role2 TO ROLE role1 WITH ADMIN OPTION"); + executeFromAdmin("GRANT role2 TO ROLE role1 WITH ADMIN OPTION"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "YES", + "role1", "ROLE", "role2", "YES")); + } + + @Test + public void testGrantNonExistingRole() + { + assertQueryFails("GRANT grant_revoke_role_existing_1 TO USER grant_revoke_existing_user_1", ".*?Role 'grant_revoke_role_existing_1' does not exist"); + executeFromAdmin("CREATE ROLE grant_revoke_role_existing_1"); + assertQueryFails("GRANT grant_revoke_role_existing_1 TO ROLE grant_revoke_role_existing_2", ".*?Role 'grant_revoke_role_existing_2' does not exist"); + } + + @Test + public void testRevokeRoleFromUser() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("GRANT role1 TO USER user"); + assertContains(listApplicableRoles("user"), applicableRoles("user", "USER", "role1", "NO")); + + executeFromAdmin("REVOKE role1 FROM USER user"); + assertEqualsIgnoreOrder(listApplicableRoles("user"), applicableRoles("user", "USER", "public", "NO")); + } + + @Test + public void testRevokeRoleFromRole() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("GRANT role1 TO USER user"); + executeFromAdmin("GRANT role2 TO ROLE role1"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "NO", + "role1", "ROLE", "role2", "NO")); + + executeFromAdmin("REVOKE role2 FROM ROLE role1"); + 
assertEqualsIgnoreOrder(listApplicableRoles("user"), applicableRoles( + "user", "USER", "public", "NO", + "user", "USER", "role1", "NO")); + } + + @Test + public void testDropGrantedRole() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("GRANT role1 TO USER user"); + assertContains(listApplicableRoles("user"), applicableRoles("user", "USER", "role1", "NO")); + + executeFromAdmin("DROP ROLE role1"); + assertEqualsIgnoreOrder(listApplicableRoles("user"), applicableRoles("user", "USER", "public", "NO")); + } + + @Test + public void testRevokeTransitiveRoleFromUser() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("CREATE ROLE role3"); + executeFromAdmin("GRANT role1 TO USER user"); + executeFromAdmin("GRANT role2 TO ROLE role1"); + executeFromAdmin("GRANT role3 TO ROLE role2"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "NO", + "role1", "ROLE", "role2", "NO", + "role2", "ROLE", "role3", "NO")); + + executeFromAdmin("REVOKE role1 FROM USER user"); + assertEqualsIgnoreOrder(listApplicableRoles("user"), applicableRoles("user", "USER", "public", "NO")); + } + + @Test + public void testRevokeTransitiveRoleFromRole() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("CREATE ROLE role3"); + executeFromAdmin("GRANT role1 TO USER user"); + executeFromAdmin("GRANT role2 TO ROLE role1"); + executeFromAdmin("GRANT role3 TO ROLE role2"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "NO", + "role1", "ROLE", "role2", "NO", + "role2", "ROLE", "role3", "NO")); + + executeFromAdmin("REVOKE role2 FROM ROLE role1"); + assertEqualsIgnoreOrder(listApplicableRoles("user"), applicableRoles( + "user", "USER", "public", "NO", + "user", "USER", "role1", "NO")); + } + + @Test + public void testDropTransitiveRole() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("CREATE ROLE role3"); + executeFromAdmin("GRANT role1 TO USER user"); + executeFromAdmin("GRANT role2 TO ROLE role1"); + executeFromAdmin("GRANT role3 TO ROLE role2"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "NO", + "role1", "ROLE", "role2", "NO", + "role2", "ROLE", "role3", "NO")); + + executeFromAdmin("DROP ROLE role2"); + assertEqualsIgnoreOrder(listApplicableRoles("user"), applicableRoles( + "user", "USER", "public", "NO", + "user", "USER", "role1", "NO")); + } + + @Test + public void testRevokeAdminOption() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("GRANT role1 TO USER user WITH ADMIN OPTION"); + executeFromAdmin("GRANT role2 TO ROLE role1 WITH ADMIN OPTION"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "YES", + "role1", "ROLE", "role2", "YES")); + + executeFromAdmin("REVOKE ADMIN OPTION FOR role1 FROM USER user"); + executeFromAdmin("REVOKE ADMIN OPTION FOR role2 FROM ROLE role1"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "NO", + "role1", "ROLE", "role2", "NO")); + } + + @Test + public void testRevokeRoleMultipleTimes() + { + executeFromAdmin("CREATE ROLE role1"); + executeFromAdmin("CREATE ROLE role2"); + executeFromAdmin("GRANT role1 TO USER user WITH ADMIN OPTION"); + executeFromAdmin("GRANT role2 TO ROLE role1 WITH ADMIN OPTION"); + assertContains(listApplicableRoles("user"), 
applicableRoles( + "user", "USER", "role1", "YES", + "role1", "ROLE", "role2", "YES")); + + executeFromAdmin("REVOKE ADMIN OPTION FOR role1 FROM USER user"); + executeFromAdmin("REVOKE ADMIN OPTION FOR role1 FROM USER user"); + executeFromAdmin("REVOKE ADMIN OPTION FOR role2 FROM ROLE role1"); + executeFromAdmin("REVOKE ADMIN OPTION FOR role2 FROM ROLE role1"); + assertContains(listApplicableRoles("user"), applicableRoles( + "user", "USER", "role1", "NO", + "role1", "ROLE", "role2", "NO")); + + executeFromAdmin("REVOKE role1 FROM USER user"); + executeFromAdmin("REVOKE role1 FROM USER user"); + executeFromAdmin("REVOKE role2 FROM ROLE role1"); + executeFromAdmin("REVOKE role2 FROM ROLE role1"); + assertEqualsIgnoreOrder(listApplicableRoles("user"), applicableRoles("user", "USER", "public", "NO")); + } + + @Test + public void testRevokeNonExistingRole() + { + assertQueryFails(createAdminSession(), "REVOKE grant_revoke_role_existing_1 FROM USER grant_revoke_existing_user_1", ".*?Role 'grant_revoke_role_existing_1' does not exist"); + executeFromAdmin("CREATE ROLE grant_revoke_role_existing_1"); + assertQueryFails(createAdminSession(), "REVOKE grant_revoke_role_existing_1 FROM ROLE grant_revoke_role_existing_2", ".*?Role 'grant_revoke_role_existing_2' does not exist"); + } + + @Test + public void testSetRole() + { + executeFromAdmin("CREATE ROLE set_role_1"); + executeFromAdmin("CREATE ROLE set_role_2"); + executeFromAdmin("CREATE ROLE set_role_3"); + executeFromAdmin("CREATE ROLE set_role_4"); + executeFromAdmin("GRANT set_role_1 TO USER set_user_1"); + executeFromAdmin("GRANT set_role_2 TO ROLE set_role_1"); + executeFromAdmin("GRANT set_role_3 TO ROLE set_role_2"); + + Session unsetRole = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("set_user_1", Optional.empty())) + .build(); + Session setRoleAll = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("set_user_1", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.ALL, Optional.empty())))) + .build(); + Session setRoleNone = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("set_user_1", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.NONE, Optional.empty())))) + .build(); + Session setRole1 = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("set_user_1", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.ROLE, Optional.of("set_role_1"))))) + .build(); + Session setRole2 = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("set_user_1", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.ROLE, Optional.of("set_role_2"))))) + .build(); + Session setRole3 = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("set_user_1", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.ROLE, Optional.of("set_role_3"))))) + .build(); + Session setRole4 = Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("set_user_1", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.ROLE, Optional.of("set_role_4"))))) + .build(); + + MaterializedResult actual = getQueryRunner().execute(unsetRole, "SELECT * FROM hive.information_schema.applicable_roles"); + MaterializedResult expected = MaterializedResult.resultBuilder(unsetRole, createUnboundedVarcharType(), createUnboundedVarcharType(), 
createUnboundedVarcharType(), createUnboundedVarcharType()) + .row("set_user_1", "USER", "public", "NO") + .row("set_user_1", "USER", "set_role_1", "NO") + .row("set_role_1", "ROLE", "set_role_2", "NO") + .row("set_role_2", "ROLE", "set_role_3", "NO") + .build(); + assertEqualsIgnoreOrder(actual, expected); + + actual = getQueryRunner().execute(unsetRole, "SELECT * FROM hive.information_schema.enabled_roles"); + expected = MaterializedResult.resultBuilder(unsetRole, createUnboundedVarcharType()) + .row("public") + .row("set_role_1") + .row("set_role_2") + .row("set_role_3") + .build(); + assertEqualsIgnoreOrder(actual, expected); + + actual = getQueryRunner().execute(setRoleAll, "SELECT * FROM hive.information_schema.enabled_roles"); + expected = MaterializedResult.resultBuilder(setRoleAll, createUnboundedVarcharType()) + .row("public") + .row("set_role_1") + .row("set_role_2") + .row("set_role_3") + .build(); + assertEqualsIgnoreOrder(actual, expected); + + actual = getQueryRunner().execute(setRoleNone, "SELECT * FROM hive.information_schema.enabled_roles"); + expected = MaterializedResult.resultBuilder(setRoleNone, createUnboundedVarcharType()) + .row("public") + .build(); + assertEqualsIgnoreOrder(actual, expected); + + actual = getQueryRunner().execute(setRole1, "SELECT * FROM hive.information_schema.enabled_roles"); + expected = MaterializedResult.resultBuilder(setRole1, createUnboundedVarcharType()) + .row("public") + .row("set_role_1") + .row("set_role_2") + .row("set_role_3") + .build(); + assertEqualsIgnoreOrder(actual, expected); + + actual = getQueryRunner().execute(setRole2, "SELECT * FROM hive.information_schema.enabled_roles"); + expected = MaterializedResult.resultBuilder(setRole2, createUnboundedVarcharType()) + .row("public") + .row("set_role_2") + .row("set_role_3") + .build(); + assertEqualsIgnoreOrder(actual, expected); + + actual = getQueryRunner().execute(setRole3, "SELECT * FROM hive.information_schema.enabled_roles"); + expected = MaterializedResult.resultBuilder(setRole3, createUnboundedVarcharType()) + .row("public") + .row("set_role_3") + .build(); + assertEqualsIgnoreOrder(actual, expected); + + assertQueryFails(setRole4, "SELECT * FROM hive.information_schema.enabled_roles", ".*?Cannot set role set_role_4"); + + executeFromAdmin("DROP ROLE set_role_1"); + executeFromAdmin("DROP ROLE set_role_2"); + executeFromAdmin("DROP ROLE set_role_3"); + executeFromAdmin("DROP ROLE set_role_4"); + } + + private Set listRoles() + { + return executeFromAdmin("SELECT * FROM hive.information_schema.roles") + .getMaterializedRows() + .stream() + .map(row -> row.getField(0).toString()) + .collect(Collectors.toSet()); + } + + private MaterializedResult listApplicableRoles(String user) + { + return executeFromUser(user, "SELECT * FROM hive.information_schema.applicable_roles"); + } + + private MaterializedResult applicableRoles(String... 
values) + { + List types = ImmutableList.of(createUnboundedVarcharType(), createUnboundedVarcharType(), createUnboundedVarcharType(), createUnboundedVarcharType()); + int rowLength = types.size(); + checkArgument(values.length % rowLength == 0); + MaterializedResult.Builder result = MaterializedResult.resultBuilder(getQueryRunner().getDefaultSession(), types); + Object[] row = null; + for (int i = 0; i < values.length; i++) { + if (i % rowLength == 0) { + if (row != null) { + result.row(row); + } + row = new Object[rowLength]; + } + checkState(row != null); + row[i % rowLength] = values[i]; + } + if (row != null) { + result.row(row); + } + return result.build(); + } + + private MaterializedResult executeFromAdmin(String sql) + { + return getQueryRunner().execute(createAdminSession(), sql); + } + + private MaterializedResult executeFromUser(String user, String sql) + { + return getQueryRunner().execute(createUserSession(user), sql); + } + + private Session createAdminSession() + { + return Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity("admin", Optional.empty(), ImmutableMap.of("hive", new SelectedRole(SelectedRole.Type.ROLE, Optional.of("admin"))))) + .build(); + } + + private Session createUserSession(String user) + { + return Session.builder(getQueryRunner().getDefaultSession()) + .setIdentity(new Identity(user, Optional.empty())) + .build(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplit.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplit.java new file mode 100644 index 00000000..6ae28534 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplit.java @@ -0,0 +1,102 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.json.JsonCodec; +import io.prestosql.plugin.hive.HiveColumnHandle.ColumnType; +import io.prestosql.spi.HostAddress; +import org.apache.hadoop.fs.Path; +import org.testng.annotations.Test; + +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; + +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V1; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static org.testng.Assert.assertEquals; + +public class TestHiveSplit +{ + private final JsonCodec codec = JsonCodec.jsonCodec(HiveSplit.class); + + @Test + public void testJsonRoundTrip() + { + Properties schema = new Properties(); + schema.setProperty("foo", "bar"); + schema.setProperty("bar", "baz"); + + ImmutableList partitionKeys = ImmutableList.of(new HivePartitionKey("a", "apple"), new HivePartitionKey("b", "42")); + ImmutableList addresses = ImmutableList.of(HostAddress.fromParts("127.0.0.1", 44), HostAddress.fromParts("127.0.0.1", 45)); + + DeleteDeltaLocations.Builder deleteDeltaLocationsBuilder = DeleteDeltaLocations.builder(new Path("file:///data/fullacid")); + deleteDeltaLocationsBuilder.addDeleteDelta(new Path("file:///data/fullacid/delete_delta_0000004_0000004_0000"), 4L, 4L, 0); + deleteDeltaLocationsBuilder.addDeleteDelta(new Path("file:///data/fullacid/delete_delta_0000007_0000007_0000"), 7L, 7L, 0); + DeleteDeltaLocations deleteDeltaLocations = deleteDeltaLocationsBuilder.build().get(); + + Map customSplitInfo = ImmutableMap.of("key", "value"); + + HiveSplit expected = new HiveSplit( + "db", + "table", + "partitionId", + "path", + 42, + 87, + 88, + 0, + schema, + partitionKeys, + addresses, + OptionalInt.empty(), + true, + ImmutableMap.of(1, HIVE_STRING), + Optional.of(new HiveSplit.BucketConversion( + BUCKETING_V1, + 32, + 16, + ImmutableList.of(new HiveColumnHandle("col", HIVE_LONG, BIGINT.getTypeSignature(), 5, ColumnType.REGULAR, Optional.of("comment"))))), + false, + Optional.of(deleteDeltaLocations), + Optional.empty(), + false, + customSplitInfo); + + String json = codec.toJson(expected); + HiveSplit actual = codec.fromJson(json); + + assertEquals(actual.getDatabase(), expected.getDatabase()); + assertEquals(actual.getTable(), expected.getTable()); + assertEquals(actual.getPartitionName(), expected.getPartitionName()); + assertEquals(actual.getPath(), expected.getPath()); + assertEquals(actual.getStart(), expected.getStart()); + assertEquals(actual.getLength(), expected.getLength()); + assertEquals(actual.getFileSize(), expected.getFileSize()); + assertEquals(actual.getSchema(), expected.getSchema()); + assertEquals(actual.getPartitionKeys(), expected.getPartitionKeys()); + assertEquals(actual.getAddresses(), expected.getAddresses()); + assertEquals(actual.getColumnCoercions(), expected.getColumnCoercions()); + assertEquals(actual.getBucketConversion(), expected.getBucketConversion()); + assertEquals(actual.isForceLocalScheduling(), expected.isForceLocalScheduling()); + assertEquals(actual.isS3SelectPushdownEnabled(), expected.isS3SelectPushdownEnabled()); + assertEquals(actual.getDeleteDeltaLocations().get(), expected.getDeleteDeltaLocations().get()); + assertEquals(actual.getCustomSplitInfo(), expected.getCustomSplitInfo()); + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplitSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplitSource.java new file mode 100644 index 00000000..1834f3fe --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveSplitSource.java @@ -0,0 +1,853 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.util.concurrent.SettableFuture; +import io.airlift.stats.CounterStat; +import io.airlift.units.DataSize; +import io.prestosql.spi.HostAddress; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorSplit; +import io.prestosql.spi.connector.ConnectorSplitSource; +import io.prestosql.spi.type.TestingTypeManager; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.testing.TestingConnectorSession; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import static io.airlift.concurrent.MoreFutures.getFutureValue; +import static io.airlift.testing.Assertions.assertContains; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.plugin.hive.HiveTestUtils.createTestDynamicFilterSupplier; +import static io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED; +import static java.lang.Math.toIntExact; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +public class TestHiveSplitSource +{ + @Test + public void testOutstandingSplitCount() + { + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, new HiveConfig(), + HiveStorageFormat.ORC); + + // add 10 splits + for (int i = 0; i < 10; i++) { + hiveSplitSource.addToQueue(new TestSplit(i)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1); + } + + // remove 1 split + assertEquals(getSplits(hiveSplitSource, 1).size(), 1); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 9); + + // remove 4 splits + assertEquals(getSplits(hiveSplitSource, 4).size(), 4); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 5); + + // try to remove 20 splits, and verify we only got 5 + assertEquals(getSplits(hiveSplitSource, 20).size(), 5); + 
assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 0); + } + + @Test + public void testFail() + { + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, new HiveConfig(), + HiveStorageFormat.ORC); + + // add some splits + for (int i = 0; i < 5; i++) { + hiveSplitSource.addToQueue(new TestSplit(i)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1); + } + + // remove a split and verify + assertEquals(getSplits(hiveSplitSource, 1).size(), 1); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 4); + + // fail source + hiveSplitSource.fail(new RuntimeException("test")); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 4); + + // try to remove a split and verify we got the expected exception + try { + getSplits(hiveSplitSource, 1); + fail("expected RuntimeException"); + } + catch (RuntimeException e) { + assertEquals(e.getMessage(), "test"); + } + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 4); // 3 splits + poison + + // attempt to add another split and verify it does not work + hiveSplitSource.addToQueue(new TestSplit(99)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 4); // 3 splits + poison + + // fail source again + hiveSplitSource.fail(new RuntimeException("another failure")); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 4); // 3 splits + poison + + // try to remove a split and verify we got the first exception + try { + getSplits(hiveSplitSource, 1); + fail("expected RuntimeException"); + } + catch (RuntimeException e) { + assertEquals(e.getMessage(), "test"); + } + } + + @Test + public void testReaderWaitsForSplits() + throws Exception + { + final HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, new HiveConfig(), + HiveStorageFormat.ORC); + + final SettableFuture splits = SettableFuture.create(); + + // create a thread that will get a split + final CountDownLatch started = new CountDownLatch(1); + Thread getterThread = new Thread(new Runnable() + { + @Override + public void run() + { + try { + started.countDown(); + List batch = getSplits(hiveSplitSource, 1); + assertEquals(batch.size(), 1); + splits.set(batch.get(0)); + } + catch (Throwable e) { + splits.setException(e); + } + } + }); + getterThread.start(); + + try { + // wait for the thread to be started + assertTrue(started.await(1, TimeUnit.SECONDS)); + + // sleep for a bit, and assure the thread is blocked + TimeUnit.MILLISECONDS.sleep(200); + assertTrue(!splits.isDone()); + + // add a split + hiveSplitSource.addToQueue(new TestSplit(33)); + + // wait for thread to get the split + ConnectorSplit split = splits.get(800, TimeUnit.MILLISECONDS); + assertEquals(HiveSplitWrapper.getOnlyHiveSplit(split).getSchema().getProperty("id"), "33"); + } + finally { + // make sure the thread exits + getterThread.interrupt(); + } + } + + @Test + public void testOutstandingSplitSize() + { + DataSize maxOutstandingSplitsSize = new DataSize(1, MEGABYTE); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10000, + 
maxOutstandingSplitsSize, + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, new HiveConfig(), + HiveStorageFormat.ORC); + int testSplitSizeInBytes = new TestSplit(0).getEstimatedSizeInBytes(); + + int maxSplitCount = toIntExact(maxOutstandingSplitsSize.toBytes()) / testSplitSizeInBytes; + for (int i = 0; i < maxSplitCount; i++) { + hiveSplitSource.addToQueue(new TestSplit(i)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1); + } + + assertEquals(getSplits(hiveSplitSource, maxSplitCount).size(), maxSplitCount); + + for (int i = 0; i < maxSplitCount; i++) { + hiveSplitSource.addToQueue(new TestSplit(i)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1); + } + try { + hiveSplitSource.addToQueue(new TestSplit(0)); + fail("expect failure"); + } + catch (PrestoException e) { + assertContains(e.getMessage(), "Split buffering for database.table exceeded memory limit"); + } + } + + @Test + public void testEmptyBucket() + { + final HiveSplitSource hiveSplitSource = HiveSplitSource.bucketed( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, new HiveConfig(), + HiveStorageFormat.ORC); + hiveSplitSource.addToQueue(new TestSplit(0, OptionalInt.of(2))); + hiveSplitSource.noMoreSplits(); + assertEquals(getSplits(hiveSplitSource, OptionalInt.of(0), 10).size(), 0); + assertEquals(getSplits(hiveSplitSource, OptionalInt.of(1), 10).size(), 0); + assertEquals(getSplits(hiveSplitSource, OptionalInt.of(2), 10).size(), 1); + assertEquals(getSplits(hiveSplitSource, OptionalInt.of(3), 10).size(), 0); + } + + @Test + public void testHiveSplitSourceWithDynamicFilter() + { + TypeManager typeManager = new TestingTypeManager(); + ConnectorSession session = new TestingConnectorSession( + new HiveSessionProperties(new HiveConfig().setDynamicFilterPartitionFilteringEnabled(true), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + session, + "database", + "table", + 10, + 10000, + new DataSize(10, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + createTestDynamicFilterSupplier("pt_d", ImmutableList.of(1L)), + null, + typeManager, + new HiveConfig(), + HiveStorageFormat.ORC); + + for (int i = 0; i < 5; i++) { + hiveSplitSource.addToQueue(new TestPartitionSplit(2 * i, ImmutableList.of(new HivePartitionKey("pt_d", "0")), "pt_d=0")); + hiveSplitSource.addToQueue(new TestPartitionSplit(2 * i + 1, ImmutableList.of(new HivePartitionKey("pt_d", "1")), "pt_d=1")); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), 2 * i + 2); + } + + assertEquals(getSplits(hiveSplitSource, 10).size(), 5); + } + + private static List getSplits(ConnectorSplitSource source, int maxSize) + { + return getSplits(source, OptionalInt.empty(), maxSize); + } + + private static List getSplits(ConnectorSplitSource source, OptionalInt bucketNumber, int maxSize) + { + if (bucketNumber.isPresent()) { + return getFutureValue(source.getNextBatch(new HivePartitionHandle(bucketNumber.getAsInt()), maxSize)).getSplits(); + } + else { + return getFutureValue(source.getNextBatch(NOT_PARTITIONED, maxSize)).getSplits(); + } + } + + private static class TestingHiveSplitLoader + implements 
HiveSplitLoader + { + @Override + public void start(HiveSplitSource splitSource) + { + } + + @Override + public void stop() + { + } + } + + private static class TestSplit + extends InternalHiveSplit + { + private TestSplit(int id, List hostAddress) + { + this(id, OptionalInt.empty(), 100, hostAddress); + } + + private TestSplit(int id) + { + this(id, OptionalInt.empty(), 100, ImmutableList.of()); + } + + private TestSplit(int id, OptionalInt bucketNumber) + { + this(id, bucketNumber, 100, ImmutableList.of()); + } + + private TestSplit(int id, OptionalInt bucketNumber, long fileSize, List hostAddress) + { + super( + "partition-name", + "path", + 0, + 100, + fileSize, + 0, + properties("id", String.valueOf(id)), + ImmutableList.of(), + ImmutableList.of(new InternalHiveBlock(0, 100, hostAddress)), + bucketNumber, + true, + false, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + Optional.empty(), + ImmutableMap.of()); + } + + private static Properties properties(String key, String value) + { + Properties properties = new Properties(); + properties.put(key, value); + return properties; + } + } + + private static class TestPartitionSplit + extends InternalHiveSplit + { + private TestPartitionSplit(int id, List partitionKeys, String partitionName) + { + this(id, partitionKeys, partitionName, OptionalInt.empty()); + } + + private TestPartitionSplit(int id, List partitionKeys, String partitionName, OptionalInt bucketNumber) + { + super( + partitionName, + "path", + 0, + 100, + 100, + 0, + properties("id", String.valueOf(id)), + partitionKeys, + ImmutableList.of(new InternalHiveBlock(0, 100, ImmutableList.of())), + bucketNumber, + true, + false, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + Optional.empty(), + ImmutableMap.of()); + } + + private static Properties properties(String key, String value) + { + Properties properties = new Properties(); + properties.put(key, value); + return properties; + } + } + + @Test + public void testGroupSmallSplit() + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(10); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm1", 1)); + hostAddress.add(new HostAddress("vm3", 1)); + hostAddress.add(new HostAddress("vm2", 1)); + + for (int i = 0; i < 12; i++) { + hiveSplitSource.addToQueue(new TestSplit(i, hostAddress)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1); + } + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 3); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + assertEquals(hiveSplitWrappers.get(0).getSplits().size(), 4); + assertEquals(hiveSplitWrappers.get(1).getSplits().size(), 4); + assertEquals(hiveSplitWrappers.get(2).getSplits().size(), 4); + } + + @Test + public void testGroupSmallSplitReplicationFactor1() + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(10); + // ReplicationFactor 1 & all splits have same location + HiveSplitSource 
hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm1", 1)); + + for (int i = 0; i < 30; i++) { + hiveSplitSource.addToQueue(new TestSplit(i, hostAddress)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1); + } + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 3); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + assertEquals(hiveSplitWrappers.get(0).getSplits().size(), 10); + assertEquals(hiveSplitWrappers.get(1).getSplits().size(), 10); + assertEquals(hiveSplitWrappers.get(2).getSplits().size(), 10); + } + + @Test + public void testGroupSmallSplitReplicationFactor2() + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(10); + // ReplicationFactor 2 & Number of nodes 3 , split should be distributed equally among 3 nodes + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + for (int i = 0; i < 24; i++) { + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm" + (i % 3), 1)); + hostAddress.add(new HostAddress("vm" + ((i + 1) % 3), 1)); + hiveSplitSource.addToQueue(new TestSplit(i, hostAddress)); + assertEquals(hiveSplitSource.getBufferedInternalSplitCount(), i + 1); + } + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 6); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + for (int i = 0; i < 6; i++) { + assertEquals(hiveSplitWrappers.get(i).getSplits().size(), 4); + } + } + + @Test + public void testGroupSmallSplitReplicationFactor2MoreThan10SplitsPerNode() + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(10); + // ReplicationFactor 2 & Number of nodes 3, 10 splits need to form one group + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + for (int i = 0; i < 90; i++) { + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm" + (i % 3), 1)); + hostAddress.add(new HostAddress("vm" + ((i + 1) % 3), 1)); + hiveSplitSource.addToQueue(new TestSplit(i, hostAddress)); + } + + // remove 1 split + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 9); + List hiveSplitWrappers = new ArrayList<>(); 
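+ // 90 buffered splits grouped at most 10 at a time (setMaxSplitsToGroup) yield 9 wrappers of 10 splits each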
+ groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + for (int i = 0; i < 9; i++) { + assertEquals(hiveSplitWrappers.get(i).getSplits().size(), 10); + } + } + + @Test + public void testGroupSmallSplitConfigSetMaxSmallSplitsGrouped() + { + // testing setMaxSmallSplitsGrouped, need to 30 splits + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(30); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + for (int i = 0; i < 90; i++) { + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm1", 1)); + hiveSplitSource.addToQueue(new TestSplit(i, hostAddress)); + } + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 3); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + for (int i = 0; i < 3; i++) { + assertEquals(hiveSplitWrappers.get(i).getSplits().size(), 30); + } + } + + @Test + public void testGroupSmallSplitBucket() + { + // test with 4 different bucket values + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(100); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + for (int i = 0; i < 100; i++) { + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm1", 1)); + hiveSplitSource.addToQueue(new TestSplit(i, OptionalInt.of(i % 4), 100, hostAddress)); + } + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 4); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + for (int i = 0; i < 4; i++) { + assertEquals(hiveSplitWrappers.get(i).getSplits().size(), 25); + } + } + + @Test + public void testGroupSmallSplitAlternativeFileSize() + { + // alternative big and small size total 100 files + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(100); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + for (int i = 0; i < 100; i++) { + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm1", 1)); + hiveSplitSource.addToQueue(new TestSplit(i, OptionalInt.empty(), 67108864 / (((i + 1) % 2) + 1), hostAddress)); + } + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + //assertEquals(groupedConnectorSplits.size(), 51); + List hiveSplitWrappers 
= new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + System.out.println("hiveSplitWrappers.get(i).getSplits().size() " + groupedConnectorSplits.size()); + for (int i = 0; i < 50; i++) { + //System.out.println(hiveSplitWrappers.get(i).getSplits().size()); + assertEquals(hiveSplitWrappers.get(i).getSplits().size(), 1); + } + for (int i = 50; i < groupedConnectorSplits.size(); i++) { + System.out.println(hiveSplitWrappers.get(i).getSplits().size()); + assertEquals(hiveSplitWrappers.get(i).getSplits().size(), 2); + } + } + + @Test + public void testGroupSmallSplitAllBigSizeFiles() + { + // alternative big and small size total 100 files + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(100); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + for (int i = 0; i < 100; i++) { + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm1", 1)); + hiveSplitSource.addToQueue(new TestSplit(i, OptionalInt.empty(), 67108864, hostAddress)); + } + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 100); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + System.out.println("hiveSplitWrappers.get(i).getSplits().size() " + groupedConnectorSplits.size()); + for (int i = 0; i < groupedConnectorSplits.size(); i++) { + //System.out.println(hiveSplitWrappers.get(i).getSplits().size()); + assertEquals(hiveSplitWrappers.get(i).getSplits().size(), 1); + } + } + + @Test + public void testGroupSmallSplitDifferentFileSize() + { + // alternative big and small size total 100 files + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(100); + HiveSplitSource hiveSplitSource = HiveSplitSource.allAtOnce( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + + List hostAddress = new ArrayList<>(); + hostAddress.add(new HostAddress("vm1", 1)); + hiveSplitSource.addToQueue(new TestSplit(1, OptionalInt.empty(), 67108864 / 2, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(2, OptionalInt.empty(), 67108864 / 100, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(3, OptionalInt.empty(), 67108864 / 10, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(4, OptionalInt.empty(), 67108864 / 2, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(5, OptionalInt.empty(), 67108864 / 4, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(6, OptionalInt.empty(), 67108864 / 100, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(7, OptionalInt.empty(), 67108864 / 20, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(8, OptionalInt.empty(), 67108864 / 100, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(9, OptionalInt.empty(), 67108864 / 2, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(10, OptionalInt.empty(), 67108864 
/ 4, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(11, OptionalInt.empty(), 67108864 / 4, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(12, OptionalInt.empty(), 67108864 / 4, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(13, OptionalInt.empty(), 67108864 / 5, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(14, OptionalInt.empty(), 67108864 * 2, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(15, OptionalInt.empty(), 7000, hostAddress)); + hiveSplitSource.addToQueue(new TestSplit(16, OptionalInt.empty(), 20000, hostAddress)); + + List connectorSplits = getSplits(hiveSplitSource, 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + //assertEquals(groupedConnectorSplits.size(), 51); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + assertEquals(groupedConnectorSplits.size(), 6); + } + + @Test + public void testBucketedGroupSmallSplit() + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(10); + final HiveSplitSource hiveSplitSource = HiveSplitSource.bucketed( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + for (int i = 0; i < 10; i++) { + hiveSplitSource.addToQueue(new TestSplit(i, OptionalInt.of(2))); + } + hiveSplitSource.noMoreSplits(); + List connectorSplits = getSplits(hiveSplitSource, OptionalInt.of(2), 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + assertEquals(groupedConnectorSplits.size(), 1); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + assertEquals(hiveSplitWrappers.get(0).getSplits().size(), 10); + } + + @Test + public void testBucketedGroupSmallSplitDifferentBucket() + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setMaxSplitsToGroup(10); + final HiveSplitSource hiveSplitSource = HiveSplitSource.bucketed( + HiveTestUtils.SESSION, + "database", + "table", + 10, + 10, + new DataSize(1, MEGABYTE), + Integer.MAX_VALUE, + new TestingHiveSplitLoader(), + Executors.newFixedThreadPool(5), + new CounterStat(), + null, + null, null, hiveConfig, + HiveStorageFormat.ORC); + for (int i = 0; i < 100; i++) { + hiveSplitSource.addToQueue(new TestSplit(i, OptionalInt.of(i % 4))); + } + hiveSplitSource.noMoreSplits(); + + for (int i = 0; i < 4; i++) { + List connectorSplits = getSplits(hiveSplitSource, OptionalInt.of(i), 100); + List groupedConnectorSplits = hiveSplitSource.groupSmallSplits(connectorSplits, 1); + List hiveSplitWrappers = new ArrayList<>(); + groupedConnectorSplits.forEach(pendingSplit -> hiveSplitWrappers.add((HiveSplitWrapper) pendingSplit)); + assertEquals(hiveSplitWrappers.get(0).getSplits().size(), 10); + assertEquals(hiveSplitWrappers.get(1).getSplits().size(), 10); + assertEquals(hiveSplitWrappers.get(2).getSplits().size(), 5); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTableHandle.java new file mode 100644 index 00000000..3fdfbc04 --- /dev/null +++ 
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTableHandle.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.json.JsonCodec; +import io.airlift.json.JsonCodecFactory; +import io.airlift.json.ObjectMapperProvider; +import io.prestosql.spi.relation.InputReferenceExpression; +import io.prestosql.spi.type.Type; +import io.prestosql.type.TypeDeserializer; +import org.testng.annotations.Test; + +import java.util.Collections; +import java.util.Optional; +import java.util.OptionalLong; + +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static org.testng.Assert.assertEquals; + +public class TestHiveTableHandle +{ + @Test + public void testRoundTrip() + { + ObjectMapperProvider objectMapperProvider = new ObjectMapperProvider(); + objectMapperProvider.setJsonDeserializers(ImmutableMap.of(Type.class, new TypeDeserializer(createTestMetadataManager()))); + JsonCodec codec = new JsonCodecFactory(objectMapperProvider, true).jsonCodec(HiveTableHandle.class); + + HiveTableHandle expected = new HiveTableHandle("schema", "table", ImmutableMap.of(), ImmutableList.of(), Optional.empty()); + HiveOffloadExpression expression = + new HiveOffloadExpression(Collections.emptySet(), new InputReferenceExpression(0, SMALLINT), Optional.empty(), OptionalLong.of(5), Collections.emptyMap()); + expected = expected.withOffloadExpression(expression); + + String json = codec.toJson(expected); + HiveTableHandle actual = codec.fromJson(json); + + assertEquals(actual.getSchemaTableName(), expected.getSchemaTableName()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTypeTranslator.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTypeTranslator.java new file mode 100644 index 00000000..071231fc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveTypeTranslator.java @@ -0,0 +1,103 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.ErrorCode; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.type.Type; +import org.testng.annotations.Test; + +import java.util.HashMap; +import java.util.Map; + +import static io.airlift.testing.Assertions.assertContains; +import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static java.util.Objects.requireNonNull; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.fail; + +public class TestHiveTypeTranslator +{ + private final TypeTranslator typeTranslator; + + private final Map typeTranslationMap; + + public TestHiveTypeTranslator() + { + this(new HiveTypeTranslator(), ImmutableMap.of()); + } + + protected TestHiveTypeTranslator(TypeTranslator typeTranslator, Map overwriteTranslation) + { + this.typeTranslator = requireNonNull(typeTranslator, "typeTranslator is null"); + + ImmutableMap hiveTypeTranslationMap = ImmutableMap.builder() + .put("bigint", HiveType.HIVE_LONG) + .put("integer", HiveType.HIVE_INT) + .put("smallint", HiveType.HIVE_SHORT) + .put("tinyint", HiveType.HIVE_BYTE) + .put("double", HiveType.HIVE_DOUBLE) + .put("varchar(3)", HiveType.valueOf("varchar(3)")) + .put("varchar", HiveType.HIVE_STRING) + .put("date", HiveType.HIVE_DATE) + .put("timestamp", HiveType.HIVE_TIMESTAMP) + .put("decimal(5,3)", HiveType.valueOf("decimal(5,3)")) + .put("varbinary", HiveType.HIVE_BINARY) + .put("array(timestamp)", HiveType.valueOf("array")) + .put("map(boolean,varbinary)", HiveType.valueOf("map")) + .put("row(col0 integer,col1 varbinary)", HiveType.valueOf("struct")) + .build(); + + typeTranslationMap = new HashMap<>(); + typeTranslationMap.putAll(hiveTypeTranslationMap); + typeTranslationMap.putAll(overwriteTranslation); + } + + @Test + public void testTypeTranslator() + { + for (Map.Entry entry : typeTranslationMap.entrySet()) { + assertTypeTranslation(entry.getKey(), entry.getValue()); + } + + assertInvalidTypeTranslation("row(integer,varbinary)", NOT_SUPPORTED.toErrorCode(), "Anonymous row type is not supported in Hive. 
Please give each field a name: row(integer,varbinary)"); + } + + private void assertTypeTranslation(String typeName, HiveType hiveType) + { + Type type = HiveTestUtils.TYPE_MANAGER.getType(parseTypeSignature(typeName)); + assertEquals(HiveType.toHiveType(typeTranslator, type), hiveType); + } + + private void assertInvalidTypeTranslation(String typeName, ErrorCode errorCode, String message) + { + Type type = HiveTestUtils.TYPE_MANAGER.getType(parseTypeSignature(typeName)); + try { + HiveType.toHiveType(typeTranslator, type); + fail("expected exception"); + } + catch (PrestoException e) { + try { + assertEquals(e.getErrorCode(), errorCode); + assertContains(e.getMessage(), message); + } + catch (Throwable failure) { + failure.addSuppressed(e); + throw failure; + } + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveUtil.java new file mode 100644 index 00000000..a0b1f8cc --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveUtil.java @@ -0,0 +1,257 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.dynamicfilter.BloomFilterDynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.dynamicfilter.HashSetDynamicFilter; +import io.prestosql.spi.type.TestingTypeManager; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.util.BloomFilter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer; +import org.apache.hadoop.hive.serde2.thrift.test.IntString; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.testng.annotations.Test; + +import java.util.AbstractList; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; + +import static io.airlift.testing.Assertions.assertInstanceOf; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveStorageFormat.ORC; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static 
io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.HiveUtil.getDeserializer; +import static io.prestosql.plugin.hive.HiveUtil.getPartitionKeyColumnHandles; +import static io.prestosql.plugin.hive.HiveUtil.getRegularColumnHandles; +import static io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered; +import static io.prestosql.plugin.hive.HiveUtil.parseHiveTimestamp; +import static io.prestosql.plugin.hive.HiveUtil.shouldUseRecordReaderFromInputFormat; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static io.prestosql.spi.type.StandardTypes.BIGINT; +import static io.prestosql.spi.type.StandardTypes.VARCHAR; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_OUTPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_SERDE; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_CLASS; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestHiveUtil +{ + private static final Storage STORAGE_1 = new Storage(StorageFormat.fromHiveStorageFormat(ORC), "", Optional.empty(), false, ImmutableMap.of()); + private static final Table TABLE_1 = new Table("schema", + "table", + "user", + "MANAGED_TABLE", + STORAGE_1, + ImmutableList.of(new Column("col_1", HiveType.HIVE_INT, Optional.empty()), new Column("col_2", HiveType.HIVE_INT, Optional.empty()), new Column("col_3", HiveType.HIVE_INT, Optional.empty())), + ImmutableList.of(new Column("part_col_1", HIVE_STRING, Optional.empty())), + ImmutableMap.of(), + Optional.of("original"), + Optional.of("expanded")); + private static final HiveBucketProperty HIVE_BUCKET_PROPERTY = new HiveBucketProperty(ImmutableList.of("col_3"), HiveBucketing.BucketingVersion.BUCKETING_V2, 2, ImmutableList.of()); + private static final Storage STORAGE_2 = new Storage(StorageFormat.fromHiveStorageFormat(ORC), "", Optional.of(HIVE_BUCKET_PROPERTY), false, ImmutableMap.of()); + private static final Table TABLE_2 = new Table("schema", + "table", + "user", + "MANAGED_TABLE", + STORAGE_2, + ImmutableList.of(new Column("col_1", HiveType.HIVE_INT, Optional.empty()), new Column("col_2", HiveType.HIVE_INT, Optional.empty()), new Column("col_3", HiveType.HIVE_INT, Optional.empty())), + ImmutableList.of(new Column("part_col_1", HIVE_STRING, Optional.empty())), + ImmutableMap.of(), + Optional.of("original"), + Optional.of("expanded")); + + @Test + public void testParseHiveTimestamp() + { + DateTime time = new DateTime(2011, 5, 6, 7, 8, 9, 123, DateTimeZone.UTC); + assertEquals(parse(time, "yyyy-MM-dd HH:mm:ss"), unixTime(time, 0)); + assertEquals(parse(time, "yyyy-MM-dd HH:mm:ss.S"), unixTime(time, 1)); + assertEquals(parse(time, "yyyy-MM-dd HH:mm:ss.SSS"), unixTime(time, 3)); + assertEquals(parse(time, "yyyy-MM-dd HH:mm:ss.SSSSSSS"), unixTime(time, 6)); + assertEquals(parse(time, "yyyy-MM-dd HH:mm:ss.SSSSSSSSS"), unixTime(time, 7)); + } + + @Test + public void testGetThriftDeserializer() + { + Properties schema = new Properties(); + schema.setProperty(SERIALIZATION_LIB, ThriftDeserializer.class.getName()); + 
schema.setProperty(SERIALIZATION_CLASS, IntString.class.getName()); + schema.setProperty(SERIALIZATION_FORMAT, TBinaryProtocol.class.getName()); + + assertInstanceOf(getDeserializer(new Configuration(false), schema), ThriftDeserializer.class); + } + + @Test + public void testToPartitionValues() + throws MetaException + { + assertToPartitionValues("ds=2015-12-30/event_type=QueryCompletion"); + assertToPartitionValues("ds=2015-12-30"); + assertToPartitionValues("a=1/b=2/c=3"); + assertToPartitionValues("a=1"); + assertToPartitionValues("pk=!@%23$%25%5E&%2A()%2F%3D"); + assertToPartitionValues("pk=__HIVE_DEFAULT_PARTITION__"); + } + + @Test + public void testShouldUseRecordReaderFromInputFormat() + { + Properties schema = new Properties(); + schema.setProperty(FILE_INPUT_FORMAT, "org.apache.hudi.hadoop.HoodieParquetInputFormat"); + schema.setProperty(META_TABLE_SERDE, "parquet.hive.serde.ParquetHiveSerDe"); + schema.setProperty(FILE_OUTPUT_FORMAT, ""); + assertTrue(shouldUseRecordReaderFromInputFormat(new Configuration(), schema)); + + schema.setProperty(FILE_INPUT_FORMAT, "org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat"); + schema.setProperty(META_TABLE_SERDE, "parquet.hive.serde.ParquetHiveSerDe"); + schema.setProperty(FILE_OUTPUT_FORMAT, ""); + assertTrue(shouldUseRecordReaderFromInputFormat(new Configuration(), schema)); + } + + @Test + public void testIsPartitionFiltered() + { + TypeManager typeManager = new TestingTypeManager(); + assertFalse(isPartitionFiltered(null, null, typeManager), "Should not filter partition if either partitions or dynamicFilters is null"); + + Set dynamicFilters = new HashSet<>(); + List partitions = new ArrayList<>(); + + assertFalse(isPartitionFiltered(partitions, null, typeManager), "Should not filter partition if either partitions or dynamicFilters is null"); + assertFalse(isPartitionFiltered(null, ImmutableList.of(dynamicFilters), typeManager), "Should not filter partition if either partitions or dynamicFilters is null"); + assertFalse(isPartitionFiltered(partitions, ImmutableList.of(dynamicFilters), typeManager), "Should not filter partition if partitions and dynamicFilters are empty"); + + partitions.add(new HivePartitionKey("pt_d", "0")); + partitions.add(new HivePartitionKey("app_id", "10000")); + assertFalse(isPartitionFiltered(partitions, ImmutableList.of(dynamicFilters), typeManager), "Should not filter partition if dynamicFilters is empty"); + + ColumnHandle dayColumn = new HiveColumnHandle("pt_d", HIVE_LONG, parseTypeSignature(BIGINT), 0, PARTITION_KEY, Optional.empty()); + BloomFilter dayFilter = new BloomFilter(1024 * 1024, 0.01); + dynamicFilters.add(new BloomFilterDynamicFilter("1", dayColumn, dayFilter, DynamicFilter.Type.GLOBAL)); + assertTrue(isPartitionFiltered(partitions, ImmutableList.of(dynamicFilters), typeManager), "Should filter partition if any dynamicFilter has 0 element count"); + + dayFilter.add(1L); + assertTrue(isPartitionFiltered(partitions, ImmutableList.of(dynamicFilters), typeManager), "Should filter partition if partition value not in dynamicFilter"); + + dayFilter.add(0L); + assertFalse(isPartitionFiltered(partitions, ImmutableList.of(dynamicFilters), typeManager), "Should not filter partition if partition value is in dynamicFilter"); + + Set dynamicFilters1 = new HashSet<>(); + BloomFilter dayFilter1 = new BloomFilter(1024 * 1024, 0.01); + dynamicFilters1.add(new BloomFilterDynamicFilter("1", dayColumn, dayFilter1, DynamicFilter.Type.GLOBAL)); + dayFilter1.add(0L); + 
assertFalse(isPartitionFiltered(partitions, ImmutableList.of(dynamicFilters1), typeManager), "Should not filter partition if partition value is in dynamicFilter"); + } + + @Test + public void testIsPartitionFilteredWithNonPartitionFilter() + { + TypeManager typeManager = new TestingTypeManager(); + Set dynamicFilters = new HashSet<>(); + List partitions = new ArrayList<>(); + + partitions.add(new HivePartitionKey("pt_d", "0")); + partitions.add(new HivePartitionKey("app_id", "10000")); + + ColumnHandle nameColumn = new HiveColumnHandle("name", HIVE_STRING, parseTypeSignature(VARCHAR), 0, REGULAR, Optional.empty()); + Set nameFilter = new HashSet(); + nameFilter.add("Alice"); + dynamicFilters.add(new HashSetDynamicFilter("1", nameColumn, nameFilter, DynamicFilter.Type.GLOBAL)); + assertFalse(isPartitionFiltered(partitions, ImmutableList.of(dynamicFilters), typeManager), "Should not filter partition if dynamicFilter is on non-partition column"); + } + + @Test + public void testGetRegularColumnHandles() + { + List regularColumns = getRegularColumnHandles(TABLE_1); + assertEquals(regularColumns.get(0).isRequired(), false); + assertEquals(regularColumns.get(1).isRequired(), false); + assertEquals(regularColumns.get(2).isRequired(), false); + List bucketedRegularColumns = getRegularColumnHandles(TABLE_2); + assertEquals(bucketedRegularColumns.get(0).isRequired(), false); + assertEquals(bucketedRegularColumns.get(1).isRequired(), false); + assertEquals(bucketedRegularColumns.get(2).isRequired(), true); + } + + @Test + public void testGetPartitionKeyColumnHandles() + { + List partitionColumns = getPartitionKeyColumnHandles(TABLE_1); + assertEquals(partitionColumns.get(0).isRequired(), true); + } + + private static void assertToPartitionValues(String partitionName) + throws MetaException + { + List actual = toPartitionValues(partitionName); + AbstractList expected = new ArrayList<>(); + for (String s : actual) { + expected.add(null); + } + Warehouse.makeValsFromName(partitionName, expected); + assertEquals(actual, expected); + } + + private static long parse(DateTime time, String pattern) + { + return parseHiveTimestamp(DateTimeFormat.forPattern(pattern).print(time)); + } + + private static long unixTime(DateTime time, int factionalDigits) + { + int factor = (int) Math.pow(10, Math.max(0, 3 - factionalDigits)); + return (time.getMillis() / factor) * factor; + } + + static DateTimeZone nonDefaultTimeZone() + { + String defaultId = DateTimeZone.getDefault().getID(); + for (String id : DateTimeZone.getAvailableIDs()) { + if (!id.equals(defaultId)) { + DateTimeZone zone = DateTimeZone.forID(id); + if (zone.getStandardOffset(0) != 0) { + return zone; + } + } + } + throw new IllegalStateException("no non-default timezone"); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveVacuumTableHandle.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveVacuumTableHandle.java new file mode 100644 index 00000000..29175398 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveVacuumTableHandle.java @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.HiveVacuumTableHandle.Range; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +import static com.google.common.collect.Iterables.getOnlyElement; +import static org.testng.Assert.assertEquals; + +public class TestHiveVacuumTableHandle +{ + List ranges = new ArrayList<>(); + + @Test + public void testAddRange() + { + ranges.clear(); + addRange(new Range(22, 22)); + addRange(new Range(22, 23)); + assertEquals(ranges.size(), 1); + addRange(new Range(24, 26)); + addRange(new Range(25, 26)); + assertEquals(ranges.size(), 2); + addRange(new Range(25, 27)); + assertEquals(ranges.size(), 2); + addRange(new Range(21, 27)); + assertEquals(ranges.size(), 1); + } + + @Test + public void testAddRange2() + { + ranges.clear(); + addRange(new Range(2, 2)); + addRange(new Range(3, 11)); + assertEquals(ranges.size(), 2); + addRange(new Range(1, 10)); + assertEquals(ranges.size(), 1); + assertEquals(getOnlyElement(ranges), new Range(1, 11)); + } + + @Test + public void testAddRange3() + { + ranges.clear(); + addRange(new Range(5, 10)); + addRange(new Range(1, 4)); + addRange(new Range(2, 5)); + assertEquals(ranges.size(), 1); + addRange(new Range(1, 7)); + assertEquals(ranges.size(), 1); + assertEquals(getOnlyElement(ranges), new Range(1, 10)); + } + + void addRange(Range range) + { + HiveVacuumTableHandle.addRange(range, ranges); + } + + @Test + public void testDeleteDeltaStatement() + { + assertEquals(BackgroundHiveSplitLoader.getStatementId("delete_delta_000001_000002_0000").getAsInt(), 0); + assertEquals(BackgroundHiveSplitLoader.getStatementId("delete_delta_000001_000002_0001").getAsInt(), 1); + assertEquals(BackgroundHiveSplitLoader.getStatementId("delete_delta_000001_000002").getAsInt(), -1); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveView.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveView.java new file mode 100644 index 00000000..2e6a5e22 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveView.java @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.PrincipalPrivileges; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.file.FileHiveMetastore; +import io.prestosql.spi.connector.ConnectorMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorViewDefinition; +import io.prestosql.spi.connector.SchemaTableName; +import org.apache.hadoop.hive.metastore.TableType; +import org.testng.SkipException; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Optional; + +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.spi.security.PrincipalType.USER; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class TestHiveView + extends AbstractTestHiveLocal +{ + @Override + protected HiveMetastore createMetastore(File tempDir) + { + File baseDir = new File(tempDir, "metastore"); + HiveConfig hiveConfig = new HiveConfig(); + HdfsConfigurationInitializer updator = new HdfsConfigurationInitializer(hiveConfig); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(updator, ImmutableSet.of()); + HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hiveConfig, new NoHdfsAuthentication()); + return new FileHiveMetastore(hdfsEnvironment, baseDir.toURI().toString(), "test"); + } + + @Override + public void testMismatchSchemaTable() + { + // FileHiveMetastore only supports replaceTable() for views + } + + @Override + public void testPartitionSchemaMismatch() + { + // test expects an exception to be thrown + throw new SkipException("FileHiveMetastore only supports replaceTable() for views"); + } + + @Override + public void testBucketedTableEvolution() + { + // FileHiveMetastore only supports replaceTable() for views + } + + @Override + public void testTransactionDeleteInsert() + { + // FileHiveMetastore has various incompatibilities + } + + @Test(enabled = false) + public void tesHiveView() + { + SchemaTableName temporaryCreateView = temporaryTable("hive_view"); + String viewData = "test hive view"; + String expectedvViewData = "{\n" + + " \"originalSql\" : \"test hive view\",\n" + + " \"catalog\" : \"hive\",\n" + + " \"columns\" : [ {\n" + + " \"name\" : \"dummy\",\n" + + " \"type\" : \"varchar\"\n" + + " } ],\n" + + " \"owner\" : \"test\",\n" + + " \"runAsInvoker\" : false\n" + + "}"; + String owner = "test"; + ConnectorSession session = newSession(); + HiveIdentity identity = new HiveIdentity(session); + metastoreClient.createTable(identity, buildHiveView(temporaryCreateView, owner, viewData), buildInitialPrivilegeSet(owner)); + try (Transaction transaction = newTransaction()) { + ConnectorMetadata metadata = transaction.getMetadata(); + Optional views = metadata.getView(newSession(), temporaryCreateView); + 
assertEquals(views.get().getOriginalSql(), expectedvViewData); + + assertTrue(metadata.listViews(newSession(), Optional.of(temporaryCreateView.getSchemaName())).contains(temporaryCreateView)); + } + finally { + metastoreClient.dropTable(identity, temporaryCreateView.getSchemaName(), temporaryCreateView.getTableName(), true); + } + } + + private static Table buildHiveView(SchemaTableName viewName, String owner, String viewData) + { + Column dummyColumn = new Column("dummy", HIVE_STRING, Optional.empty()); + + Table.Builder tableBuilder = Table.builder() + .setDatabaseName(viewName.getSchemaName()) + .setTableName(viewName.getTableName()) + .setOwner(owner) + .setTableType(TableType.VIRTUAL_VIEW.name()) + .setDataColumns(ImmutableList.of(dummyColumn)) + .setPartitionColumns(ImmutableList.of()) + .setParameters(ImmutableMap.of()) + .setViewOriginalText(Optional.of(viewData)) + .setViewExpandedText(Optional.of(viewData)); + + tableBuilder.getStorageBuilder() + .setStorageFormat(StorageFormat.VIEW_STORAGE_FORMAT) + .setLocation(""); + + return tableBuilder.build(); + } + + private static PrincipalPrivileges buildInitialPrivilegeSet(String tableOwner) + { + HivePrincipal owner = new HivePrincipal(USER, tableOwner); + return new PrincipalPrivileges( + ImmutableMultimap.builder() + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.SELECT, true, owner, owner)) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.INSERT, true, owner, owner)) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.UPDATE, true, owner, owner)) + .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.DELETE, true, owner, owner)) + .build(), + ImmutableMultimap.of()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriteUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriteUtils.java new file mode 100644 index 00000000..7d99bcd6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriteUtils.java @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.plugin.hive.HdfsEnvironment.HdfsContext; +import org.apache.hadoop.fs.Path; +import org.testng.annotations.Test; + +import static io.prestosql.plugin.hive.HiveTestUtils.createTestHdfsEnvironment; +import static io.prestosql.plugin.hive.HiveWriteUtils.isS3FileSystem; +import static io.prestosql.plugin.hive.HiveWriteUtils.isViewFileSystem; +import static io.prestosql.testing.TestingConnectorSession.SESSION; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestHiveWriteUtils +{ + private static final HdfsContext CONTEXT = new HdfsContext(SESSION, "test_schema"); + + @Test + public void testIsS3FileSystem() + { + HdfsEnvironment hdfsEnvironment = createTestHdfsEnvironment(new HiveConfig()); + assertTrue(isS3FileSystem(CONTEXT, hdfsEnvironment, new Path("s3://test-bucket/test-folder"))); + assertFalse(isS3FileSystem(CONTEXT, hdfsEnvironment, new Path("/test-dir/test-folder"))); + } + + @Test + public void testIsViewFileSystem() + { + HdfsEnvironment hdfsEnvironment = createTestHdfsEnvironment(new HiveConfig()); + Path viewfsPath = new Path("viewfs://ns-default/test-folder"); + Path nonViewfsPath = new Path("hdfs://localhost/test-dir/test-folder"); + + // ViewFS check requires the mount point config + hdfsEnvironment.getConfiguration(CONTEXT, viewfsPath).set("fs.viewfs.mounttable.ns-default.link./test-folder", "hdfs://localhost/app"); + + assertTrue(isViewFileSystem(CONTEXT, hdfsEnvironment, viewfsPath)); + assertFalse(isViewFileSystem(CONTEXT, hdfsEnvironment, nonViewfsPath)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriterFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriterFactory.java new file mode 100644 index 00000000..e0a74c2f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestHiveWriterFactory.java @@ -0,0 +1,193 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import io.prestosql.PagesIndexPageSorter; +import io.prestosql.metadata.Metadata; +import io.prestosql.operator.PagesIndex; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.CachingHiveMetastore; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.HiveMetastore; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadata; +import io.prestosql.plugin.hive.metastore.HivePageSinkMetadataProvider; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.thrift.BridgingHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.MetastoreLocator; +import io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastoreConfig; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreClient; +import io.prestosql.spi.PageSorter; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.spi.type.TypeSignature; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.testing.TestingNodeManager; +import io.prestosql.type.InternalTypeManager; +import org.testng.annotations.Test; + +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.concurrent.ExecutorService; + +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.units.DataSize.Unit.KILOBYTE; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.HiveStorageFormat.ORC; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultHiveFileWriterFactories; +import static io.prestosql.plugin.hive.HiveTestUtils.getDefaultOrcFileWriterFactory; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static org.apache.hadoop.hive.ql.exec.Utilities.getBucketIdFromFile; +import static org.joda.time.DateTimeZone.UTC; +import static org.testng.Assert.assertEquals; + +public class TestHiveWriterFactory +{ + private ThriftMetastoreClient mockClient; + protected ExecutorService executor; + protected ExecutorService executorRefresh; + protected HiveMetastore metastore; + + private void setUp() + { + mockClient = new MockThriftMetastoreClient(); + executor = newCachedThreadPool(daemonThreadsNamed("hive-%s")); + executorRefresh = newCachedThreadPool(daemonThreadsNamed("hive-refresh-%s")); + MetastoreLocator metastoreLocator = new MockMetastoreLocator(mockClient); + metastore = new CachingHiveMetastore( + new BridgingHiveMetastore(new ThriftHiveMetastore(metastoreLocator, new ThriftHiveMetastoreConfig())), + executor, + executorRefresh, Duration.valueOf("1m"), + Duration.valueOf("15s"), + Duration.valueOf("1m"), + Duration.valueOf("15s"), + 10000, + false); + } + + @Test + public void testComputeBucketedFileName() + { + String name = 
HiveWriterFactory.computeBucketedFileName("20180102_030405_00641_x1y2z", 1234); + assertEquals(name, "001234_0_20180102_030405_00641_x1y2z"); + assertEquals(getBucketIdFromFile(name), 1234); + } + + @Test + public void testSortingPath() + { + setUp(); + String targetPath = "/tmp"; + String writePath = "/tmp/table"; + Optional writeIdInfo = Optional.of(new WriteIdInfo(1, 1, 0)); + StorageFormat storageFormat = StorageFormat.fromHiveStorageFormat(ORC); + Storage storage = new Storage(storageFormat, "", Optional.empty(), false, ImmutableMap.of()); + Table table = new Table("schema", + "table", + "user", + "MANAGED_TABLE", + storage, + ImmutableList.of(new Column("col_1", HiveType.HIVE_INT, Optional.empty())), + ImmutableList.of(), + ImmutableMap.of("transactional", "true"), + Optional.of("original"), + Optional.of("expanded")); + HiveConfig hiveConfig = getHiveConfig(); + HivePageSinkMetadata hivePageSinkMetadata = new HivePageSinkMetadata(new SchemaTableName("schema", "table"), Optional.of(table), ImmutableMap.of()); + PageSorter pageSorter = new PagesIndexPageSorter(new PagesIndex.TestingFactory(false)); + Metadata metadata = createTestMetadataManager(); + TypeManager typeManager = new InternalTypeManager(metadata.getFunctionAndTypeManager()); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hiveConfig), ImmutableSet.of()); + HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hiveConfig, new NoHdfsAuthentication()); + LocationService locationService = new HiveLocationService(hdfsEnvironment); + ConnectorSession session = newSession(); + HiveWriterFactory hiveWriterFactory = new HiveWriterFactory( + getDefaultHiveFileWriterFactories(hiveConfig), + "schema", + "table", + false, + HiveACIDWriteType.DELETE, + ImmutableList.of(new HiveColumnHandle("col_1", HiveType.HIVE_INT, new TypeSignature("integer", ImmutableList.of()), 0, HiveColumnHandle.ColumnType.REGULAR, Optional.empty())), + ORC, + ORC, + ImmutableMap.of(), + OptionalInt.empty(), + ImmutableList.of(), + new LocationHandle(targetPath, writePath, false, LocationHandle.WriteMode.STAGE_AND_MOVE_TO_TARGET_DIRECTORY, writeIdInfo), + locationService, + session.getQueryId(), + new HivePageSinkMetadataProvider(hivePageSinkMetadata, CachingHiveMetastore.memoizeMetastore(metastore, 1000), new HiveIdentity(session)), + typeManager, + hdfsEnvironment, + pageSorter, + hiveConfig.getWriterSortBufferSize(), + hiveConfig.getMaxOpenSortFiles(), + false, + UTC, + session, + new TestingNodeManager("fake-environment"), + new HiveEventClient(), + new HiveSessionProperties(hiveConfig, new OrcFileWriterConfig(), new ParquetFileWriterConfig()), + new HiveWriterStats(), + getDefaultOrcFileWriterFactory(hiveConfig)); + HiveWriter hiveWriter = hiveWriterFactory.createWriter(ImmutableList.of(), OptionalInt.empty(), Optional.empty()); + assertEquals(((SortingFileWriter) hiveWriter.getFileWriter()).getTempFilePrefix().getName(), ".tmp-sort.bucket_00000"); + } + + protected HiveConfig getHiveConfig() + { + return new HiveConfig() + .setMaxOpenSortFiles(10) + .setWriterSortBufferSize(new DataSize(100, KILOBYTE)); + } + + protected ConnectorSession newSession() + { + return newSession(ImmutableMap.of()); + } + + protected ConnectorSession newSession(Map propertyValues) + { + HiveSessionProperties properties = new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()); + return new TestingConnectorSession(properties.getSessionProperties(), propertyValues); 
+ } + + private static class MockMetastoreLocator + implements MetastoreLocator + { + private final ThriftMetastoreClient client; + + private MockMetastoreLocator(ThriftMetastoreClient client) + { + this.client = client; + } + + @Override + public ThriftMetastoreClient createMetastoreClient() + { + return client; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestIonSqlQueryBuilder.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestIonSqlQueryBuilder.java new file mode 100644 index 00000000..7179604f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestIonSqlQueryBuilder.java @@ -0,0 +1,129 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.Range; +import io.prestosql.spi.predicate.SortedRangeSet; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.StandardTypes; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.type.InternalTypeManager; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Optional; + +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveType.HIVE_DATE; +import static io.prestosql.plugin.hive.HiveType.HIVE_DOUBLE; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.HiveType.HIVE_TIMESTAMP; +import static io.prestosql.spi.predicate.TupleDomain.withColumnDomains; +import static io.prestosql.spi.predicate.ValueSet.ofRanges; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.StandardTypes.DECIMAL; +import static io.prestosql.spi.type.StandardTypes.INTEGER; +import static io.prestosql.spi.type.StandardTypes.TIMESTAMP; +import static io.prestosql.spi.type.StandardTypes.VARCHAR; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static io.prestosql.spi.util.DateTimeUtils.parseDate; +import static org.testng.Assert.assertEquals; + +public class TestIonSqlQueryBuilder +{ + private final TypeManager typeManager = new InternalTypeManager(createTestMetadataManager().getFunctionAndTypeManager()); + + @Test + public void testBuildSQL() + { + IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager); + List columns = ImmutableList.of( + new HiveColumnHandle("n_nationkey", HIVE_INT, parseTypeSignature(INTEGER), 0, REGULAR, Optional.empty()), + new 
HiveColumnHandle("n_name", HIVE_STRING, parseTypeSignature(VARCHAR), 1, REGULAR, Optional.empty()),
+                new HiveColumnHandle("n_regionkey", HIVE_INT, parseTypeSignature(INTEGER), 2, REGULAR, Optional.empty()));
+
+        assertEquals("SELECT s._1, s._2, s._3 FROM S3Object s",
+                queryBuilder.buildSql(columns, TupleDomain.all()));
+        TupleDomain<HiveColumnHandle> tupleDomain = withColumnDomains(ImmutableMap.of(
+                columns.get(2), Domain.create(SortedRangeSet.copyOf(BIGINT, ImmutableList.of(Range.equal(BIGINT, 3L))), false)));
+        assertEquals("SELECT s._1, s._2, s._3 FROM S3Object s WHERE (case s._3 when '' then null else CAST(s._3 AS INT) end = 3)",
+                queryBuilder.buildSql(columns, tupleDomain));
+    }
+
+    @Test
+    public void testEmptyColumns()
+    {
+        IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager);
+        assertEquals("SELECT ' ' FROM S3Object s", queryBuilder.buildSql(ImmutableList.of(), TupleDomain.all()));
+    }
+
+    @Test
+    public void testDecimalColumns()
+    {
+        TypeManager typeManager = this.typeManager;
+        IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager);
+        List<HiveColumnHandle> columns = ImmutableList.of(
+                new HiveColumnHandle("quantity", HiveType.valueOf("decimal(20,0)"), parseTypeSignature(DECIMAL), 0, REGULAR, Optional.empty()),
+                new HiveColumnHandle("extendedprice", HiveType.valueOf("decimal(20,2)"), parseTypeSignature(DECIMAL), 1, REGULAR, Optional.empty()),
+                new HiveColumnHandle("discount", HiveType.valueOf("decimal(10,2)"), parseTypeSignature(DECIMAL), 2, REGULAR, Optional.empty()));
+        DecimalType decimalType = DecimalType.createDecimalType(10, 2);
+        TupleDomain<HiveColumnHandle> tupleDomain = withColumnDomains(
+                ImmutableMap.of(
+                        columns.get(0), Domain.create(ofRanges(Range.lessThan(DecimalType.createDecimalType(20, 0), HiveTestUtils.longDecimal("50"))), false),
+                        columns.get(1), Domain.create(ofRanges(Range.equal(HiveType.valueOf("decimal(20,2)").getType(typeManager), HiveTestUtils.longDecimal("0.05"))), false),
+                        columns.get(2), Domain.create(ofRanges(Range.range(decimalType, HiveTestUtils.shortDecimal("0.0"), true, HiveTestUtils.shortDecimal("0.02"), true)), false)));
+        assertEquals("SELECT s._1, s._2, s._3 FROM S3Object s WHERE ((case s._1 when '' then null else CAST(s._1 AS DECIMAL(20,0)) end < 50)) AND " +
+                        "(case s._2 when '' then null else CAST(s._2 AS DECIMAL(20,2)) end = 0.05) AND ((case s._3 when '' then null else CAST(s._3 AS DECIMAL(10,2)) " +
+                        "end >= 0.00 AND case s._3 when '' then null else CAST(s._3 AS DECIMAL(10,2)) end <= 0.02))",
+                queryBuilder.buildSql(columns, tupleDomain));
+    }
+
+    @Test
+    public void testDateColumn()
+    {
+        IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager);
+        List<HiveColumnHandle> columns = ImmutableList.of(
+                new HiveColumnHandle("t1", HIVE_TIMESTAMP, parseTypeSignature(TIMESTAMP), 0, REGULAR, Optional.empty()),
+                new HiveColumnHandle("t2", HIVE_DATE, parseTypeSignature(StandardTypes.DATE), 1, REGULAR, Optional.empty()));
+        TupleDomain<HiveColumnHandle> tupleDomain = withColumnDomains(ImmutableMap.of(
+                columns.get(1), Domain.create(SortedRangeSet.copyOf(DATE, ImmutableList.of(Range.equal(DATE, (long) parseDate("2001-08-22")))), false)));
+
+        assertEquals("SELECT s._1, s._2 FROM S3Object s WHERE (case s._2 when '' then null else CAST(s._2 AS TIMESTAMP) end = `2001-08-22`)", queryBuilder.buildSql(columns, tupleDomain));
+    }
+
+    @Test
+    public void testNotPushDoublePredicates()
+    {
+        IonSqlQueryBuilder queryBuilder = new IonSqlQueryBuilder(typeManager);
+        List<HiveColumnHandle> columns = ImmutableList.of(
+                new HiveColumnHandle("quantity", HIVE_INT, parseTypeSignature(INTEGER), 0, REGULAR, Optional.empty()),
+                new HiveColumnHandle("extendedprice", HIVE_DOUBLE, parseTypeSignature(StandardTypes.DOUBLE), 1, REGULAR, Optional.empty()),
+                new HiveColumnHandle("discount", HIVE_DOUBLE, parseTypeSignature(StandardTypes.DOUBLE), 2, REGULAR, Optional.empty()));
+        TupleDomain<HiveColumnHandle> tupleDomain = withColumnDomains(
+                ImmutableMap.of(
+                        columns.get(0), Domain.create(ofRanges(Range.lessThan(BIGINT, 50L)), false),
+                        columns.get(1), Domain.create(ofRanges(Range.equal(DOUBLE, 0.05)), false),
+                        columns.get(2), Domain.create(ofRanges(Range.range(DOUBLE, 0.0, true, 0.02, true)), false)));
+        assertEquals("SELECT s._1, s._2, s._3 FROM S3Object s WHERE ((case s._1 when '' then null else CAST(s._1 AS INT) end < 50))",
+                queryBuilder.buildSql(columns, tupleDomain));
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcCache.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcCache.java
new file mode 100644
index 00000000..1b3830d9
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcCache.java
@@ -0,0 +1,328 @@
+/*
+ * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive; + +import io.prestosql.execution.SplitCacheMap; +import io.prestosql.tests.AbstractTestQueryFramework; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestOrcCache + extends AbstractTestQueryFramework +{ + public TestOrcCache() + { + super(() -> HiveQueryRunner.createQueryRunner()); + } + + @BeforeClass + public void setUp() + { + assertUpdate("CREATE TABLE employee(id INTEGER, name VARCHAR, dob DATE, perf DOUBLE) WITH (partitioned_by = ARRAY['name', 'dob', 'perf'], format = 'ORC')"); + assertUpdate("INSERT INTO employee VALUES(0, 'Alice', DATE '1995-10-09', DOUBLE '8.0')", 1); + assertUpdate("INSERT INTO employee VALUES(1, 'Bob', DATE '1995-11-14', DOUBLE '9.5')", 1); + assertUpdate("INSERT INTO employee VALUES(2, 'Sheldon', DATE '1978-08-29', DOUBLE '10.0')", 1); + assertUpdate("INSERT INTO employee VALUES(3, 'Winters', DATE '1918-01-21', DOUBLE '9.2')", 1); + assertUpdate("INSERT INTO employee VALUES(4, 'Lenard', DATE '1980-06-24', DOUBLE '8.8')", 1); + assertUpdate("INSERT INTO employee VALUES(5, 'Raj', DATE '1979-05-09', DOUBLE '8.2')", 1); + assertUpdate("INSERT INTO employee VALUES(6, 'Trump', DATE '1945-08-15', DOUBLE '2.5')", 1); + + assertUpdate("" + + "CREATE TABLE all_types(" + + "id INTEGER," + + "_boolean BOOLEAN," + + "_tinyint TINYINT," + + "_smallint SMALLINT," + + "_integer INTEGER," + + "_bigint BIGINT," + + "_float FLOAT," + + "_double DOUBLE," + + "_date DATE," + + "_timestamp TIMESTAMP," + + "_varchar VARCHAR) " + + "WITH (partitioned_by = ARRAY['_boolean', '_tinyint', '_smallint', '_integer', '_bigint', '_float', '_double', '_date', '_timestamp', '_varchar'], " + + "format = 'ORC')"); + assertUpdate("INSERT INTO all_types VALUES(0, true, TINYINT '0', SMALLINT '0', 0, BIGINT '0', FLOAT '2E-1', DOUBLE '0.2', DATE '1995-10-09', TIMESTAMP '1995-10-09 00:00:00', 'jiahao')", 1); + assertUpdate("INSERT INTO all_types VALUES(1, false, TINYINT '1', SMALLINT '1', 1, BIGINT '1', FLOAT '5E-2', DOUBLE '0.5', DATE '1995-11-14', TIMESTAMP '1995-11-14 00:00:00', 'han')", 1); + + assertUpdate("CREATE TABLE test_drop_cache_1(id INTEGER, p1 INTEGER, p2 INTEGER) WITH (partitioned_by = ARRAY['p1', 'p2'], format = 'ORC')"); + assertUpdate("CREATE TABLE test_drop_cache_2(id INTEGER, p3 INTEGER) WITH (partitioned_by = ARRAY['p3'], format = 'ORC')"); + + assertUpdate("CREATE TABLE test_drop_cache_3(id INTEGER, p1 INTEGER, p2 INTEGER) WITH (partitioned_by = ARRAY['p1', 'p2'], format = 'ORC')"); + assertUpdate("CREATE TABLE test_drop_cache_4(id INTEGER, p3 INTEGER) WITH (partitioned_by = ARRAY['p3'], format = 'ORC')"); + } + + @Test + public void testCacheTableOnAllColumnTypes() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _boolean = true"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_boolean = true)")); + assertQueryOrdered("SELECT id, _boolean FROM all_types WHERE _boolean = true", "VALUES (0, true)"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _tinyint = 0"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_tinyint = 0)")); + assertQueryOrdered("SELECT id, _tinyint FROM all_types WHERE _tinyint = 0", "VALUES (0, 0)"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE 
_smallint = 0"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_smallint = 0)")); + assertQueryOrdered("SELECT id, _smallint FROM all_types WHERE _smallint = 0", "VALUES (0, 0)"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _integer = 0"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_integer = 0)")); + assertQueryOrdered("SELECT id, _integer FROM all_types WHERE _integer = 0", "VALUES (0, 0)"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _bigint = 0"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_bigint = 0)")); + assertQueryOrdered("SELECT id, _bigint FROM all_types WHERE _bigint = 0", "VALUES (0, 0)"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _float > FLOAT '6e-2'"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_float > FLOAT '6e-2')")); + assertQueryOrdered("SELECT id, _float FROM all_types WHERE _float > FLOAT '6e-2'", "VALUES (0, 2E-1)"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _double = DOUBLE '0.2'"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_double = DOUBLE '0.2')")); + assertQueryOrdered("SELECT id, _double FROM all_types WHERE _double = DOUBLE '0.2'", "VALUES (0, 0.2)"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _date = DATE '1995-10-09'"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_date = DATE '1995-10-09')")); + assertQueryOrdered("SELECT id, _date FROM all_types WHERE _date = DATE '1995-10-09'", "VALUES (0, '1995-10-09')"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _timestamp = TIMESTAMP '1995-10-09 00:00:00'"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_timestamp = TIMESTAMP '1995-10-09 00:00:00')")); + assertQueryOrdered("SELECT id, _timestamp FROM all_types WHERE _timestamp = TIMESTAMP '1995-10-09 00:00:00'", "VALUES (0, '1995-10-09 00:00:00')"); + + assertQuerySucceeds("CACHE TABLE all_types WHERE _varchar = 'jiahao'"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.all_types").showPredicates().contains("(_varchar = 'jiahao')")); + assertQueryOrdered("SELECT id, _varchar FROM all_types WHERE _varchar = 'jiahao'", "VALUES (0, 'jiahao')"); + } + + @Test + public void testCacheTableOnNonPartitionedColumn() + { + assertQueryFails("CACHE TABLE employee WHERE id <= 3", ".*?Column 'id' is not cacheable"); + } + + @Test + public void testCacheTableWithLikePredicate() + { + assertQueryFails("CACHE TABLE employee WHERE name LIKE 'S%'", ".*?LIKE predicate is not supported."); + } + + @Test + public void testCacheTableWithOrOperator() + { + assertQueryFails("CACHE TABLE employee WHERE dob > DATE '1985-01-01' OR perf < DOUBLE '9.0'", ".*?OR operator is not supported"); + } + + @Test + public void testCacheTableWithIsNullPredicate() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertQuerySucceeds("CACHE TABLE employee WHERE dob IS NULL"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(dob IS NULL)")); + + assertQuerySucceeds("CACHE TABLE employee WHERE name IS NULL"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(name IS NULL)")); + + assertQuerySucceeds("CACHE TABLE employee 
WHERE perf IS NULL"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(perf IS NULL)")); + } + + @Test + public void testCacheTableWithIsNotNullPredicate() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertQuerySucceeds("CACHE TABLE employee WHERE dob IS NOT NULL"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(dob IS NOT NULL)")); + + assertQuerySucceeds("CACHE TABLE employee WHERE name IS NOT NULL"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(name IS NOT NULL)")); + + assertQuerySucceeds("CACHE TABLE employee WHERE perf IS NOT NULL"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(perf IS NOT NULL)")); + } + + @Test + public void testCacheTableWithComplexPredicate() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertQuerySucceeds("CACHE TABLE employee WHERE dob BETWEEN DATE '1980-01-01' AND DATE '2000-01-01' AND perf < DOUBLE '9.0'"); + assertQueryOrdered( + "SELECT * FROM employee WHERE dob BETWEEN DATE '1980-01-01' AND DATE '2000-01-01' AND perf < DOUBLE '9.0' ORDER BY id", + "VALUES (0, 'Alice', '1995-10-09', 8.0), (4, 'Lenard', '1980-06-24', 8.8)"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("((dob BETWEEN DATE '1980-01-01' AND DATE '2000-01-01') AND (perf < DOUBLE '9.0'))")); + + assertQuerySucceeds("CACHE TABLE employee WHERE dob < DATE '1980-01-01' AND perf < DOUBLE '8.0'"); + assertQueryOrdered( + "SELECT * FROM employee WHERE dob < DATE '1980-01-01' AND perf < DOUBLE '8.0' ORDER BY id", + "VALUES (6, 'Trump', '1945-08-15', 2.5)"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("((dob < DATE '1980-01-01') AND (perf < DOUBLE '8.0'))")); + } + + @Test + public void testCacheNonOrcTable() + { + // PARQUET + assertUpdate("CREATE TABLE parquet_table(id BIGINT, par BIGINT) WITH (partitioned_by = ARRAY['par'], format = 'PARQUET')"); + assertQueryFails("CACHE TABLE parquet_table WHERE par = 0", ".*?Table 'hive.tpch.parquet_table' cannot be cached"); + assertUpdate("DROP TABLE parquet_table"); + + // AVRO + assertUpdate("CREATE TABLE avro_table(id BIGINT, par BIGINT) WITH (partitioned_by = ARRAY['par'], format = 'AVRO')"); + assertQueryFails("CACHE TABLE avro_table WHERE par = 0", ".*?Table 'hive.tpch.avro_table' cannot be cached"); + assertUpdate("DROP TABLE avro_table"); + + // RCBINARY + assertUpdate("CREATE TABLE rcbinary_table(id BIGINT, par BIGINT) WITH (partitioned_by = ARRAY['par'], format = 'RCBINARY')"); + assertQueryFails("CACHE TABLE rcbinary_table WHERE par = 0", ".*?Table 'hive.tpch.rcbinary_table' cannot be cached"); + assertUpdate("DROP TABLE rcbinary_table"); + + // RCTEXT + assertUpdate("CREATE TABLE rctext_table(id BIGINT, par BIGINT) WITH (partitioned_by = ARRAY['par'], format = 'RCTEXT')"); + assertQueryFails("CACHE TABLE rctext_table WHERE par = 0", ".*?Table 'hive.tpch.rctext_table' cannot be cached"); + assertUpdate("DROP TABLE rctext_table"); + + // SEQUENCEFILE + assertUpdate("CREATE TABLE sequencefile_table(id BIGINT, par BIGINT) WITH (partitioned_by = ARRAY['par'], format = 'SEQUENCEFILE')"); + assertQueryFails("CACHE TABLE sequencefile_table WHERE par = 0", ".*?Table 'hive.tpch.sequencefile_table' cannot be cached"); + assertUpdate("DROP TABLE sequencefile_table"); + + // JSON + 
assertUpdate("CREATE TABLE json_table(id BIGINT, par BIGINT) WITH (partitioned_by = ARRAY['par'], format = 'JSON')"); + assertQueryFails("CACHE TABLE json_table WHERE par = 0", ".*?Table 'hive.tpch.json_table' cannot be cached"); + assertUpdate("DROP TABLE json_table"); + + // TEXTFILE + assertUpdate("CREATE TABLE textfile_table(id BIGINT, par BIGINT) WITH (partitioned_by = ARRAY['par'], format = 'TEXTFILE')"); + assertQueryFails("CACHE TABLE textfile_table WHERE par = 0", ".*?Table 'hive.tpch.textfile_table' cannot be cached"); + assertUpdate("DROP TABLE textfile_table"); + + // CSV + assertUpdate("CREATE TABLE csv_table(id VARCHAR, par VARCHAR) WITH (partitioned_by = ARRAY['par'], format = 'CSV')"); + assertQueryFails("CACHE TABLE csv_table WHERE par = ''", ".*?Table 'hive.tpch.csv_table' cannot be cached"); + assertUpdate("DROP TABLE csv_table"); + } + + @Test + public void testDropCacheOnMultipleTables() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_1 WHERE p1 = 1"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_1").showPredicates().contains("(p1 = 1)")); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_2 WHERE p3 = 3"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_2").showPredicates().contains("(p3 = 3)")); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_2 WHERE p3 = 4"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_2").showPredicates().contains("(p3 = 4)")); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_1 WHERE p2 = 2"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_1").showPredicates().contains("(p2 = 2)")); + + assertQuerySucceeds("DROP CACHE test_drop_cache_1"); + assertEquals(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_1"), null); + + assertQuerySucceeds("DROP CACHE test_drop_cache_2"); + assertEquals(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_2"), null); + } + + @Test + public void testDropCacheWithPredicates() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_3 WHERE p1 = 1"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_3").showPredicates().contains("(p1 = 1)")); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_4 WHERE p3 = 3"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_4").showPredicates().contains("(p3 = 3)")); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_4 WHERE p3 = 4"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_4").showPredicates().contains("(p3 = 4)")); + + assertQuerySucceeds("CACHE TABLE test_drop_cache_3 WHERE p2 = 2"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_3").showPredicates().contains("(p2 = 2)")); + + assertQuerySucceeds("DROP CACHE test_drop_cache_3 WHERE p1 = 1"); + assertFalse(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_3").showPredicates().contains("(p1 = 1)")); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_3").showPredicates().contains("(p2 = 2)")); + + assertQuerySucceeds("DROP CACHE test_drop_cache_4 WHERE p3 = 4"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_4").showPredicates().contains("(p3 = 3)")); + 
assertFalse(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache_4").showPredicates().contains("(p3 = 4)")); + } + + @Test + public void testDropCacheOnNonExistTable() + { + assertQueryFails("DROP CACHE test_drop_cache_no WHERE par = 0", ".*?Cache for table 'hive.tpch.test_drop_cache_no' does not exist"); + assertQueryFails("DROP CACHE test_drop_cache_no", ".*?Cache for table 'hive.tpch.test_drop_cache_no' does not exist"); + } + + @Test + public void testDropCacheWithNonExistPredicates() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertUpdate("CREATE TABLE test_drop_cache(id INTEGER, par INTEGER) WITH (partitioned_by = ARRAY['par'], format = 'ORC')"); + + assertQuerySucceeds("CACHE TABLE test_drop_cache WHERE par = 0"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_drop_cache").showPredicates().contains("(par = 0)")); + + assertQueryFails("DROP CACHE test_drop_cache WHERE par = 1", ".*?Cache predicate '\\(par = 1\\)' does not exist"); + } + + @Test + public void testShowCacheOnNonExistTable() + { + assertQueryFails("SHOW CACHE test_show_no_cache", ".*?Cache for table 'hive.tpch.test_show_no_cache' does not exist"); + } + + @Test + public void testShowCacheAfterTableDeleted() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertUpdate("CREATE TABLE test_show_cache(id INTEGER, par INTEGER) WITH (partitioned_by = ARRAY['par'], format = 'ORC')"); + assertQuerySucceeds("CACHE TABLE test_show_cache WHERE par = 0"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.test_show_cache").showPredicates().contains("(par = 0)")); + + assertUpdate("DROP TABLE test_show_cache"); + assertQueryFails("SHOW CACHE test_show_cache", ".*?Cache for table 'hive.tpch.test_show_cache' does not exist"); + } + + @Test + public void testShowCache() + { + SplitCacheMap splitCacheMap = SplitCacheMap.getInstance(); + + assertQuerySucceeds("CACHE TABLE employee WHERE dob BETWEEN DATE '1980-01-01' AND DATE '2000-01-01'"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(dob BETWEEN DATE '1980-01-01' AND DATE '2000-01-01')")); + + assertQuerySucceeds("CACHE TABLE employee WHERE perf > DOUBLE '9.0'"); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(dob BETWEEN DATE '1980-01-01' AND DATE '2000-01-01')")); + assertTrue(splitCacheMap.tableCacheInfoMap().get("hive.tpch.employee").showPredicates().contains("(perf > DOUBLE '9.0')")); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcFileWriterConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcFileWriterConfig.java new file mode 100644 index 00000000..c254ee1c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcFileWriterConfig.java @@ -0,0 +1,68 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableMap; +import io.airlift.units.DataSize; +import org.testng.annotations.Test; + +import java.util.Map; + +import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; +import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; +import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; +import static io.airlift.units.DataSize.Unit.BYTE; +import static io.airlift.units.DataSize.Unit.KILOBYTE; +import static io.airlift.units.DataSize.Unit.MEGABYTE; + +public class TestOrcFileWriterConfig +{ + @Test + public void testDefaults() + { + assertRecordedDefaults(recordDefaults(OrcFileWriterConfig.class) + .setStripeMinSize(new DataSize(32, MEGABYTE)) + .setStripeMaxSize(new DataSize(64, MEGABYTE)) + .setStripeMaxRowCount(10_000_000) + .setRowGroupMaxRowCount(10_000) + .setDictionaryMaxMemory(new DataSize(16, MEGABYTE)) + .setStringStatisticsLimit(new DataSize(64, BYTE)) + .setMaxCompressionBufferSize(new DataSize(256, KILOBYTE))); + } + + @Test + public void testExplicitPropertyMappings() + { + Map properties = new ImmutableMap.Builder() + .put("hive.orc.writer.stripe-min-size", "13MB") + .put("hive.orc.writer.stripe-max-size", "27MB") + .put("hive.orc.writer.stripe-max-rows", "44") + .put("hive.orc.writer.row-group-max-rows", "11") + .put("hive.orc.writer.dictionary-max-memory", "13MB") + .put("hive.orc.writer.string-statistics-limit", "17MB") + .put("hive.orc.writer.max-compression-buffer-size", "19MB") + .build(); + + OrcFileWriterConfig expected = new OrcFileWriterConfig() + .setStripeMinSize(new DataSize(13, MEGABYTE)) + .setStripeMaxSize(new DataSize(27, MEGABYTE)) + .setStripeMaxRowCount(44) + .setRowGroupMaxRowCount(11) + .setDictionaryMaxMemory(new DataSize(13, MEGABYTE)) + .setStringStatisticsLimit(new DataSize(17, MEGABYTE)) + .setMaxCompressionBufferSize(new DataSize(19, MEGABYTE)); + + assertFullMapping(properties, expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcPageSourceMemoryTracking.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcPageSourceMemoryTracking.java new file mode 100644 index 00000000..a0b73d87 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestOrcPageSourceMemoryTracking.java @@ -0,0 +1,819 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.slice.Slice; +import io.airlift.stats.Distribution; +import io.airlift.units.DataSize; +import io.prestosql.execution.Lifespan; +import io.prestosql.metadata.Metadata; +import io.prestosql.metadata.Split; +import io.prestosql.operator.DriverContext; +import io.prestosql.operator.ScanFilterAndProjectOperator.ScanFilterAndProjectOperatorFactory; +import io.prestosql.operator.SourceOperator; +import io.prestosql.operator.SourceOperatorFactory; +import io.prestosql.operator.TableScanOperator.TableScanOperatorFactory; +import io.prestosql.operator.project.CursorProcessor; +import io.prestosql.operator.project.PageProcessor; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.plugin.hive.orc.OrcConcatPageSource; +import io.prestosql.plugin.hive.orc.OrcPageSourceFactory; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.classloader.ThreadContextClassLoader; +import io.prestosql.spi.connector.CatalogName; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilterSupplier; +import io.prestosql.spi.operator.ReuseExchangeOperator; +import io.prestosql.spi.plan.PlanNodeId; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.type.Type; +import io.prestosql.sql.gen.ExpressionCompiler; +import io.prestosql.sql.gen.PageFunctionCompiler; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.testing.TestingSplit; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcSerde; +import org.apache.hadoop.hive.ql.io.orc.Writer; +import org.apache.hadoop.hive.serde2.Serializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.orc.NullMemoryManager; +import org.apache.orc.impl.WriterImpl; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import 
java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Properties; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import static com.google.common.base.Predicates.not; +import static com.google.common.collect.Iterables.filter; +import static com.google.common.collect.Iterables.transform; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.airlift.testing.Assertions.assertBetweenInclusive; +import static io.airlift.units.DataSize.Unit.BYTE; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.orc.OrcReader.MAX_BATCH_SIZE; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; +import static io.prestosql.plugin.hive.HiveTestUtils.SESSION; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.sql.relational.Expressions.field; +import static io.prestosql.testing.TestingHandles.TEST_TABLE_HANDLE; +import static io.prestosql.testing.TestingSession.testSessionBuilder; +import static io.prestosql.testing.TestingTaskContext.createTaskContext; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static java.util.concurrent.Executors.newScheduledThreadPool; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.ql.io.orc.CompressionKind.ZLIB; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector; +import static org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS_CODEC; +import static org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS_TYPE; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +public class TestOrcPageSourceMemoryTracking +{ + private static final String ORC_RECORD_WRITER = OrcOutputFormat.class.getName() + "$OrcRecordWriter"; + private static final Constructor WRITER_CONSTRUCTOR = getOrcWriterConstructor(); + private static final Configuration CONFIGURATION = new Configuration(); + private static final int NUM_ROWS = 50000; + private static final int STRIPE_ROWS = 20000; + private static final Metadata metadata = createTestMetadataManager(); + private static final ExpressionCompiler EXPRESSION_COMPILER = new ExpressionCompiler(metadata, new PageFunctionCompiler(metadata, 0)); + + private final Random random = new Random(); + private final List testColumns = ImmutableList.builder() + .add(new TestColumn("p_empty_string", javaStringObjectInspector, () -> "", true)) + .add(new 
TestColumn("p_string", javaStringObjectInspector, () -> Long.toHexString(random.nextLong()), false)) + .build(); + + private File tempFile; + private TestPreparer testPreparer; + + @DataProvider(name = "rowCount") + public static Object[][] rowCount() + { + return new Object[][] {{50_000}, {10_000}, {5_000}}; + } + + @BeforeClass + public void setUp() + throws Exception + { + tempFile = File.createTempFile("presto_test_orc_page_source_memory_tracking", "orc"); + tempFile.delete(); + testPreparer = new TestPreparer(tempFile.getAbsolutePath()); + } + + @AfterClass(alwaysRun = true) + public void tearDown() + { + tempFile.delete(); + } + + @Test + public void testPageSource() + throws Exception + { + // Numbers used in assertions in this test may change when implementation is modified, + // feel free to change them if they break in the future + + FileFormatDataSourceStats stats = new FileFormatDataSourceStats(); + ConnectorPageSource pageSource = testPreparer.newPageSource(stats); + + assertEquals(pageSource.getSystemMemoryUsage(), 0); + + long memoryUsage = -1; + int totalRows = 0; + while (totalRows < 20000) { + assertFalse(pageSource.isFinished()); + Page page = pageSource.getNextPage(); + assertNotNull(page); + Block block = page.getBlock(1); + + if (memoryUsage == -1) { + assertBetweenInclusive(pageSource.getSystemMemoryUsage(), 180000L, 189999L); // Memory usage before lazy-loading the block + createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1); // trigger loading for lazy block + memoryUsage = pageSource.getSystemMemoryUsage(); + assertBetweenInclusive(memoryUsage, 460000L, 469999L); // Memory usage after lazy-loading the actual block + } + else { + assertEquals(pageSource.getSystemMemoryUsage(), memoryUsage); + createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1); // trigger loading for lazy block + assertEquals(pageSource.getSystemMemoryUsage(), memoryUsage); + } + totalRows += page.getPositionCount(); + } + + memoryUsage = -1; + while (totalRows < 40000) { + assertFalse(pageSource.isFinished()); + Page page = pageSource.getNextPage(); + assertNotNull(page); + Block block = page.getBlock(1); + + if (memoryUsage == -1) { + assertBetweenInclusive(pageSource.getSystemMemoryUsage(), 180000L, 189999L); // Memory usage before lazy-loading the block + createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1); // trigger loading for lazy block + memoryUsage = pageSource.getSystemMemoryUsage(); + assertBetweenInclusive(memoryUsage, 460000L, 469999L); // Memory usage after lazy-loading the actual block + } + else { + assertEquals(pageSource.getSystemMemoryUsage(), memoryUsage); + createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1); // trigger loading for lazy block + assertEquals(pageSource.getSystemMemoryUsage(), memoryUsage); + } + totalRows += page.getPositionCount(); + } + + memoryUsage = -1; + while (totalRows < NUM_ROWS) { + assertFalse(pageSource.isFinished()); + Page page = pageSource.getNextPage(); + assertNotNull(page); + Block block = page.getBlock(1); + + if (memoryUsage == -1) { + assertBetweenInclusive(pageSource.getSystemMemoryUsage(), 90000L, 99999L); // Memory usage before lazy-loading the block + createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1); // trigger loading for lazy block + memoryUsage = pageSource.getSystemMemoryUsage(); + assertBetweenInclusive(memoryUsage, 360000L, 369999L); // Memory usage after lazy-loading the actual block + } + else { + 
assertEquals(pageSource.getSystemMemoryUsage(), memoryUsage); + createUnboundedVarcharType().getSlice(block, block.getPositionCount() - 1); // trigger loading for lazy block + assertEquals(pageSource.getSystemMemoryUsage(), memoryUsage); + } + totalRows += page.getPositionCount(); + } + + assertFalse(pageSource.isFinished()); + assertNull(pageSource.getNextPage()); + assertTrue(pageSource.isFinished()); + assertEquals(pageSource.getSystemMemoryUsage(), 0); + pageSource.close(); + } + + @Test(dataProvider = "rowCount") + public void testMaxReadBytes(int rowCount) + throws Exception + { + int maxReadBytes = 1_000; + HiveConfig config = new HiveConfig(); + config.setOrcMaxReadBlockSize(new DataSize(maxReadBytes, BYTE)); + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + FileFormatDataSourceStats stats = new FileFormatDataSourceStats(); + + // Build a table where every row gets larger, so we can test that the "batchSize" reduces + int numColumns = 5; + int step = 250; + ImmutableList.Builder columnBuilder = ImmutableList.builder() + .add(new TestColumn("p_empty_string", javaStringObjectInspector, () -> "", true)); + GrowingTestColumn[] dataColumns = new GrowingTestColumn[numColumns]; + for (int i = 0; i < numColumns; i++) { + dataColumns[i] = new GrowingTestColumn("p_string", javaStringObjectInspector, () -> Long.toHexString(random.nextLong()), false, step * (i + 1)); + columnBuilder.add(dataColumns[i]); + } + List testColumns = columnBuilder.build(); + File tempFile = File.createTempFile("presto_test_orc_page_source_max_read_bytes", "orc"); + tempFile.delete(); + + TestPreparer testPreparer = new TestPreparer(tempFile.getAbsolutePath(), testColumns, rowCount, rowCount); + ConnectorPageSource pageSource = testPreparer.newPageSource(stats, session); + + try { + int positionCount = 0; + while (true) { + Page page = pageSource.getNextPage(); + if (pageSource.isFinished()) { + break; + } + assertNotNull(page); + page = page.getLoadedPage(); + positionCount += page.getPositionCount(); + // assert upper bound is tight + // ignore the first MAX_BATCH_SIZE rows given the sizes are set when loading the blocks + if (positionCount > MAX_BATCH_SIZE) { + // either the block is bounded by maxReadBytes or we just load one single large block + // an error margin MAX_BATCH_SIZE / step is needed given the block sizes are increasing + assertTrue(page.getSizeInBytes() < maxReadBytes * (MAX_BATCH_SIZE / step) || 1 == page.getPositionCount()); + } + } + + // verify the stats are correctly recorded + Distribution distribution = stats.getMaxCombinedBytesPerRow().getAllTime(); + assertEquals((int) distribution.getCount(), 1); + // the block is VariableWidthBlock that contains valueIsNull and offsets arrays as overhead + assertEquals((int) distribution.getMax(), Arrays.stream(dataColumns).mapToInt(GrowingTestColumn::getMaxSize).sum() + (Integer.BYTES + Byte.BYTES) * numColumns); + pageSource.close(); + } + finally { + tempFile.delete(); + } + } + + @Test + public void testTableScanOperator() + { + // Numbers used in assertions in this test may change when implementation is modified, + // feel free to change them if they break in the future + + DriverContext driverContext = testPreparer.newDriverContext(); + SourceOperator operator = testPreparer.newTableScanOperator(driverContext); + + assertEquals(driverContext.getSystemMemoryUsage(), 0); + + long memoryUsage = -1; + int totalRows = 0; + while 
(totalRows < 20000) { + assertFalse(operator.isFinished()); + Page page = operator.getOutput(); + assertNotNull(page); + page.getBlock(1); + totalRows += page.getPositionCount(); + if (memoryUsage == -1) { + memoryUsage = driverContext.getSystemMemoryUsage(); + assertBetweenInclusive(memoryUsage, 180000L, 469999L); + System.out.println(String.format("[TotalRows: %d] memUsage: %d", totalRows, driverContext.getSystemMemoryUsage())); + } + else { + //assertEquals(driverContext.getSystemMemoryUsage(), memoryUsage); + System.out.println(String.format("[TotalRows: %d] memUsage: %d", totalRows, driverContext.getSystemMemoryUsage())); + } + } + + memoryUsage = -1; + while (totalRows < 40000) { + assertFalse(operator.isFinished()); + Page page = operator.getOutput(); + assertNotNull(page); + page.getBlock(1); + if (memoryUsage == -1) { + memoryUsage = driverContext.getSystemMemoryUsage(); + assertBetweenInclusive(memoryUsage, 460000L, 469999L); + } + else { + assertEquals(driverContext.getSystemMemoryUsage(), memoryUsage); + } + totalRows += page.getPositionCount(); + } + + memoryUsage = -1; + while (totalRows < NUM_ROWS) { + assertFalse(operator.isFinished()); + Page page = operator.getOutput(); + assertNotNull(page); + page.getBlock(1); + if (memoryUsage == -1) { + memoryUsage = driverContext.getSystemMemoryUsage(); + assertBetweenInclusive(memoryUsage, 360000L, 369999L); + } + else { + assertEquals(driverContext.getSystemMemoryUsage(), memoryUsage); + } + totalRows += page.getPositionCount(); + } + + assertFalse(operator.isFinished()); + assertNull(operator.getOutput()); + assertTrue(operator.isFinished()); + assertEquals(driverContext.getSystemMemoryUsage(), 0); + } + + @Test + public void testScanFilterAndProjectOperator() + { + // Numbers used in assertions in this test may change when implementation is modified, + // feel free to change them if they break in the future + + DriverContext driverContext = testPreparer.newDriverContext(); + SourceOperator operator = testPreparer.newScanFilterAndProjectOperator(driverContext); + + assertEquals(driverContext.getSystemMemoryUsage(), 0); + + int totalRows = 0; + while (totalRows < NUM_ROWS) { + assertFalse(operator.isFinished()); + Page page = operator.getOutput(); + assertNotNull(page); + assertBetweenInclusive(driverContext.getSystemMemoryUsage(), 90_000L, 499_999L); + totalRows += page.getPositionCount(); + } + + // done... 
in the current implementation finish is not set until output returns a null page
+        assertNull(operator.getOutput());
+        assertTrue(operator.isFinished());
+        assertBetweenInclusive(driverContext.getSystemMemoryUsage(), 0L, 500L);
+    }
+
+    private class TestPreparer
+    {
+        private final FileSplit fileSplit;
+        private final Properties schema;
+        private final List<HiveColumnHandle> columns;
+        private final List<Type> types;
+        private final List<HivePartitionKey> partitionKeys;
+        private final ExecutorService executor = newCachedThreadPool(daemonThreadsNamed("test-executor-%s"));
+        private final ScheduledExecutorService scheduledExecutor = newScheduledThreadPool(2, daemonThreadsNamed("test-scheduledExecutor-%s"));
+
+        public TestPreparer(String tempFilePath)
+                throws Exception
+        {
+            this(tempFilePath, testColumns, NUM_ROWS, STRIPE_ROWS);
+        }
+
+        public TestPreparer(String tempFilePath, List<TestColumn> testColumns, int numRows, int stripeRows)
+                throws Exception
+        {
+            OrcSerde serde = new OrcSerde();
+            schema = new Properties();
+            schema.setProperty("columns",
+                    testColumns.stream()
+                            .map(TestColumn::getName)
+                            .collect(Collectors.joining(",")));
+            schema.setProperty("columns.types",
+                    testColumns.stream()
+                            .map(TestColumn::getType)
+                            .collect(Collectors.joining(",")));
+            schema.setProperty(FILE_INPUT_FORMAT, OrcInputFormat.class.getName());
+            schema.setProperty(SERIALIZATION_LIB, serde.getClass().getName());
+
+            partitionKeys = testColumns.stream()
+                    .filter(TestColumn::isPartitionKey)
+                    .map(input -> new HivePartitionKey(input.getName(), (String) input.getWriteValue()))
+                    .collect(toList());
+
+            ImmutableList.Builder<HiveColumnHandle> columnsBuilder = ImmutableList.builder();
+            ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
+            int nextHiveColumnIndex = 0;
+            for (int i = 0; i < testColumns.size(); i++) {
+                TestColumn testColumn = testColumns.get(i);
+                int columnIndex = testColumn.isPartitionKey() ? -1 : nextHiveColumnIndex++;
+
+                ObjectInspector inspector = testColumn.getObjectInspector();
+                HiveType hiveType = HiveType.valueOf(inspector.getTypeName());
+                Type type = hiveType.getType(TYPE_MANAGER);
+
+                columnsBuilder.add(new HiveColumnHandle(testColumn.getName(), hiveType, type.getTypeSignature(), columnIndex, testColumn.isPartitionKey() ?
PARTITION_KEY : REGULAR, Optional.empty())); + typesBuilder.add(type); + } + columns = columnsBuilder.build(); + types = typesBuilder.build(); + + fileSplit = createTestFile(tempFilePath, new OrcOutputFormat(), serde, null, testColumns, numRows, stripeRows); + } + + public ConnectorPageSource newPageSource() + { + return newPageSource(new FileFormatDataSourceStats(), SESSION); + } + + public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats) + { + return newPageSource(stats, SESSION); + } + + public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats, ConnectorSession session) + { + return newPageSource(stats, session, Optional.empty()); + } + + public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats, ConnectorSession session, Optional dynamicFilterSupplier) + { + OrcPageSourceFactory orcPageSourceFactory = new OrcPageSourceFactory(TYPE_MANAGER, new HiveConfig().setUseOrcColumnNames(false), HDFS_ENVIRONMENT, stats, OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled())); + return HivePageSourceProvider.createHivePageSource( + ImmutableSet.of(), + ImmutableSet.of(orcPageSourceFactory), + new Configuration(), + session, + fileSplit.getPath(), + OptionalInt.empty(), + fileSplit.getStart(), + fileSplit.getLength(), + fileSplit.getLength(), + schema, + TupleDomain.all(), + columns, + partitionKeys, + TYPE_MANAGER, + ImmutableMap.of(), + Optional.empty(), + false, + dynamicFilterSupplier, + Optional.empty(), + Optional.empty(), + Optional.empty(), + null, + false, + -1L, + ImmutableMap.of(), + ImmutableList.of(), + Optional.empty(), + new HiveOffloadExpression(), + ImmutableMap.of()) + .get(); + } + + public SourceOperator newTableScanOperator(DriverContext driverContext) + { + ConnectorPageSource pageSource = newPageSource(); + SourceOperatorFactory sourceOperatorFactory = new TableScanOperatorFactory( + 0, + new PlanNodeId("0"), + (session, split, table, columnHandles, dynamicFilter) -> pageSource, + TEST_TABLE_HANDLE, + columns.stream().map(columnHandle -> (ColumnHandle) columnHandle).collect(toList()), + types, + DataSize.valueOf("462304B"), + 5, ReuseExchangeOperator.STRATEGY.REUSE_STRATEGY_DEFAULT, new UUID(0, 0), false, Optional.empty(), 0, 0); + SourceOperator operator = sourceOperatorFactory.createOperator(driverContext); + operator.addSplit(new Split(new CatalogName("test"), TestingSplit.createLocalSplit(), Lifespan.taskWide())); + return operator; + } + + public SourceOperator newScanFilterAndProjectOperator(DriverContext driverContext) + { + ConnectorPageSource pageSource = newPageSource(); + ImmutableList.Builder projectionsBuilder = ImmutableList.builder(); + for (int i = 0; i < types.size(); i++) { + projectionsBuilder.add(field(i, types.get(i))); + } + Supplier cursorProcessor = EXPRESSION_COMPILER.compileCursorProcessor(Optional.empty(), projectionsBuilder.build(), "key"); + 
Supplier pageProcessor = EXPRESSION_COMPILER.compilePageProcessor(Optional.empty(), projectionsBuilder.build()); + SourceOperatorFactory sourceOperatorFactory = new ScanFilterAndProjectOperatorFactory( + 0, + new PlanNodeId("test"), + new PlanNodeId("0"), + (session, split, table, columnHandles, dynamicFilter) -> pageSource, + cursorProcessor, + pageProcessor, + TEST_TABLE_HANDLE, + columns.stream().map(columnHandle -> (ColumnHandle) columnHandle).collect(toList()), + Optional.empty(), + types, + new DataSize(0, BYTE), + 0, + ReuseExchangeOperator.STRATEGY.REUSE_STRATEGY_DEFAULT, new UUID(0, 0), false, Optional.empty(), 0, 0); + SourceOperator operator = sourceOperatorFactory.createOperator(driverContext); + operator.addSplit(new Split(new CatalogName("test"), TestingSplit.createLocalSplit(), Lifespan.taskWide())); + operator.noMoreSplits(); + return operator; + } + + private DriverContext newDriverContext() + { + return createTaskContext(executor, scheduledExecutor, testSessionBuilder().build()) + .addPipelineContext(0, true, true, false) + .addDriverContext(); + } + } + + public static FileSplit createTestFile(String filePath, + HiveOutputFormat outputFormat, + Serializer serializer, + String compressionCodec, + List testColumns, + int numRows, + int stripeRows) + throws Exception + { + // filter out partition keys, which are not written to the file + testColumns = ImmutableList.copyOf(filter(testColumns, not(TestColumn::isPartitionKey))); + + Properties tableProperties = new Properties(); + tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName))); + tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType))); + serializer.initialize(CONFIGURATION, tableProperties); + + JobConf jobConf = new JobConf(); + if (compressionCodec != null) { + CompressionCodec codec = new CompressionCodecFactory(CONFIGURATION).getCodecByName(compressionCodec); + jobConf.set(COMPRESS_CODEC, codec.getClass().getName()); + jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString()); + } + + RecordWriter recordWriter = createRecordWriter(new Path(filePath), CONFIGURATION); + + try { + SettableStructObjectInspector objectInspector = getStandardStructObjectInspector( + ImmutableList.copyOf(transform(testColumns, TestColumn::getName)), + ImmutableList.copyOf(transform(testColumns, TestColumn::getObjectInspector))); + + Object row = objectInspector.create(); + + List fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs()); + + for (int rowNumber = 0; rowNumber < numRows; rowNumber++) { + for (int i = 0; i < testColumns.size(); i++) { + Object writeValue = testColumns.get(i).getWriteValue(); + if (writeValue instanceof Slice) { + writeValue = ((Slice) writeValue).getBytes(); + } + objectInspector.setStructFieldData(row, fields.get(i), writeValue); + } + + Writable record = serializer.serialize(row, objectInspector); + recordWriter.write(record); + if (rowNumber % stripeRows == stripeRows - 1) { + flushStripe(recordWriter); + } + } + } + finally { + recordWriter.close(false); + } + + Path path = new Path(filePath); + path.getFileSystem(CONFIGURATION).setVerifyChecksum(true); + File file = new File(filePath); + return new FileSplit(path, 0, file.length(), new String[0]); + } + + private static void flushStripe(RecordWriter recordWriter) + { + try { + Field writerField = OrcOutputFormat.class.getClassLoader() + .loadClass(ORC_RECORD_WRITER) + .getDeclaredField("writer"); + 
writerField.setAccessible(true); + Writer writer = (Writer) writerField.get(recordWriter); + Method flushStripe = WriterImpl.class.getDeclaredMethod("flushStripe"); + flushStripe.setAccessible(true); + flushStripe.invoke(writer); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException(e); + } + } + + private static RecordWriter createRecordWriter(Path target, Configuration conf) + { + try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader())) { + WriterOptions options = OrcFile.writerOptions(conf) + .memory(new NullMemoryManager()) + .compress(ZLIB); + + try { + return WRITER_CONSTRUCTOR.newInstance(target, options); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException(e); + } + } + } + + private static Constructor getOrcWriterConstructor() + { + try { + Constructor constructor = OrcOutputFormat.class.getClassLoader() + .loadClass(ORC_RECORD_WRITER) + .asSubclass(RecordWriter.class) + .getDeclaredConstructor(Path.class, WriterOptions.class); + constructor.setAccessible(true); + return constructor; + } + catch (ReflectiveOperationException e) { + throw new RuntimeException(e); + } + } + + public static class TestColumn + { + private final String name; + private final ObjectInspector objectInspector; + private final Supplier writeValue; + private final boolean partitionKey; + + public TestColumn(String name, ObjectInspector objectInspector, Supplier writeValue, boolean partitionKey) + { + this.name = requireNonNull(name, "name is null"); + this.objectInspector = requireNonNull(objectInspector, "objectInspector is null"); + this.writeValue = writeValue; + this.partitionKey = partitionKey; + } + + public String getName() + { + return name; + } + + public String getType() + { + return objectInspector.getTypeName(); + } + + public ObjectInspector getObjectInspector() + { + return objectInspector; + } + + public Object getWriteValue() + { + return writeValue.get(); + } + + public boolean isPartitionKey() + { + return partitionKey; + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder("TestColumn{"); + sb.append("name='").append(name).append('\''); + sb.append(", objectInspector=").append(objectInspector); + sb.append(", partitionKey=").append(partitionKey); + sb.append('}'); + return sb.toString(); + } + } + + public static final class GrowingTestColumn + extends TestColumn + { + private final Supplier writeValue; + private int counter; + private int step; + private int maxSize; + + public GrowingTestColumn(String name, ObjectInspector objectInspector, Supplier writeValue, boolean partitionKey, int step) + { + super(name, objectInspector, writeValue, partitionKey); + this.writeValue = writeValue; + this.counter = step; + this.step = step; + } + + @Override + public Object getWriteValue() + { + StringBuilder builder = new StringBuilder(); + String source = writeValue.get(); + for (int i = 0; i < counter / step; i++) { + builder.append(source); + } + counter++; + if (builder.length() > maxSize) { + maxSize = builder.length(); + } + return builder.toString(); + } + + public int getMaxSize() + { + return maxSize; + } + } + + @Test + public void testOrcConcatPageSourceDynamicFilterBlocked() + throws InterruptedException + { + OrcConcatPageSource orcConcatPageSource = getOrcConcatPageSource(1000); + + Page page = orcConcatPageSource.getNextPage(); + assertNull(page); + + TimeUnit.SECONDS.sleep(2); + page = orcConcatPageSource.getNextPage(); + assertNotNull(page); + } + + @Test + 
public void testOrcConcatPageSourceDynamicFilterNotBlocked() + { + OrcConcatPageSource orcConcatPageSource = getOrcConcatPageSource(0); + Page page = orcConcatPageSource.getNextPage(); + assertNotNull(page); + } + + private OrcConcatPageSource getOrcConcatPageSource(long waitTime) + { + HiveConfig config = new HiveConfig(); + FileFormatDataSourceStats stats = new FileFormatDataSourceStats(); + ConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(config, new OrcFileWriterConfig(), + new ParquetFileWriterConfig()).getSessionProperties()); + List pageSources = new ArrayList<>(); + + Supplier>> supplier = null; + DynamicFilterSupplier theSupplier = new DynamicFilterSupplier(supplier, System.currentTimeMillis(), waitTime); + + Optional dynamicFilterSupplier = Optional.of(theSupplier); + pageSources.add(testPreparer.newPageSource(stats, session, dynamicFilterSupplier)); + OrcConcatPageSource orcConcatPageSource = new OrcConcatPageSource(pageSources); + + return orcConcatPageSource; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestParquetFileWriterConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestParquetFileWriterConfig.java new file mode 100644 index 00000000..7c5540b1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestParquetFileWriterConfig.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive;
+
+import com.google.common.collect.ImmutableMap;
+import io.airlift.units.DataSize;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.testng.annotations.Test;
+
+import java.util.Map;
+
+import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping;
+import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults;
+import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults;
+import static io.airlift.units.DataSize.Unit.BYTE;
+import static io.airlift.units.DataSize.Unit.MEGABYTE;
+
+public class TestParquetFileWriterConfig
+{
+    @Test
+    public void testDefaults()
+    {
+        assertRecordedDefaults(recordDefaults(ParquetFileWriterConfig.class)
+                .setBlockSize(new DataSize(ParquetWriter.DEFAULT_BLOCK_SIZE, BYTE))
+                .setPageSize(new DataSize(ParquetWriter.DEFAULT_PAGE_SIZE, BYTE)));
+    }
+
+    @Test
+    public void testExplicitPropertyMappings()
+    {
+        Map<String, String> properties = new ImmutableMap.Builder<String, String>()
+                .put("hive.parquet.writer.block-size", "234MB")
+                .put("hive.parquet.writer.page-size", "11MB")
+                .build();
+
+        ParquetFileWriterConfig expected = new ParquetFileWriterConfig()
+                .setBlockSize(new DataSize(234, MEGABYTE))
+                .setPageSize(new DataSize(11, MEGABYTE));
+
+        assertFullMapping(properties, expected);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionOfflineException.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionOfflineException.java
new file mode 100644
index 00000000..60c64683
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionOfflineException.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive;
+
+import io.prestosql.spi.connector.SchemaTableName;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+public class TestPartitionOfflineException
+{
+    @Test
+    public void testMessage()
+    {
+        assertMessage(new SchemaTableName("schema", "table"), "pk=1", false, "", "Table 'schema.table' partition 'pk=1' is offline");
+        assertMessage(new SchemaTableName("schema", "table"), "pk=1", false, null, "Table 'schema.table' partition 'pk=1' is offline");
+        assertMessage(new SchemaTableName("schema", "table"), "pk=1", true, "", "Table 'schema.table' partition 'pk=1' is offline for Presto");
+        assertMessage(new SchemaTableName("schema", "table"), "pk=1", true, null, "Table 'schema.table' partition 'pk=1' is offline for Presto");
+        assertMessage(new SchemaTableName("schema", "table"), "pk=1", false, "offline reason", "Table 'schema.table' partition 'pk=1' is offline: offline reason");
+        assertMessage(new SchemaTableName("schema", "table"), "pk=1", true, "offline reason", "Table 'schema.table' partition 'pk=1' is offline for Presto: offline reason");
+    }
+
+    private static void assertMessage(SchemaTableName tableName, String partitionName, boolean forPresto, String offlineMessage, String expectedMessage)
+    {
+        PartitionOfflineException tableOfflineException = new PartitionOfflineException(tableName, partitionName, forPresto, offlineMessage);
+        assertEquals(tableOfflineException.getMessage(), expectedMessage);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionUpdate.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionUpdate.java
new file mode 100644
index 00000000..a3cae69e
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestPartitionUpdate.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.airlift.json.JsonCodec; +import io.prestosql.plugin.hive.PartitionUpdate.UpdateMode; +import org.apache.hadoop.fs.Path; +import org.testng.annotations.Test; + +import static io.airlift.json.JsonCodec.jsonCodec; +import static org.testng.Assert.assertEquals; + +public class TestPartitionUpdate +{ + private static final JsonCodec CODEC = jsonCodec(PartitionUpdate.class); + + @Test + public void testRoundTrip() + { + PartitionUpdate expected = new PartitionUpdate( + "test", + UpdateMode.APPEND, + "/writePath", + "/targetPath", + ImmutableList.of("file1", "file3"), + 123, + 456, + 789, + ImmutableList.of()); + + PartitionUpdate actual = CODEC.fromJson(CODEC.toJson(expected)); + + assertEquals(actual.getName(), "test"); + assertEquals(actual.getUpdateMode(), UpdateMode.APPEND); + assertEquals(actual.getWritePath(), new Path("/writePath")); + assertEquals(actual.getTargetPath(), new Path("/targetPath")); + assertEquals(actual.getFileNames(), ImmutableList.of("file1", "file3")); + assertEquals(actual.getRowCount(), 123); + assertEquals(actual.getInMemoryDataSizeInBytes(), 456); + assertEquals(actual.getOnDiskDataSizeInBytes(), 789); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectLineRecordReader.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectLineRecordReader.java new file mode 100644 index 00000000..7bd8290b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectLineRecordReader.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import org.testng.annotations.Test; + +import java.io.IOException; + +import static io.prestosql.plugin.hive.S3SelectLineRecordReader.UnrecoverableS3OperationException; + +public class TestS3SelectLineRecordReader +{ + @Test(expectedExceptions = UnrecoverableS3OperationException.class, expectedExceptionsMessageRegExp = "java.io.IOException: test io exception \\(Bucket: test-bucket, Key: test-key\\)") + public void testUnrecoverableS3ExceptionMessage() + { + throw new UnrecoverableS3OperationException("test-bucket", "test-key", new IOException("test io exception")); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectPushdown.java new file mode 100644 index 00000000..0713a245 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectPushdown.java @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.TextInputFormat; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestS3SelectPushdown +{ + private TextInputFormat inputFormat; + + @BeforeClass + public void setUp() + { + inputFormat = new TextInputFormat(); + inputFormat.configure(new JobConf()); + } + + @Test + public void testIsCompressionCodecSupported() + { + assertTrue(S3SelectPushdown.isCompressionCodecSupported(inputFormat, new Path("s3://fakeBucket/fakeObject.gz"))); + assertTrue(S3SelectPushdown.isCompressionCodecSupported(inputFormat, new Path("s3://fakeBucket/fakeObject"))); + assertFalse(S3SelectPushdown.isCompressionCodecSupported(inputFormat, new Path("s3://fakeBucket/fakeObject.lz4"))); + assertFalse(S3SelectPushdown.isCompressionCodecSupported(inputFormat, new Path("s3://fakeBucket/fakeObject.snappy"))); + assertTrue(S3SelectPushdown.isCompressionCodecSupported(inputFormat, new Path("s3://fakeBucket/fakeObject.bz2"))); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectRecordCursor.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectRecordCursor.java new file mode 100644 index 00000000..db9f822b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestS3SelectRecordCursor.java @@ -0,0 +1,240 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.type.StandardTypes; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.mapred.RecordReader; +import org.testng.annotations.Test; + +import java.util.Optional; +import java.util.Properties; +import java.util.stream.Stream; + +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static java.util.Arrays.asList; +import static java.util.stream.Collectors.joining; +import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMNS; +import static org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMN_TYPES; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_DDL; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.testng.Assert.assertEquals; + +public class TestS3SelectRecordCursor +{ + private static final String LAZY_SERDE_CLASS_NAME = LazySimpleSerDe.class.getName(); + + private static final HiveColumnHandle ARTICLE_COLUMN = new HiveColumnHandle("article", HIVE_STRING, parseTypeSignature(StandardTypes.VARCHAR), 1, REGULAR, Optional.empty()); + private static final HiveColumnHandle AUTHOR_COLUMN = new HiveColumnHandle("author", HIVE_STRING, parseTypeSignature(StandardTypes.VARCHAR), 1, REGULAR, Optional.empty()); + private static final HiveColumnHandle DATE_ARTICLE_COLUMN = new HiveColumnHandle("date_pub", HIVE_INT, parseTypeSignature(StandardTypes.DATE), 1, REGULAR, Optional.empty()); + private static final HiveColumnHandle QUANTITY_COLUMN = new HiveColumnHandle("quantity", HIVE_INT, parseTypeSignature(StandardTypes.INTEGER), 1, REGULAR, Optional.empty()); + private static final HiveColumnHandle[] DEFAULT_TEST_COLUMNS = {ARTICLE_COLUMN, AUTHOR_COLUMN, DATE_ARTICLE_COLUMN, QUANTITY_COLUMN}; + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Invalid Thrift DDL struct article \\{ \\}") + public void shouldThrowIllegalArgumentExceptionWhenSerialDDLHasNoColumns() + { + String ddlSerializationValue = "struct article { }"; + buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Thrift DDL should start with struct") + public void shouldThrowIllegalArgumentExceptionWhenSerialDDLNotStartingWithStruct() + { + String ddlSerializationValue = "foo article { varchar article varchar }"; + buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Invalid Thrift DDL struct article \\{varchar article\\}") + public void shouldThrowIllegalArgumentExceptionWhenSerialDDLNotStartingWithStruct2() + { + String ddlSerializationValue = "struct article {varchar article}"; + buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Invalid Thrift DDL struct article varchar article varchar \\}") + public void shouldThrowIllegalArgumentExceptionWhenMissingOpenStartStruct() + { + String ddlSerializationValue = "struct article varchar article varchar }"; + 
buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Invalid Thrift DDL struct article\\{varchar article varchar author date date_pub int quantity") + public void shouldThrowIllegalArgumentExceptionWhenDDlFormatNotCorrect() + { + String ddlSerializationValue = "struct article{varchar article varchar author date date_pub int quantity"; + buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Invalid Thrift DDL struct article \\{ varchar article varchar author date date_pub int quantity ") + public void shouldThrowIllegalArgumentExceptionWhenEndOfStructNotFound() + { + String ddlSerializationValue = "struct article { varchar article varchar author date date_pub int quantity "; + buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS); + } + + @Test + public void shouldFilterColumnsWhichDoesNotMatchInTheHiveTable() + { + String ddlSerializationValue = "struct article { varchar address varchar company date date_pub int quantity}"; + String expectedDDLSerialization = "struct article { date date_pub, int quantity}"; + assertEquals(buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS), + buildExpectedProperties(expectedDDLSerialization, DEFAULT_TEST_COLUMNS)); + } + + @Test + public void shouldReturnOnlyQuantityColumnInTheDDl() + { + String ddlSerializationValue = "struct article { varchar address varchar company date date_pub int quantity}"; + String expectedDDLSerialization = "struct article { int quantity}"; + assertEquals(buildSplitSchema(ddlSerializationValue, ARTICLE_COLUMN, QUANTITY_COLUMN), + buildExpectedProperties(expectedDDLSerialization, ARTICLE_COLUMN, QUANTITY_COLUMN)); + } + + @Test + public void shouldReturnProperties() + { + String ddlSerializationValue = "struct article { varchar article varchar author date date_pub int quantity}"; + String expectedDDLSerialization = "struct article { varchar article, varchar author, date date_pub, int quantity}"; + assertEquals(buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS), + buildExpectedProperties(expectedDDLSerialization, DEFAULT_TEST_COLUMNS)); + } + + @Test + public void shouldReturnPropertiesWithoutDoubleCommaInColumnsNameLastColumnNameWithEndStruct() + { + String ddlSerializationValue = "struct article { varchar article, varchar author, date date_pub, int quantity}"; + String expectedDDLSerialization = "struct article { varchar article, varchar author, date date_pub, int quantity}"; + assertEquals(buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS), + buildExpectedProperties(expectedDDLSerialization, DEFAULT_TEST_COLUMNS)); + } + + @Test + public void shouldReturnPropertiesWithoutDoubleCommaInColumnsNameLastColumnNameWithoutEndStruct() + { + String ddlSerializationValue = "struct article { varchar article, varchar author, date date_pub, int quantity }"; + String expectedDDLSerialization = "struct article { varchar article, varchar author, date date_pub, int quantity}"; + assertEquals(buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS), + buildExpectedProperties(expectedDDLSerialization, DEFAULT_TEST_COLUMNS)); + } + + @Test + public void shouldOnlyGetColumnTypeFromHiveObjectAndNotFromDDLSerialLastColumnNameWithEndStruct() + { + String ddlSerializationValue = "struct article { int article, double author, xxxx date_pub, int quantity}"; + String expectedDDLSerialization = 
"struct article { int article, double author, xxxx date_pub, int quantity}"; + assertEquals(buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS), + buildExpectedProperties(expectedDDLSerialization, DEFAULT_TEST_COLUMNS)); + } + + @Test + public void shouldOnlyGetColumnTypeFromHiveObjectAndNotFromDDLSerialLastColumnNameWithoutEndStruct() + { + String ddlSerializationValue = "struct article { int article, double author, xxxx date_pub, int quantity }"; + String expectedDDLSerialization = "struct article { int article, double author, xxxx date_pub, int quantity}"; + assertEquals(buildSplitSchema(ddlSerializationValue, DEFAULT_TEST_COLUMNS), + buildExpectedProperties(expectedDDLSerialization, DEFAULT_TEST_COLUMNS)); + } + + @Test(expectedExceptions = NullPointerException.class) + public void shouldThrowNullPointerExceptionWhenColumnsIsNull() + { + S3SelectRecordCursor.updateSplitSchema(new Properties(), null); + } + + @Test(expectedExceptions = NullPointerException.class) + public void shouldThrowNullPointerExceptionWhenSchemaIsNull() + { + S3SelectRecordCursor.updateSplitSchema(null, ImmutableList.of()); + } + + private Properties buildSplitSchema(String ddlSerializationValue, HiveColumnHandle... columns) + { + Properties properties = new Properties(); + properties.put(SERIALIZATION_LIB, LAZY_SERDE_CLASS_NAME); + properties.put(SERIALIZATION_DDL, ddlSerializationValue); + return S3SelectRecordCursor.updateSplitSchema(properties, asList(columns)); + } + + private Properties buildExpectedProperties(String expectedDDLSerialization, HiveColumnHandle... expectedColumns) + { + String expectedColumnsType = getTypes(expectedColumns); + String expectedColumnsName = getName(expectedColumns); + Properties propExpected = new Properties(); + propExpected.put(LIST_COLUMNS, expectedColumnsName); + propExpected.put(SERIALIZATION_LIB, LAZY_SERDE_CLASS_NAME); + propExpected.put(SERIALIZATION_DDL, expectedDDLSerialization); + propExpected.put(LIST_COLUMN_TYPES, expectedColumnsType); + return propExpected; + } + + private String getName(HiveColumnHandle[] expectedColumns) + { + return Stream.of(expectedColumns) + .map(HiveColumnHandle::getName) + .collect(joining(",")); + } + + private String getTypes(HiveColumnHandle[] expectedColumns) + { + return Stream.of(expectedColumns) + .map(HiveColumnHandle::getHiveType) + .map(HiveType::getTypeInfo) + .map(TypeInfo::getTypeName) + .collect(joining(",")); + } + + private static final RecordReader MOCK_RECORD_READER = new RecordReader() + { + @Override + public boolean next(Object key, Object value) + { + throw new UnsupportedOperationException(); + } + + @Override + public Object createKey() + { + throw new UnsupportedOperationException(); + } + + @Override + public Object createValue() + { + throw new UnsupportedOperationException(); + } + + @Override + public long getPos() + { + throw new UnsupportedOperationException(); + } + + @Override + public void close() + { + throw new UnsupportedOperationException(); + } + + @Override + public float getProgress() + { + throw new UnsupportedOperationException(); + } + }; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestShowStats.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestShowStats.java new file mode 100644 index 00000000..ecdef683 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestShowStats.java @@ -0,0 +1,184 @@ +/* + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive; + +import com.google.common.collect.ImmutableList; +import io.prestosql.tests.AbstractTestQueryFramework; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +public class TestShowStats + extends AbstractTestQueryFramework +{ + public TestShowStats() + { + super(() -> HiveQueryRunner.createQueryRunner(ImmutableList.of())); + } + + @BeforeClass + public void setUp() + { + assertUpdate("CREATE TABLE nation_partitioned(nationkey BIGINT, name VARCHAR, comment VARCHAR, regionkey BIGINT) WITH (partitioned_by = ARRAY['regionkey'])"); + assertUpdate("INSERT INTO nation_partitioned SELECT nationkey, name, comment, regionkey from tpch.tiny.nation", 25); + assertUpdate("CREATE TABLE region(regionkey BIGINT, name VARCHAR, comment VARCHAR)"); + assertUpdate("CREATE TABLE orders(orderkey BIGINT, custkey BIGINT, totalprice DOUBLE, orderdate DATE, " + + "orderpriority VARCHAR, clerk VARCHAR, shippriority VARCHAR, comment VARCHAR, orderstatus VARCHAR)"); + } + + @Test + public void testShowStats() + { + assertQuery("SHOW STATS FOR nation_partitioned", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 5.0, 0.0, null, 0, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 24), " + + " ('name', 177.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1857.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 25.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 5.0, 0.0, null, 0, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 24), " + + " ('name', 177.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1857.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 25.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey IS NOT NULL)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 5.0, 0.0, null, 0, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 24), " + + " ('name', 177.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1857.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 25.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey IS NULL)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 0.0, 0.0, null, null, null), " + + " ('nationkey', null, 0.0, 0.0, null, null, null), " + + " ('name', 0.0, 0.0, 0.0, null, null, null), " + + " ('comment', 0.0, 0.0, 0.0, null, null, null), " + + " (null, null, null, null, 0.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey = 1)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 1.0, 0.0, null, 1, 1), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 38.0, 5.0, 0.0, null, null, null), " + + " ('comment', 500.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 5.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE 
regionkey IN (1, 3))", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 2.0, 0.0, null, 1, 3), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 78.0, 5.0, 0.0, null, null, null), " + + " ('comment', 847.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 10.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey BETWEEN 1 AND 1 + 2)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 3.0, 0.0, null, 1, 3), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 109.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1199.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 15.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey > 3)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 1.0, 0.0, null, 4, 4), " + + " ('nationkey', null, 5.0, 0.0, null, 4, 20), " + + " ('name', 31.0, 5.0, 0.0, null, null, null), " + + " ('comment', 348.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 5.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey < 1)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 1.0, 0.0, null, 0, 0), " + + " ('nationkey', null, 5.0, 0.0, null, 0, 16), " + + " ('name', 37.0, 5.0, 0.0, null, null, null), " + + " ('comment', 310.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 5.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey > 0 and regionkey < 4)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 3.0, 0.0, null, 1, 3), " + + " ('nationkey', null, 5.0, 0.0, null, 1, 24), " + + " ('name', 109.0, 5.0, 0.0, null, null, null), " + + " ('comment', 1199.0, 5.0, 0.0, null, null, null), " + + " (null, null, null, null, 15.0, null, null))"); + + assertQuery("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey > 10 or regionkey < 0)", + "SELECT * FROM (VALUES " + + " ('regionkey', null, 0.0, 0.0, null, null, null), " + + " ('nationkey', null, 0.0, 0.0, null, null, null), " + + " ('name', 0.0, 0.0, 0.0, null, null, null), " + + " ('comment', 0.0, 0.0, 0.0, null, null, null), " + + " (null, null, null, null, 0.0, null, null))"); + } + + @Test + public void testShowStatsWithoutFromFails() + { + assertQueryFails("SHOW STATS FOR (SELECT 1)", ".*There must be exactly one table in query passed to SHOW STATS SELECT clause"); + } + + @Test + public void testShowStatsWithMultipleFromFails() + { + assertQueryFails("SHOW STATS FOR (SELECT * FROM nation_partitioned, region)", ".*There must be exactly one table in query passed to SHOW STATS SELECT clause"); + } + + @Test + public void testShowStatsWithGroupByFails() + { + assertQueryFails("SHOW STATS FOR (SELECT avg(totalprice) FROM orders GROUP BY orderkey)", ".*GROUP BY is not supported in SHOW STATS SELECT clause"); + } + + @Test + public void testShowStatsWithHavingFails() + { + assertQueryFails("SHOW STATS FOR (SELECT count(nationkey) FROM nation_partitioned GROUP BY regionkey HAVING regionkey > 0)", ".*HAVING is not supported in SHOW STATS SELECT clause"); + } + + @Test + public void testShowStatsSelectNonStarFails() + { + assertQueryFails("SHOW STATS FOR (SELECT orderkey FROM orders)", ".*Only SELECT \\* is supported in SHOW STATS SELECT clause"); + assertQueryFails("SHOW STATS FOR (SELECT orderkey, custkey FROM orders)", ".*Only SELECT \\* is supported in SHOW STATS SELECT clause"); + assertQueryFails("SHOW STATS FOR (SELECT *, * FROM 
orders)", ".*Only SELECT \\* is supported in SHOW STATS SELECT clause"); + } + + @Test + public void testShowStatsWithSelectDistinctFails() + { + assertQueryFails("SHOW STATS FOR (SELECT DISTINCT * FROM orders)", ".*DISTINCT is not supported by SHOW STATS SELECT clause"); + } + + @Test + public void testShowStatsWithSelectFunctionCallFails() + { + assertQueryFails("SHOW STATS FOR (SELECT sin(orderkey) FROM orders)", ".*Only SELECT \\* is supported in SHOW STATS SELECT clause"); + assertQueryFails("SHOW STATS FOR (SELECT count(*) FROM orders)", ".*Only SELECT \\* is supported in SHOW STATS SELECT clause"); + } + + @Test + public void testShowStatsWithNonPushDownFilterFails() + { + assertQueryFails("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey + 100 < 200)", ".*Only predicates that can be pushed down are supported in the SHOW STATS WHERE clause"); + assertQueryFails("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE regionkey > 0 and nationkey > 0)", ".*Only predicates that can be pushed down are supported in the SHOW STATS WHERE clause"); + assertQueryFails("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE nationkey = 1 and name is not null)", ".*Only predicates that can be pushed down are supported in the SHOW STATS WHERE clause"); + assertQueryFails("SHOW STATS FOR (SELECT * FROM nation_partitioned WHERE sin(regionkey) > 0)", ".*Only predicates that can be pushed down are supported in the SHOW STATS WHERE clause"); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestTableOfflineException.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestTableOfflineException.java new file mode 100644 index 00000000..0a0dfa42 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/TestTableOfflineException.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive;
+
+import io.prestosql.spi.connector.SchemaTableName;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+public class TestTableOfflineException
+{
+    @Test
+    public void testMessage()
+    {
+        assertMessage(new SchemaTableName("schema", "table"), false, "", "Table 'schema.table' is offline");
+        assertMessage(new SchemaTableName("schema", "table"), false, null, "Table 'schema.table' is offline");
+        assertMessage(new SchemaTableName("schema", "table"), true, "", "Table 'schema.table' is offline for Presto");
+        assertMessage(new SchemaTableName("schema", "table"), true, null, "Table 'schema.table' is offline for Presto");
+        assertMessage(new SchemaTableName("schema", "table"), false, "offline reason", "Table 'schema.table' is offline: offline reason");
+        assertMessage(new SchemaTableName("schema", "table"), true, "offline reason", "Table 'schema.table' is offline for Presto: offline reason");
+    }
+
+    private static void assertMessage(SchemaTableName tableName, boolean forPresto, String offlineMessage, String expectedMessage)
+    {
+        TableOfflineException tableOfflineException = new TableOfflineException(tableName, forPresto, offlineMessage);
+        assertEquals(tableOfflineException.getMessage(), expectedMessage);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestHdfsKerberosConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestHdfsKerberosConfig.java
new file mode 100644
index 00000000..2f1ed6aa
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestHdfsKerberosConfig.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.authentication;
+
+import com.google.common.collect.ImmutableMap;
+import io.airlift.configuration.testing.ConfigAssertions;
+import org.testng.annotations.Test;
+
+import java.util.Map;
+
+public class TestHdfsKerberosConfig
+{
+    @Test
+    public void testExplicitPropertyMappings()
+    {
+        Map<String, String> properties = new ImmutableMap.Builder<String, String>()
+                .put("hive.hdfs.presto.principal", "presto@EXAMPLE.COM")
+                .put("hive.hdfs.presto.keytab", "/tmp/presto.keytab")
+                .build();
+
+        HdfsKerberosConfig expected = new HdfsKerberosConfig()
+                .setHdfsPrestoPrincipal("presto@EXAMPLE.COM")
+                .setHdfsPrestoKeytab("/tmp/presto.keytab");
+
+        ConfigAssertions.assertFullMapping(properties, expected);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestMetastoreKerberosConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestMetastoreKerberosConfig.java
new file mode 100644
index 00000000..197e1962
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/authentication/TestMetastoreKerberosConfig.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.authentication;
+
+import com.google.common.collect.ImmutableMap;
+import io.airlift.configuration.testing.ConfigAssertions;
+import org.testng.annotations.Test;
+
+import java.util.Map;
+
+public class TestMetastoreKerberosConfig
+{
+    @Test
+    public void testExplicitPropertyMappings()
+    {
+        Map<String, String> properties = new ImmutableMap.Builder<String, String>()
+                .put("hive.metastore.service.principal", "hive/_HOST@EXAMPLE.COM")
+                .put("hive.metastore.client.principal", "metastore@EXAMPLE.COM")
+                .put("hive.metastore.client.keytab", "/tmp/metastore.keytab")
+                .put("hive.metastore.krb5.conf.path", "/tmp/krb5.conf")
+                .build();
+
+        MetastoreKerberosConfig expected = new MetastoreKerberosConfig()
+                .setHiveMetastoreServicePrincipal("hive/_HOST@EXAMPLE.COM")
+                .setHiveMetastoreClientPrincipal("metastore@EXAMPLE.COM")
+                .setHiveMetastoreClientKeytab("/tmp/metastore.keytab")
+                .setHiveMetastoreKrb5("/tmp/krb5.conf");
+
+        ConfigAssertions.assertFullMapping(properties, expected);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/DynamicFilterBenchmark.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/DynamicFilterBenchmark.java
new file mode 100644
index 00000000..fe06a5bf
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/DynamicFilterBenchmark.java
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.benchmark; + +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HivePageSource; +import io.prestosql.plugin.hive.HivePartitionKey; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.LongArrayBlockBuilder; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.dynamicfilter.BloomFilterDynamicFilter; +import io.prestosql.spi.dynamicfilter.DynamicFilter; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.util.BloomFilter; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; +import org.openjdk.jmh.runner.options.VerboseMode; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveUtil.isPartitionFiltered; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.StandardTypes.INTEGER; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; + +@State(Scope.Benchmark) +@OutputTimeUnit(TimeUnit.SECONDS) +@Fork(3) +@Warmup(iterations = 20, time = 500, timeUnit = TimeUnit.MILLISECONDS) +@Measurement(iterations = 20, time = 500, timeUnit = TimeUnit.MILLISECONDS) +public class DynamicFilterBenchmark +{ + @State(Scope.Benchmark) + public static class BenchmarkData + { + private Map dynamicFilters; + private Page page; + private List partitions; + private Map eligibleColumns = new HashMap<>(); + + public BenchmarkData() + { + final int numValues = 1024; + BlockBuilder builder = new LongArrayBlockBuilder(null, numValues); + for (int i = 0; i < numValues; i++) { + builder.writeLong(i); + } + page = new Page(builder.build(), builder.build()); + + dynamicFilters = new HashMap<>(); + ColumnHandle dayColumn = new HiveColumnHandle("pt_d", HIVE_INT, parseTypeSignature(INTEGER), 0, REGULAR, Optional.empty()); + ColumnHandle appColumn = new HiveColumnHandle("app_d", HIVE_INT, parseTypeSignature(INTEGER), 1, PARTITION_KEY, Optional.empty()); + + BloomFilter dayFilter = new BloomFilter(1024 * 1024, 0.01); + for (int i = 0; i < 10; i++) { + dayFilter.add(i); + } + BloomFilter appFilter = new BloomFilter(1024 * 1024, 0.01); + for (int i = 1023; i > 1013; i--) { + appFilter.add(i); + } + + 
dynamicFilters.put(dayColumn, new BloomFilterDynamicFilter("1", dayColumn, dayFilter, DynamicFilter.Type.GLOBAL)); + dynamicFilters.put(appColumn, new BloomFilterDynamicFilter("2", appColumn, appFilter, DynamicFilter.Type.GLOBAL)); + + eligibleColumns.put(0, dayColumn); + eligibleColumns.put(1, appColumn); + + partitions = new ArrayList<>(); + partitions.add(new HivePartitionKey("app_id", "10000")); + } + + public Map getDynamicFilters() + { + return dynamicFilters; + } + + public Page getPage() + { + return page; + } + + public List getPartitions() + { + return partitions; + } + + public List> getEligibleColumns() + { + return ImmutableList.of(eligibleColumns); + } + } + + @Benchmark + public void testFilterRows(BenchmarkData data) + { + List> dynamicFilters = new ArrayList<>(); + dynamicFilters.add(data.getDynamicFilters()); + Page filteredPage = HivePageSource.filter(dynamicFilters, data.getPage(), data.getEligibleColumns(), new Type[] {BIGINT, BIGINT}); + } + + @Benchmark + public void testIsPartitionFiltered(BenchmarkData data) + { + isPartitionFiltered(data.getPartitions(), ImmutableList.of(new HashSet<>(data.getDynamicFilters().values())), null); + } + + public static void main(String[] args) + throws RunnerException + { + Options options = new OptionsBuilder() + .verbosity(VerboseMode.NORMAL) + .include(".*" + io.prestosql.plugin.hive.benchmark.DynamicFilterBenchmark.class.getSimpleName() + ".*") + .build(); + + new Runner(options).run(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FileFormat.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FileFormat.java new file mode 100644 index 00000000..ded606e5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FileFormat.java @@ -0,0 +1,493 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.benchmark; + +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.OutputStreamSliceOutput; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.orc.OrcWriter; +import io.prestosql.orc.OrcWriterOptions; +import io.prestosql.orc.OrcWriterStats; +import io.prestosql.orc.OutputStreamOrcDataSink; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.GenericHiveRecordCursorProvider; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveCompressionCodec; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HivePageSourceFactory; +import io.prestosql.plugin.hive.HiveRecordCursorProvider; +import io.prestosql.plugin.hive.HiveStorageFormat; +import io.prestosql.plugin.hive.HiveTestUtils; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.HiveTypeName; +import io.prestosql.plugin.hive.HiveTypeTranslator; +import io.prestosql.plugin.hive.RecordFileWriter; +import io.prestosql.plugin.hive.TypeTranslator; +import io.prestosql.plugin.hive.benchmark.HiveFileFormatBenchmark.TestData; +import io.prestosql.plugin.hive.orc.OrcPageSourceFactory; +import io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory; +import io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory; +import io.prestosql.rcfile.AircompressorCodecFactory; +import io.prestosql.rcfile.HadoopCodecFactory; +import io.prestosql.rcfile.RcFileEncoding; +import io.prestosql.rcfile.RcFileWriter; +import io.prestosql.rcfile.binary.BinaryRcFileEncoding; +import io.prestosql.rcfile.text.TextRcFileEncoding; +import io.prestosql.spi.Page; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.connector.RecordPageSource; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Properties; + +import static io.prestosql.orc.OrcWriteValidation.OrcWriteValidationMode.BOTH; +import static io.prestosql.plugin.hive.HdfsConfigurationInitializer.configureCompression; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveTestUtils.TYPE_MANAGER; +import static io.prestosql.plugin.hive.HiveType.toHiveType; +import static io.prestosql.plugin.hive.metastore.StorageFormat.fromHiveStorageFormat; +import static java.util.stream.Collectors.joining; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMNS; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.joda.time.DateTimeZone.UTC; + +public enum FileFormat +{ + PRESTO_RCBINARY { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + 
HivePageSourceFactory pageSourceFactory = new RcFilePageSourceFactory(TYPE_MANAGER, hdfsEnvironment, new FileFormatDataSourceStats(), new HiveConfig().setRcfileTimeZone("UTC")); + return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCBINARY); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + throws IOException + { + return new PrestoRcFileFormatWriter( + targetFile, + columnTypes, + new BinaryRcFileEncoding(UTC), + compressionCodec); + } + }, + PRESTO_RCTEXT { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + HivePageSourceFactory pageSourceFactory = new RcFilePageSourceFactory(TYPE_MANAGER, hdfsEnvironment, new FileFormatDataSourceStats(), new HiveConfig().setRcfileTimeZone("UTC")); + return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCTEXT); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + throws IOException + { + return new PrestoRcFileFormatWriter( + targetFile, + columnTypes, + new TextRcFileEncoding(), + compressionCodec); + } + }, + + PRESTO_ORC { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + HivePageSourceFactory pageSourceFactory = new OrcPageSourceFactory(HiveTestUtils.TYPE_MANAGER, new HiveConfig().setUseOrcColumnNames(false), hdfsEnvironment, new FileFormatDataSourceStats(), OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled())); + return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.ORC); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + throws IOException + { + return new PrestoOrcFormatWriter( + targetFile, + columnNames, + columnTypes, + compressionCodec); + } + }, + + PRESTO_PARQUET { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setParquetTimeZone("UTC"); + HivePageSourceFactory pageSourceFactory = new ParquetPageSourceFactory(TYPE_MANAGER, hdfsEnvironment, new FileFormatDataSourceStats(), hiveConfig); + return createPageSource(pageSourceFactory, session, targetFile, 
columnNames, columnTypes, HiveStorageFormat.PARQUET); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + { + return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.PARQUET, session); + } + }, + + HIVE_RCBINARY { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + HiveRecordCursorProvider cursorProvider = new GenericHiveRecordCursorProvider(hdfsEnvironment); + return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCBINARY); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + { + return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.RCBINARY, session); + } + }, + + HIVE_RCTEXT { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + HiveRecordCursorProvider cursorProvider = new GenericHiveRecordCursorProvider(hdfsEnvironment); + return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.RCTEXT); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + { + return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.RCTEXT, session); + } + }, + + HIVE_ORC { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + HiveRecordCursorProvider cursorProvider = new GenericHiveRecordCursorProvider(hdfsEnvironment); + return createPageSource(cursorProvider, session, targetFile, columnNames, columnTypes, HiveStorageFormat.ORC); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + { + return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.ORC, session); + } + }, + + HIVE_PARQUET { + @Override + public ConnectorPageSource createFileFormatReader(ConnectorSession session, HdfsEnvironment hdfsEnvironment, File targetFile, List columnNames, List columnTypes) + { + HiveConfig hiveConfig = new HiveConfig(); + hiveConfig.setParquetTimeZone("UTC"); + HivePageSourceFactory pageSourceFactory = new ParquetPageSourceFactory(TYPE_MANAGER, hdfsEnvironment, new FileFormatDataSourceStats(), hiveConfig); + return createPageSource(pageSourceFactory, session, targetFile, columnNames, columnTypes, HiveStorageFormat.PARQUET); + } + + @Override + public FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + { + return new RecordFormatWriter(targetFile, columnNames, columnTypes, compressionCodec, HiveStorageFormat.PARQUET, session); + } + }; + + public boolean supportsDate() + { + return true; + } + + public 
abstract ConnectorPageSource createFileFormatReader( + ConnectorSession session, + HdfsEnvironment hdfsEnvironment, + File targetFile, + List columnNames, + List columnTypes); + + public abstract FormatWriter createFileFormatWriter( + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec) + throws IOException; + + private static final JobConf conf; + + static { + conf = new JobConf(new Configuration(false)); + conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"); + } + + public boolean supports(TestData testData) + { + return true; + } + + private static ConnectorPageSource createPageSource(HiveRecordCursorProvider cursorProvider, ConnectorSession session, File targetFile, List columnNames, List columnTypes, HiveStorageFormat format) + { + List columnHandles = new ArrayList<>(columnNames.size()); + TypeTranslator typeTranslator = new HiveTypeTranslator(); + for (int i = 0; i < columnNames.size(); i++) { + String columnName = columnNames.get(i); + Type columnType = columnTypes.get(i); + columnHandles.add(new HiveColumnHandle(columnName, toHiveType(typeTranslator, columnType), columnType.getTypeSignature(), i, REGULAR, Optional.empty())); + } + + RecordCursor recordCursor = cursorProvider + .createRecordCursor( + conf, + session, + new Path(targetFile.getAbsolutePath()), + 0, + targetFile.length(), + targetFile.length(), + createSchema(format, columnNames, columnTypes), + columnHandles, + TupleDomain.all(), + TYPE_MANAGER, + false, + ImmutableMap.of()) + .get(); + return new RecordPageSource(columnTypes, recordCursor); + } + + private static ConnectorPageSource createPageSource( + HivePageSourceFactory pageSourceFactory, + ConnectorSession session, + File targetFile, + List columnNames, + List columnTypes, + HiveStorageFormat format) + { + List columnHandles = new ArrayList<>(columnNames.size()); + TypeTranslator typeTranslator = new HiveTypeTranslator(); + for (int i = 0; i < columnNames.size(); i++) { + String columnName = columnNames.get(i); + Type columnType = columnTypes.get(i); + columnHandles.add(new HiveColumnHandle(columnName, toHiveType(typeTranslator, columnType), + columnType.getTypeSignature(), i, REGULAR, Optional.empty())); + } + + return pageSourceFactory + .createPageSource( + conf, + session, + new Path(targetFile.getAbsolutePath()), + 0, + targetFile.length(), + targetFile.length(), + createSchema(format, columnNames, columnTypes), + columnHandles, + TupleDomain.all(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + null, + false, + targetFile.lastModified()) + .get(); + } + + private static class RecordFormatWriter + implements FormatWriter + { + private final RecordFileWriter recordWriter; + + public RecordFormatWriter(File targetFile, + List columnNames, + List columnTypes, + HiveCompressionCodec compressionCodec, + HiveStorageFormat format, + ConnectorSession session) + { + JobConf config = new JobConf(conf); + configureCompression(config, compressionCodec); + + recordWriter = new RecordFileWriter( + new Path(targetFile.toURI()), + columnNames, + fromHiveStorageFormat(format), + createSchema(format, columnNames, columnTypes), + format.getEstimatedWriterSystemMemoryUsage(), + config, + TYPE_MANAGER, + UTC, + session); + } + + @Override + public void writePage(Page page) + { + for (int position = 0; position < page.getPositionCount(); position++) { + recordWriter.appendRow(page, position); + } + } + + @Override + public void close() + { + 
recordWriter.commit(); + } + } + + private static Properties createSchema(HiveStorageFormat format, List columnNames, List columnTypes) + { + Properties schema = new Properties(); + TypeTranslator typeTranslator = new HiveTypeTranslator(); + schema.setProperty(SERIALIZATION_LIB, format.getSerDe()); + schema.setProperty(FILE_INPUT_FORMAT, format.getInputFormat()); + schema.setProperty(META_TABLE_COLUMNS, columnNames.stream() + .collect(joining(","))); + schema.setProperty(META_TABLE_COLUMN_TYPES, columnTypes.stream() + .map(type -> toHiveType(typeTranslator, type)) + .map(HiveType::getHiveTypeName) + .map(HiveTypeName::toString) + .collect(joining(":"))); + return schema; + } + + private static class PrestoRcFileFormatWriter + implements FormatWriter + { + private final RcFileWriter writer; + + public PrestoRcFileFormatWriter(File targetFile, List types, RcFileEncoding encoding, HiveCompressionCodec compressionCodec) + throws IOException + { + writer = new RcFileWriter( + new OutputStreamSliceOutput(new FileOutputStream(targetFile)), + types, + encoding, + compressionCodec.getCodec().map(Class::getName), + new AircompressorCodecFactory(new HadoopCodecFactory(getClass().getClassLoader())), + ImmutableMap.of(), + true); + } + + @Override + public void writePage(Page page) + throws IOException + { + writer.write(page); + } + + @Override + public void close() + throws IOException + { + writer.close(); + } + } + + private static class PrestoOrcFormatWriter + implements FormatWriter + { + private final OrcWriter writer; + + public PrestoOrcFormatWriter(File targetFile, List columnNames, List types, HiveCompressionCodec compressionCodec) + throws IOException + { + writer = new OrcWriter( + new OutputStreamOrcDataSink(new FileOutputStream(targetFile)), + columnNames, + types, + compressionCodec.getOrcCompressionKind(), + new OrcWriterOptions(), + false, + ImmutableMap.of(), + false, + BOTH, + new OrcWriterStats(), Optional.empty(), Optional.empty()); + } + + @Override + public void writePage(Page page) + throws IOException + { + writer.write(page); + } + + @Override + public void close() + throws IOException + { + writer.close(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FormatWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FormatWriter.java new file mode 100644 index 00000000..23e7a25f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/FormatWriter.java @@ -0,0 +1,26 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.benchmark; + +import io.prestosql.spi.Page; + +import java.io.Closeable; +import java.io.IOException; + +public interface FormatWriter + extends Closeable +{ + void writePage(Page page) + throws IOException; +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/HiveFileFormatBenchmark.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/HiveFileFormatBenchmark.java new file mode 100644 index 00000000..e2ba89e1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/HiveFileFormatBenchmark.java @@ -0,0 +1,638 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.benchmark; + +import com.google.common.collect.ImmutableList; +import io.airlift.slice.Slices; +import io.airlift.tpch.OrderColumn; +import io.airlift.tpch.TpchColumn; +import io.airlift.tpch.TpchEntity; +import io.airlift.tpch.TpchTable; +import io.airlift.units.DataSize; +import io.prestosql.hadoop.HadoopNative; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveCompressionCodec; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveTestUtils; +import io.prestosql.plugin.hive.OrcFileWriterConfig; +import io.prestosql.plugin.hive.ParquetFileWriterConfig; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.Type; +import io.prestosql.testing.TestingConnectorSession; +import it.unimi.dsi.fastutil.ints.IntArrays; +import org.openjdk.jmh.annotations.AuxCounters; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.results.RunResult; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; +import org.openjdk.jmh.util.Statistics; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; 
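+// A minimal sketch of running one benchmark combination in isolation: the parameter names mirror
+// the @Param fields declared in this class, and OptionsBuilder/Runner are the same JMH classes
+// already used by main() at the bottom of this file. The chosen values here are illustrative only.
+//
+//     Options opt = new OptionsBuilder()
+//             .include(HiveFileFormatBenchmark.class.getSimpleName())
+//             .param("dataSet", "LINEITEM")
+//             .param("compression", "SNAPPY")
+//             .param("fileFormat", "PRESTO_ORC")
+//             .build();
+//     new Runner(opt).run();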
+import static io.airlift.tpch.TpchTable.LINE_ITEM; +import static io.airlift.tpch.TpchTable.ORDERS; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static java.lang.String.format; +import static java.nio.file.Files.createTempDirectory; +import static java.util.stream.Collectors.toList; + +@State(Scope.Thread) +@OutputTimeUnit(TimeUnit.SECONDS) +@Measurement(iterations = 50) +@Warmup(iterations = 20) +@Fork(3) +@SuppressWarnings("UseOfSystemOutOrSystemErr") +public class HiveFileFormatBenchmark +{ + private static final long MIN_DATA_SIZE = new DataSize(50, MEGABYTE).toBytes(); + + static { + HadoopNative.requireHadoopNative(); + } + + private static final HiveConfig CONFIG = new HiveConfig(); + + private static final ConnectorSession SESSION = new TestingConnectorSession(new HiveSessionProperties(CONFIG, new OrcFileWriterConfig(), new ParquetFileWriterConfig()) + .getSessionProperties()); + + private static final HdfsEnvironment HDFS_ENVIRONMENT = HiveTestUtils.createTestHdfsEnvironment(CONFIG); + + @Param({ + "LINEITEM", + "BIGINT_SEQUENTIAL", + "BIGINT_RANDOM", + "VARCHAR_SMALL", + "VARCHAR_LARGE", + "VARCHAR_DICTIONARY", + "MAP_VARCHAR_DOUBLE", + "LARGE_MAP_VARCHAR_DOUBLE", + "MAP_INT_DOUBLE", + "LARGE_MAP_INT_DOUBLE", + "LARGE_ARRAY_VARCHAR"}) + private DataSet dataSet; + + @Param({ + "NONE", + "SNAPPY", + "GZIP"}) + private HiveCompressionCodec compression; + + @Param({ + "PRESTO_RCBINARY", + "PRESTO_RCTEXT", + "PRESTO_ORC", + "PRESTO_PARQUET", + "HIVE_RCBINARY", + "HIVE_RCTEXT", + "HIVE_ORC", + "HIVE_PARQUET"}) + private FileFormat fileFormat; + + private TestData data; + private File dataFile; + + private final File targetDir = createTempDir("presto-benchmark"); + + public HiveFileFormatBenchmark() + { + } + + public HiveFileFormatBenchmark(DataSet dataSet, HiveCompressionCodec compression, FileFormat fileFormat) + { + this.dataSet = dataSet; + this.compression = compression; + this.fileFormat = fileFormat; + } + + @Setup + public void setup() + throws IOException + { + data = dataSet.createTestData(fileFormat); + + targetDir.mkdirs(); + dataFile = new File(targetDir, UUID.randomUUID().toString()); + writeData(dataFile); + } + + @TearDown + public void tearDown() + throws IOException + { + deleteRecursively(targetDir.toPath(), ALLOW_INSECURE); + } + + @SuppressWarnings("PublicField") + @AuxCounters + @State(Scope.Thread) + public static class CompressionCounter + { + public long inputSize; + public long outputSize; + } + + @Benchmark + public List read(CompressionCounter counter) + throws IOException + { + if (!fileFormat.supports(data)) { + throw new RuntimeException(fileFormat + " does not support data set " + dataSet); + } + List pages = new ArrayList<>(100); + try (ConnectorPageSource pageSource = fileFormat.createFileFormatReader( + SESSION, + HDFS_ENVIRONMENT, + dataFile, + data.getColumnNames(), + data.getColumnTypes())) { + while (!pageSource.isFinished()) { + Page page = pageSource.getNextPage(); + if (page != null) { + pages.add(page.getLoadedPage()); + } + } + } + counter.inputSize += data.getSize(); + counter.outputSize += dataFile.length(); + return pages; + } + + @Benchmark + public File write(CompressionCounter counter) + throws IOException + { + File 
targetFile = new File(targetDir, UUID.randomUUID().toString()); + writeData(targetFile); + counter.inputSize += data.getSize(); + counter.outputSize += targetFile.length(); + return targetFile; + } + + private void writeData(File targetFile) + throws IOException + { + List inputPages = data.getPages(); + try (FormatWriter formatWriter = fileFormat.createFileFormatWriter( + SESSION, + targetFile, + data.getColumnNames(), + data.getColumnTypes(), + compression)) { + for (Page page : inputPages) { + formatWriter.writePage(page); + } + } + } + + public enum DataSet + { + LINEITEM { + @Override + public TestData createTestData(FileFormat format) + { + return createTpchDataSet(format, LINE_ITEM, LINE_ITEM.getColumns()); + } + }, + BIGINT_SEQUENTIAL { + @Override + public TestData createTestData(FileFormat format) + { + return createTpchDataSet(format, ORDERS, OrderColumn.ORDER_KEY); + } + }, + BIGINT_RANDOM { + @Override + public TestData createTestData(FileFormat format) + { + return createTpchDataSet(format, ORDERS, OrderColumn.CUSTOMER_KEY); + } + }, + VARCHAR_SMALL { + @Override + public TestData createTestData(FileFormat format) + { + return createTpchDataSet(format, ORDERS, OrderColumn.CLERK); + } + }, + VARCHAR_LARGE { + @Override + public TestData createTestData(FileFormat format) + { + return createTpchDataSet(format, ORDERS, OrderColumn.CLERK); + } + }, + VARCHAR_DICTIONARY { + @Override + public TestData createTestData(FileFormat format) + { + return createTpchDataSet(format, ORDERS, OrderColumn.ORDER_PRIORITY); + } + }, + MAP_VARCHAR_DOUBLE { + private static final int MIN_ENTRIES = 1; + private static final int MAX_ENTRIES = 5; + + @Override + public TestData createTestData(FileFormat format) + { + Type type = HiveTestUtils.mapType(createUnboundedVarcharType(), DOUBLE); + Random random = new Random(1234); + + PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type)); + ImmutableList.Builder pages = ImmutableList.builder(); + + int[] keys = new int[] {1, 2, 3, 4, 5}; + + long dataSize = 0; + while (dataSize < MIN_DATA_SIZE) { + pageBuilder.declarePosition(); + + BlockBuilder builder = pageBuilder.getBlockBuilder(0); + BlockBuilder mapBuilder = builder.beginBlockEntry(); + int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES); + IntArrays.shuffle(keys, random); + for (int entryId = 0; entryId < entries; entryId++) { + createUnboundedVarcharType().writeSlice(mapBuilder, Slices.utf8Slice("key" + keys[entryId])); + DOUBLE.writeDouble(mapBuilder, random.nextDouble()); + } + builder.closeEntry(); + + if (pageBuilder.isFull()) { + Page page = pageBuilder.build(); + pages.add(page); + pageBuilder.reset(); + dataSize += page.getSizeInBytes(); + } + } + return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build()); + } + }, + LARGE_MAP_VARCHAR_DOUBLE { + private static final int MIN_ENTRIES = 5_000; + private static final int MAX_ENTRIES = 15_000; + + @Override + public TestData createTestData(FileFormat format) + { + Type type = HiveTestUtils.mapType(createUnboundedVarcharType(), DOUBLE); + Random random = new Random(1234); + + PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type)); + ImmutableList.Builder pages = ImmutableList.builder(); + long dataSize = 0; + while (dataSize < MIN_DATA_SIZE) { + pageBuilder.declarePosition(); + + BlockBuilder builder = pageBuilder.getBlockBuilder(0); + BlockBuilder mapBuilder = builder.beginBlockEntry(); + int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES); + for (int entryId = 0; entryId < 
entries; entryId++) { + createUnboundedVarcharType().writeSlice(mapBuilder, Slices.utf8Slice("key" + random.nextInt(10_000_000))); + DOUBLE.writeDouble(mapBuilder, random.nextDouble()); + } + builder.closeEntry(); + + if (pageBuilder.isFull()) { + Page page = pageBuilder.build(); + pages.add(page); + pageBuilder.reset(); + dataSize += page.getSizeInBytes(); + } + } + return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build()); + } + }, + MAP_INT_DOUBLE { + private static final int MIN_ENTRIES = 1; + private static final int MAX_ENTRIES = 5; + + @Override + public TestData createTestData(FileFormat format) + { + Type type = HiveTestUtils.mapType(INTEGER, DOUBLE); + Random random = new Random(1234); + + PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type)); + ImmutableList.Builder pages = ImmutableList.builder(); + + int[] keys = new int[] {1, 2, 3, 4, 5}; + + long dataSize = 0; + while (dataSize < MIN_DATA_SIZE) { + pageBuilder.declarePosition(); + + BlockBuilder builder = pageBuilder.getBlockBuilder(0); + BlockBuilder mapBuilder = builder.beginBlockEntry(); + int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES); + IntArrays.shuffle(keys, random); + for (int entryId = 0; entryId < entries; entryId++) { + INTEGER.writeLong(mapBuilder, keys[entryId]); + DOUBLE.writeDouble(mapBuilder, random.nextDouble()); + } + builder.closeEntry(); + + if (pageBuilder.isFull()) { + Page page = pageBuilder.build(); + pages.add(page); + pageBuilder.reset(); + dataSize += page.getSizeInBytes(); + } + } + return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build()); + } + }, + LARGE_MAP_INT_DOUBLE { + private static final int MIN_ENTRIES = 5_000; + private static final int MAX_ENTRIES = 15_0000; + + @Override + public TestData createTestData(FileFormat format) + { + Type type = HiveTestUtils.mapType(INTEGER, DOUBLE); + Random random = new Random(1234); + + PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type)); + ImmutableList.Builder pages = ImmutableList.builder(); + long dataSize = 0; + while (dataSize < MIN_DATA_SIZE) { + pageBuilder.declarePosition(); + + BlockBuilder builder = pageBuilder.getBlockBuilder(0); + BlockBuilder mapBuilder = builder.beginBlockEntry(); + int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES); + for (int entryId = 0; entryId < entries; entryId++) { + INTEGER.writeLong(mapBuilder, random.nextInt(10_000_000)); + DOUBLE.writeDouble(mapBuilder, random.nextDouble()); + } + builder.closeEntry(); + + if (pageBuilder.isFull()) { + Page page = pageBuilder.build(); + pages.add(page); + pageBuilder.reset(); + dataSize += page.getSizeInBytes(); + } + } + return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build()); + } + }, + LARGE_ARRAY_VARCHAR { + private static final int MIN_ENTRIES = 5_000; + private static final int MAX_ENTRIES = 15_0000; + + @Override + public TestData createTestData(FileFormat format) + { + Type type = new ArrayType(createUnboundedVarcharType()); + Random random = new Random(1234); + + PageBuilder pageBuilder = new PageBuilder(ImmutableList.of(type)); + ImmutableList.Builder pages = ImmutableList.builder(); + long dataSize = 0; + while (dataSize < MIN_DATA_SIZE) { + pageBuilder.declarePosition(); + + BlockBuilder builder = pageBuilder.getBlockBuilder(0); + BlockBuilder mapBuilder = builder.beginBlockEntry(); + int entries = nextRandomBetween(random, MIN_ENTRIES, MAX_ENTRIES); + for (int entryId = 0; entryId < entries; entryId++) { + 
createUnboundedVarcharType().writeSlice(mapBuilder, Slices.utf8Slice("key" + random.nextInt(10_000_000))); + } + builder.closeEntry(); + + if (pageBuilder.isFull()) { + Page page = pageBuilder.build(); + pages.add(page); + pageBuilder.reset(); + dataSize += page.getSizeInBytes(); + } + } + return new TestData(ImmutableList.of("map"), ImmutableList.of(type), pages.build()); + } + }; + + public abstract TestData createTestData(FileFormat format); + } + + @SafeVarargs + private static TestData createTpchDataSet(FileFormat format, TpchTable tpchTable, TpchColumn... columns) + { + return createTpchDataSet(format, tpchTable, ImmutableList.copyOf(columns)); + } + + private static TestData createTpchDataSet(FileFormat format, TpchTable tpchTable, List> columns) + { + List columnNames = columns.stream().map(TpchColumn::getColumnName).collect(toList()); + List columnTypes = columns.stream().map(HiveFileFormatBenchmark::getColumnType) + .map(type -> format.supportsDate() || !DATE.equals(type) ? type : createUnboundedVarcharType()) + .collect(toList()); + + PageBuilder pageBuilder = new PageBuilder(columnTypes); + ImmutableList.Builder pages = ImmutableList.builder(); + long dataSize = 0; + for (E row : tpchTable.createGenerator(10, 1, 1)) { + pageBuilder.declarePosition(); + for (int i = 0; i < columns.size(); i++) { + TpchColumn column = columns.get(i); + BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(i); + switch (column.getType().getBase()) { + case IDENTIFIER: + BIGINT.writeLong(blockBuilder, column.getIdentifier(row)); + break; + case INTEGER: + INTEGER.writeLong(blockBuilder, column.getInteger(row)); + break; + case DATE: + if (format.supportsDate()) { + DATE.writeLong(blockBuilder, column.getDate(row)); + } + else { + createUnboundedVarcharType().writeString(blockBuilder, column.getString(row)); + } + break; + case DOUBLE: + DOUBLE.writeDouble(blockBuilder, column.getDouble(row)); + break; + case VARCHAR: + createUnboundedVarcharType().writeSlice(blockBuilder, Slices.utf8Slice(column.getString(row))); + break; + default: + throw new IllegalArgumentException("Unsupported type " + column.getType()); + } + } + if (pageBuilder.isFull()) { + Page page = pageBuilder.build(); + pages.add(page); + pageBuilder.reset(); + dataSize += page.getSizeInBytes(); + + if (dataSize >= MIN_DATA_SIZE) { + break; + } + } + } + return new TestData(columnNames, columnTypes, pages.build()); + } + + static class TestData + { + private final List columnNames; + private final List columnTypes; + + private final List pages; + + private final int size; + + public TestData(List columnNames, List columnTypes, List pages) + { + this.columnNames = ImmutableList.copyOf(columnNames); + this.columnTypes = ImmutableList.copyOf(columnTypes); + this.pages = ImmutableList.copyOf(pages); + this.size = (int) pages.stream().mapToLong(Page::getSizeInBytes).sum(); + } + + public List getColumnNames() + { + return columnNames; + } + + public List getColumnTypes() + { + return columnTypes; + } + + public List getPages() + { + return pages; + } + + public int getSize() + { + return size; + } + } + + private static Type getColumnType(TpchColumn input) + { + switch (input.getType().getBase()) { + case IDENTIFIER: + return BIGINT; + case INTEGER: + return INTEGER; + case DATE: + return DATE; + case DOUBLE: + return DOUBLE; + case VARCHAR: + return createUnboundedVarcharType(); + } + throw new IllegalArgumentException("Unsupported type " + input.getType()); + } + + public static void main(String[] args) + throws Exception + { + Options 
opt = new OptionsBuilder() + .include(".*\\." + HiveFileFormatBenchmark.class.getSimpleName() + ".*") + .jvmArgsAppend("-Xmx4g", "-Xms4g", "-XX:+UseG1GC") + .build(); + + Collection results = new Runner(opt).run(); + + for (RunResult result : results) { + Statistics inputSizeStats = result.getSecondaryResults().get("inputSize").getStatistics(); + Statistics outputSizeStats = result.getSecondaryResults().get("outputSize").getStatistics(); + double compressionRatio = 1.0 * inputSizeStats.getSum() / outputSizeStats.getSum(); + String compression = result.getParams().getParam("compression"); + String fileFormat = result.getParams().getParam("fileFormat"); + String dataSet = result.getParams().getParam("dataSet"); + System.out.printf(" %-10s %-30s %-10s %-25s %2.2f %10s ± %11s (%5.2f%%) (N = %d, \u03B1 = 99.9%%)\n", + result.getPrimaryResult().getLabel(), + dataSet, + compression, + fileFormat, + compressionRatio, + toHumanReadableSpeed((long) inputSizeStats.getMean()), + toHumanReadableSpeed((long) inputSizeStats.getMeanErrorAt(0.999)), + inputSizeStats.getMeanErrorAt(0.999) * 100 / inputSizeStats.getMean(), + inputSizeStats.getN()); + } + System.out.println(); + } + + private static String toHumanReadableSpeed(long bytesPerSecond) + { + String humanReadableSpeed; + if (bytesPerSecond < 1024 * 10L) { + humanReadableSpeed = format("%dB/s", bytesPerSecond); + } + else if (bytesPerSecond < 1024 * 1024 * 10L) { + humanReadableSpeed = format("%.1fkB/s", bytesPerSecond / 1024.0f); + } + else if (bytesPerSecond < 1024 * 1024 * 1024 * 10L) { + humanReadableSpeed = format("%.1fMB/s", bytesPerSecond / (1024.0f * 1024.0f)); + } + else { + humanReadableSpeed = format("%.1fGB/s", bytesPerSecond / (1024.0f * 1024.0f * 1024.0f)); + } + return humanReadableSpeed; + } + + private static int nextRandomBetween(Random random, int min, int max) + { + return min + random.nextInt(max - min); + } + + @SuppressWarnings("SameParameterValue") + private static File createTempDir(String prefix) + { + try { + return createTempDirectory(prefix).toFile(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/TestHiveFileFormatBenchmark.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/TestHiveFileFormatBenchmark.java new file mode 100644 index 00000000..50770843 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/benchmark/TestHiveFileFormatBenchmark.java @@ -0,0 +1,72 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.benchmark; + +import io.prestosql.plugin.hive.HiveCompressionCodec; +import org.testng.annotations.Test; + +import java.io.IOException; + +public class TestHiveFileFormatBenchmark +{ + @Test + public void testSomeFormats() + throws Exception + { + executeBenchmark(HiveFileFormatBenchmark.DataSet.LINEITEM, HiveCompressionCodec.SNAPPY, FileFormat.PRESTO_RCBINARY); + executeBenchmark(HiveFileFormatBenchmark.DataSet.LINEITEM, HiveCompressionCodec.SNAPPY, FileFormat.PRESTO_ORC); + executeBenchmark(HiveFileFormatBenchmark.DataSet.LINEITEM, HiveCompressionCodec.SNAPPY, FileFormat.HIVE_RCBINARY); + executeBenchmark(HiveFileFormatBenchmark.DataSet.MAP_VARCHAR_DOUBLE, HiveCompressionCodec.SNAPPY, FileFormat.PRESTO_RCBINARY); + executeBenchmark(HiveFileFormatBenchmark.DataSet.MAP_VARCHAR_DOUBLE, HiveCompressionCodec.SNAPPY, FileFormat.PRESTO_ORC); + executeBenchmark(HiveFileFormatBenchmark.DataSet.MAP_VARCHAR_DOUBLE, HiveCompressionCodec.SNAPPY, FileFormat.HIVE_RCBINARY); + executeBenchmark(HiveFileFormatBenchmark.DataSet.LARGE_MAP_VARCHAR_DOUBLE, HiveCompressionCodec.SNAPPY, FileFormat.PRESTO_RCBINARY); + executeBenchmark(HiveFileFormatBenchmark.DataSet.LARGE_MAP_VARCHAR_DOUBLE, HiveCompressionCodec.SNAPPY, FileFormat.PRESTO_ORC); + executeBenchmark(HiveFileFormatBenchmark.DataSet.LARGE_MAP_VARCHAR_DOUBLE, HiveCompressionCodec.SNAPPY, FileFormat.HIVE_RCBINARY); + } + + @Test + public void testAllCompression() + throws Exception + { + for (HiveCompressionCodec codec : HiveCompressionCodec.values()) { + executeBenchmark(HiveFileFormatBenchmark.DataSet.LINEITEM, codec, FileFormat.PRESTO_RCBINARY); + } + } + + @Test + public void testAllDataSets() + throws Exception + { + for (HiveFileFormatBenchmark.DataSet dataSet : HiveFileFormatBenchmark.DataSet.values()) { + executeBenchmark(dataSet, HiveCompressionCodec.SNAPPY, FileFormat.PRESTO_RCBINARY); + } + } + + private static void executeBenchmark(HiveFileFormatBenchmark.DataSet dataSet, HiveCompressionCodec codec, FileFormat format) + throws IOException + { + HiveFileFormatBenchmark benchmark = new HiveFileFormatBenchmark(dataSet, codec, format); + try { + benchmark.setup(); + benchmark.read(new HiveFileFormatBenchmark.CompressionCounter()); + benchmark.write(new HiveFileFormatBenchmark.CompressionCounter()); + } + catch (Exception e) { + throw new RuntimeException("Failed " + dataSet + " " + codec + " " + format, e); + } + finally { + benchmark.tearDown(); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/gcs/TestHiveGcsConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/gcs/TestHiveGcsConfig.java new file mode 100644 index 00000000..ea011586 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/gcs/TestHiveGcsConfig.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.gcs; + +import com.google.common.collect.ImmutableMap; +import org.testng.annotations.Test; + +import java.util.Map; + +import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; +import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; +import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; + +public class TestHiveGcsConfig +{ + @Test + public void testDefaults() + { + assertRecordedDefaults(recordDefaults(HiveGcsConfig.class) + .setJsonKeyFilePath(null) + .setUseGcsAccessToken(false)); + } + + @Test + public void testExplicitPropertyMappings() + { + Map properties = new ImmutableMap.Builder() + .put("hive.gcs.json-key-file-path", "/tmp/key.json") + .put("hive.gcs.use-access-token", "true") + .build(); + + HiveGcsConfig expected = new HiveGcsConfig() + .setJsonKeyFilePath("/tmp/key.json") + .setUseGcsAccessToken(true); + + assertFullMapping(properties, expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestCachingHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestCachingHiveMetastore.java new file mode 100644 index 00000000..babe3d44 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestCachingHiveMetastore.java @@ -0,0 +1,283 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.util.concurrent.ListeningExecutorService; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.thrift.BridgingHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.MetastoreLocator; +import io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastoreConfig; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreClient; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreStats; +import io.prestosql.spi.connector.TableNotFoundException; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +import static com.google.common.util.concurrent.MoreExecutors.listeningDecorator; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static io.prestosql.testing.TestingConnectorSession.SESSION; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; + +@Test(singleThreaded = true) +public class TestCachingHiveMetastore +{ + private static final HiveIdentity IDENTITY = new HiveIdentity(SESSION); + + private MockThriftMetastoreClient mockClient; + private CachingHiveMetastore metastore; + private ThriftMetastoreStats stats; + + @BeforeMethod + public void setUp() + { + mockClient = new MockThriftMetastoreClient(); + MetastoreLocator metastoreLocator = new MockMetastoreLocator(mockClient); + ListeningExecutorService executor = listeningDecorator(newCachedThreadPool(daemonThreadsNamed("test-%s"))); + ListeningExecutorService executorRefresh = listeningDecorator(newCachedThreadPool(daemonThreadsNamed("test-%s"))); + ThriftHiveMetastore thriftHiveMetastore = new ThriftHiveMetastore(metastoreLocator, new ThriftHiveMetastoreConfig()); + metastore = new CachingHiveMetastore( + new BridgingHiveMetastore(thriftHiveMetastore), + executor, + executorRefresh, new Duration(6, TimeUnit.MINUTES), + new Duration(6, TimeUnit.MINUTES), + new Duration(5, TimeUnit.MINUTES), + new Duration(1, TimeUnit.MINUTES), + 1000, + false); + stats = thriftHiveMetastore.getStats(); + } + + @Test + public void testGetAllDatabases() + { + assertEquals(mockClient.getAccessCount(), 0); + assertEquals(metastore.getAllDatabases(), ImmutableList.of(MockThriftMetastoreClient.TEST_DATABASE)); + assertEquals(mockClient.getAccessCount(), 1); + assertEquals(metastore.getAllDatabases(), ImmutableList.of(MockThriftMetastoreClient.TEST_DATABASE)); + assertEquals(mockClient.getAccessCount(), 1); + + metastore.flushCache(); + + assertEquals(metastore.getAllDatabases(), ImmutableList.of(MockThriftMetastoreClient.TEST_DATABASE)); + assertEquals(mockClient.getAccessCount(), 2); + } + + @Test + public void testGetAllTable() + { + assertEquals(mockClient.getAccessCount(), 0); + assertEquals(metastore.getAllTables(MockThriftMetastoreClient.TEST_DATABASE).get(), ImmutableList.of(MockThriftMetastoreClient.TEST_TABLE)); + assertEquals(mockClient.getAccessCount(), 1); + 
assertEquals(metastore.getAllTables(MockThriftMetastoreClient.TEST_DATABASE).get(), ImmutableList.of(MockThriftMetastoreClient.TEST_TABLE)); + assertEquals(mockClient.getAccessCount(), 1); + + metastore.flushCache(); + + assertEquals(metastore.getAllTables(MockThriftMetastoreClient.TEST_DATABASE).get(), ImmutableList.of(MockThriftMetastoreClient.TEST_TABLE)); + assertEquals(mockClient.getAccessCount(), 2); + } + + @Test + public void testInvalidDbGetAllTAbles() + { + assertFalse(metastore.getAllTables(MockThriftMetastoreClient.BAD_DATABASE).isPresent()); + } + + @Test + public void testGetTable() + { + assertEquals(mockClient.getAccessCount(), 0); + assertNotNull(metastore.getTable(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE)); + assertEquals(mockClient.getAccessCount(), 1); + assertNotNull(metastore.getTable(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE)); + assertEquals(mockClient.getAccessCount(), 1); + + metastore.flushCache(); + + assertNotNull(metastore.getTable(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE)); + assertEquals(mockClient.getAccessCount(), 2); + } + + @Test + public void testInvalidDbGetTable() + { + assertFalse(metastore.getTable(IDENTITY, MockThriftMetastoreClient.BAD_DATABASE, MockThriftMetastoreClient.TEST_TABLE).isPresent()); + + assertEquals(stats.getGetTable().getThriftExceptions().getTotalCount(), 0); + assertEquals(stats.getGetTable().getTotalFailures().getTotalCount(), 0); + } + + @Test + public void testGetPartitionNames() + { + ImmutableList expectedPartitions = ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1, MockThriftMetastoreClient.TEST_PARTITION2); + assertEquals(mockClient.getAccessCount(), 0); + assertEquals(metastore.getPartitionNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE).get(), expectedPartitions); + assertEquals(mockClient.getAccessCount(), 1); + assertEquals(metastore.getPartitionNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE).get(), expectedPartitions); + assertEquals(mockClient.getAccessCount(), 1); + + metastore.flushCache(); + + assertEquals(metastore.getPartitionNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE).get(), expectedPartitions); + assertEquals(mockClient.getAccessCount(), 2); + } + + @Test + public void testInvalidGetPartitionNames() + { + assertEquals(metastore.getPartitionNames(IDENTITY, MockThriftMetastoreClient.BAD_DATABASE, MockThriftMetastoreClient.TEST_TABLE).get(), ImmutableList.of()); + } + + @Test + public void testGetPartitionNamesByParts() + { + ImmutableList parts = ImmutableList.of(); + ImmutableList expectedPartitions = ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1, MockThriftMetastoreClient.TEST_PARTITION2); + + assertEquals(mockClient.getAccessCount(), 0); + assertEquals(metastore.getPartitionNamesByParts(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, parts).get(), expectedPartitions); + assertEquals(mockClient.getAccessCount(), 3); + assertEquals(metastore.getPartitionNamesByParts(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, parts).get(), expectedPartitions); + assertEquals(mockClient.getAccessCount(), 3); + + metastore.flushCache(); + + assertEquals(metastore.getPartitionNamesByParts(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, 
MockThriftMetastoreClient.TEST_TABLE, parts).get(), expectedPartitions); + assertEquals(mockClient.getAccessCount(), 6); + } + + @Test + public void testInvalidGetPartitionNamesByParts() + { + ImmutableList parts = ImmutableList.of(); + assertFalse(metastore.getPartitionNamesByParts(IDENTITY, MockThriftMetastoreClient.BAD_DATABASE, MockThriftMetastoreClient.TEST_TABLE, parts).isPresent()); + } + + @Test + public void testGetPartitionsByNames() + { + assertEquals(mockClient.getAccessCount(), 0); + metastore.getTable(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE); + assertEquals(mockClient.getAccessCount(), 1); + + // Select half of the available partitions and load them into the cache + assertEquals(metastore.getPartitionsByNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1)).size(), 1); + assertEquals(mockClient.getAccessCount(), 4); + + // Now select all of the partitions + assertEquals(metastore.getPartitionsByNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1, MockThriftMetastoreClient.TEST_PARTITION2)).size(), 2); + // There should be one more access to fetch the remaining partition + assertEquals(mockClient.getAccessCount(), 6); + + // Now if we fetch any or both of them, they should not hit the client + assertEquals(metastore.getPartitionsByNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1)).size(), 1); + assertEquals(metastore.getPartitionsByNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION2)).size(), 1); + assertEquals(metastore.getPartitionsByNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1, MockThriftMetastoreClient.TEST_PARTITION2)).size(), 2); + assertEquals(mockClient.getAccessCount(), 6); + + metastore.flushCache(); + + // Fetching both should only result in one batched access + assertEquals(metastore.getPartitionsByNames(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE, ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1, MockThriftMetastoreClient.TEST_PARTITION2)).size(), 2); + assertEquals(mockClient.getAccessCount(), 10); + } + + @Test + public void testListRoles() + throws Exception + { + assertEquals(mockClient.getAccessCount(), 0); + + assertEquals(metastore.listRoles(), MockThriftMetastoreClient.TEST_ROLES); + assertEquals(mockClient.getAccessCount(), 1); + + assertEquals(metastore.listRoles(), MockThriftMetastoreClient.TEST_ROLES); + assertEquals(mockClient.getAccessCount(), 1); + + metastore.flushCache(); + + assertEquals(metastore.listRoles(), MockThriftMetastoreClient.TEST_ROLES); + assertEquals(mockClient.getAccessCount(), 2); + + metastore.createRole("role", "grantor"); + + assertEquals(metastore.listRoles(), MockThriftMetastoreClient.TEST_ROLES); + assertEquals(mockClient.getAccessCount(), 3); + + metastore.dropRole("testrole"); + + assertEquals(metastore.listRoles(), MockThriftMetastoreClient.TEST_ROLES); + assertEquals(mockClient.getAccessCount(), 4); + } + + @Test(expectedExceptions = { TableNotFoundException.class }) + public void 
testInvalidGetPartitionsByNames() + { + Map> partitionsByNames = metastore.getPartitionsByNames(IDENTITY, MockThriftMetastoreClient.BAD_DATABASE, MockThriftMetastoreClient.TEST_TABLE, ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION1)); + assertEquals(partitionsByNames.size(), 1); + Optional onlyElement = Iterables.getOnlyElement(partitionsByNames.values()); + assertFalse(onlyElement.isPresent()); + } + + @Test + public void testNoCacheExceptions() + { + // Throw exceptions on usage + mockClient.setThrowException(true); + try { + metastore.getAllDatabases(); + } + catch (RuntimeException ignored) { + } + assertEquals(mockClient.getAccessCount(), 1); + + // Second try should hit the client again + try { + metastore.getAllDatabases(); + } + catch (RuntimeException ignored) { + } + assertEquals(mockClient.getAccessCount(), 2); + } + + private static class MockMetastoreLocator + implements MetastoreLocator + { + private final ThriftMetastoreClient client; + + private MockMetastoreLocator(ThriftMetastoreClient client) + { + this.client = client; + } + + @Override + public ThriftMetastoreClient createMetastoreClient() + { + return client; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreConfig.java new file mode 100644 index 00000000..6ea5958b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreConfig.java @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.collect.ImmutableMap; +import org.testng.annotations.Test; + +import java.util.Map; + +import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; +import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; +import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; + +public class TestMetastoreConfig +{ + @Test + public void testDefaults() + { + assertRecordedDefaults(recordDefaults(MetastoreConfig.class) + .setMetastoreType("thrift") + .setThriftMetastoreImp("") + .setMetastoreClientFactoryImp("")); + } + + @Test + public void testExplicitPropertyMappings() + { + Map properties = new ImmutableMap.Builder() + .put("hive.metastore", "foo") + .put("hive.metastore.client-factory-imp", "packageName.testClient") + .put("hive.metastore.thrift-imp", "packageName.testThrift") + .build(); + + MetastoreConfig expected = new MetastoreConfig() + .setMetastoreType("foo") + .setMetastoreClientFactoryImp("packageName.testClient") + .setThriftMetastoreImp("packageName.testThrift"); + + assertFullMapping(properties, expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreUtil.java new file mode 100644 index 00000000..05c110d9 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestMetastoreUtil.java @@ -0,0 +1,173 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Order; +import org.apache.hadoop.hive.metastore.api.PrincipalPrivilegeSet; +import org.apache.hadoop.hive.metastore.api.SerDeInfo; +import org.apache.hadoop.hive.metastore.api.SkewedInfo; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Properties; + +import static io.prestosql.plugin.hive.metastore.MetastoreUtil.META_PARTITION_COLUMNS; +import static org.apache.hadoop.hive.serde.serdeConstants.COLUMN_NAME_DELIMITER; +import static org.testng.Assert.assertEquals; + +public class TestMetastoreUtil +{ + private static final List TEST_SCHEMA = ImmutableList.of( + new FieldSchema("col1", "bigint", "comment1"), + new FieldSchema("col2", "binary", null), + new FieldSchema("col3", "string", null)); + private static final StorageDescriptor TEST_STORAGE_DESCRIPTOR = new StorageDescriptor( + TEST_SCHEMA, + "hdfs://VOL1:9000/db_name/table_name", + "com.facebook.hive.orc.OrcInputFormat", + "com.facebook.hive.orc.OrcOutputFormat", + false, + 100, + new SerDeInfo("table_name", "com.facebook.hive.orc.OrcSerde", ImmutableMap.of("sdk1", "sdv1", "sdk2", "sdv2")), + ImmutableList.of("col2", "col3"), + ImmutableList.of(new Order("col2", 1)), + ImmutableMap.of()); + private static final org.apache.hadoop.hive.metastore.api.Table TEST_TABLE = new org.apache.hadoop.hive.metastore.api.Table( + "table_name", + "db_name", + "owner_name", + 0, + 0, + 0, + TEST_STORAGE_DESCRIPTOR, + ImmutableList.of( + new FieldSchema("pk1", "string", "comment pk1"), + new FieldSchema("pk2", "string", null)), + ImmutableMap.of("k1", "v1", "k2", "v2", "k3", "v3"), + "view original text", + "view extended text", + "MANAGED_TABLE"); + + static { + TEST_TABLE.setPrivileges(new PrincipalPrivilegeSet(ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of())); + } + + private static final org.apache.hadoop.hive.metastore.api.Partition TEST_PARTITION = new org.apache.hadoop.hive.metastore.api.Partition( + ImmutableList.of("pk1v", "pk2v"), + "db_name", + "table_name", + 0, + 0, + TEST_STORAGE_DESCRIPTOR, + ImmutableMap.of("k1", "v1", "k2", "v2", "k3", "v3")); + private static final StorageDescriptor TEST_STORAGE_DESCRIPTOR_WITH_UNSUPPORTED_FIELDS = new StorageDescriptor( + TEST_SCHEMA, + "hdfs://VOL1:9000/db_name/table_name", + "com.facebook.hive.orc.OrcInputFormat", + "com.facebook.hive.orc.OrcOutputFormat", + false, + 100, + new SerDeInfo("table_name", "com.facebook.hive.orc.OrcSerde", ImmutableMap.of("sdk1", "sdv1", "sdk2", "sdv2")), + ImmutableList.of("col2", "col3"), + ImmutableList.of(new Order("col2", 0), new Order("col3", 1)), + ImmutableMap.of("sk1", "sv1")); + private static final org.apache.hadoop.hive.metastore.api.Table TEST_TABLE_WITH_UNSUPPORTED_FIELDS = new org.apache.hadoop.hive.metastore.api.Table( + "table_name", + "db_name", + "owner_name", + 1234567890, + 1234567891, + 34, + TEST_STORAGE_DESCRIPTOR_WITH_UNSUPPORTED_FIELDS, + ImmutableList.of( + new FieldSchema("pk1", "string", "comment pk1"), + new FieldSchema("pk2", "string", null)), + ImmutableMap.of("k1", "v1", "k2", "v2", "k3", "v3"), + "view 
original text", + "view extended text", + "MANAGED_TABLE"); + private static final org.apache.hadoop.hive.metastore.api.Partition TEST_PARTITION_WITH_UNSUPPORTED_FIELDS = new org.apache.hadoop.hive.metastore.api.Partition( + ImmutableList.of("pk1v", "pk2v"), + "db_name", + "table_name", + 1234567892, + 1234567893, + TEST_STORAGE_DESCRIPTOR_WITH_UNSUPPORTED_FIELDS, + ImmutableMap.of("k1", "v1", "k2", "v2", "k3", "v3")); + + static { + TEST_STORAGE_DESCRIPTOR_WITH_UNSUPPORTED_FIELDS.setSkewedInfo(new SkewedInfo( + ImmutableList.of("col1"), + ImmutableList.of(ImmutableList.of("val1")), + ImmutableMap.of(ImmutableList.of("val1"), "loc1"))); + } + + @Test + public void testTableRoundTrip() + { + Table table = ThriftMetastoreUtil.fromMetastoreApiTable(TEST_TABLE, TEST_SCHEMA); + PrincipalPrivileges privileges = new PrincipalPrivileges(ImmutableMultimap.of(), ImmutableMultimap.of()); + org.apache.hadoop.hive.metastore.api.Table metastoreApiTable = ThriftMetastoreUtil.toMetastoreApiTable(table, privileges); + assertEquals(metastoreApiTable, TEST_TABLE); + } + + @Test + public void testPartitionRoundTrip() + { + Partition partition = ThriftMetastoreUtil.fromMetastoreApiPartition(TEST_PARTITION); + org.apache.hadoop.hive.metastore.api.Partition metastoreApiPartition = ThriftMetastoreUtil.toMetastoreApiPartition(partition); + assertEquals(metastoreApiPartition, TEST_PARTITION); + } + + @Test + public void testHiveSchemaTable() + { + Properties expected = MetaStoreUtils.getTableMetadata(TEST_TABLE_WITH_UNSUPPORTED_FIELDS); + expected.remove(COLUMN_NAME_DELIMITER); + Properties actual = MetastoreUtil.getHiveSchema(ThriftMetastoreUtil.fromMetastoreApiTable(TEST_TABLE_WITH_UNSUPPORTED_FIELDS, TEST_SCHEMA)); + actual.remove(META_PARTITION_COLUMNS); + assertEquals(actual, expected); + } + + @Test + public void testHiveSchemaPartition() + { + Properties expected = MetaStoreUtils.getPartitionMetadata(TEST_PARTITION_WITH_UNSUPPORTED_FIELDS, TEST_TABLE_WITH_UNSUPPORTED_FIELDS); + expected.remove(COLUMN_NAME_DELIMITER); + Properties actual = MetastoreUtil.getHiveSchema(ThriftMetastoreUtil.fromMetastoreApiPartition(TEST_PARTITION_WITH_UNSUPPORTED_FIELDS), ThriftMetastoreUtil.fromMetastoreApiTable(TEST_TABLE_WITH_UNSUPPORTED_FIELDS, TEST_SCHEMA)); + actual.remove(META_PARTITION_COLUMNS); + assertEquals(actual, expected); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Writing to skewed table/partition is not supported") + public void testTableRoundTripUnsupported() + { + Table table = ThriftMetastoreUtil.fromMetastoreApiTable(TEST_TABLE_WITH_UNSUPPORTED_FIELDS, TEST_SCHEMA); + ThriftMetastoreUtil.toMetastoreApiTable(table, null); + } + + @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Writing to skewed table/partition is not supported") + public void testPartitionRoundTripUnsupported() + { + Partition partition = ThriftMetastoreUtil.fromMetastoreApiPartition(TEST_PARTITION_WITH_UNSUPPORTED_FIELDS); + ThriftMetastoreUtil.toMetastoreApiPartition(partition); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestPrincipalPrivileges.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestPrincipalPrivileges.java new file mode 100644 index 00000000..d5e70434 --- /dev/null +++ 
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestPrincipalPrivileges.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.collect.ImmutableSet; +import io.prestosql.spi.security.PrincipalType; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; + +public class TestPrincipalPrivileges +{ + @Test + public void testGetTablePrincipalPrivileges() + { + PrincipalPrivileges principalPrivileges = PrincipalPrivileges.fromHivePrivilegeInfos(ImmutableSet.of( + hivePrivilegeInfo(PrincipalType.USER, "user001"), + hivePrivilegeInfo(PrincipalType.USER, "user002"), + hivePrivilegeInfo(PrincipalType.ROLE, "role001"))); + + assertNotNull(principalPrivileges); + assertEquals(principalPrivileges.getUserPrivileges().size(), 2); + assertEquals(principalPrivileges.getRolePrivileges().size(), 1); + } + + private static HivePrivilegeInfo hivePrivilegeInfo(PrincipalType type, String key) + { + return new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.SELECT, false, new HivePrincipal(type, key), new HivePrincipal(type, key)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestRecordingHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestRecordingHiveMetastore.java new file mode 100644 index 00000000..56217962 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestRecordingHiveMetastore.java @@ -0,0 +1,291 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveBucketProperty; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.spi.security.PrestoPrincipal; +import io.prestosql.spi.security.PrincipalType; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import static io.prestosql.plugin.hive.HiveBasicStatistics.createEmptyStatistics; +import static io.prestosql.plugin.hive.HiveBucketing.BucketingVersion.BUCKETING_V1; +import static io.prestosql.spi.security.PrincipalType.USER; +import static io.prestosql.spi.statistics.ColumnStatisticType.MAX_VALUE; +import static io.prestosql.spi.statistics.ColumnStatisticType.MIN_VALUE; +import static io.prestosql.spi.type.VarcharType.createVarcharType; +import static io.prestosql.testing.TestingConnectorSession.SESSION; +import static org.testng.Assert.assertEquals; + +public class TestRecordingHiveMetastore +{ + private static final Database DATABASE = new Database( + "database", + Optional.of("location"), + "owner", + USER, + Optional.of("comment"), + ImmutableMap.of("param", "value")); + private static final Column TABLE_COLUMN = new Column( + "column", + HiveType.HIVE_INT, + Optional.of("comment")); + private static final Storage TABLE_STORAGE = new Storage( + StorageFormat.create("serde", "input", "output"), + "location", + Optional.of(new HiveBucketProperty(ImmutableList.of("column"), BUCKETING_V1, 10, ImmutableList.of(new SortingColumn("column", SortingColumn.Order.ASCENDING)))), + true, + ImmutableMap.of("param", "value2")); + private static final Table TABLE = new Table( + "database", + "table", + "owner", + "table_type", + TABLE_STORAGE, + ImmutableList.of(TABLE_COLUMN), + ImmutableList.of(TABLE_COLUMN), + ImmutableMap.of("param", "value3"), + Optional.of("original_text"), + Optional.of("expanded_text")); + private static final Partition PARTITION = new Partition( + "database", + "table", + ImmutableList.of("value"), + TABLE_STORAGE, + ImmutableList.of(TABLE_COLUMN), + ImmutableMap.of("param", "value4")); + private static final PartitionStatistics PARTITION_STATISTICS = new PartitionStatistics( + new HiveBasicStatistics(10, 11, 10000, 10001), + ImmutableMap.of("column", new HiveColumnStatistics( + Optional.of(new IntegerStatistics( + OptionalLong.of(-100), + OptionalLong.of(102))), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + OptionalLong.of(1234), + OptionalLong.of(1235), + OptionalLong.of(1), + OptionalLong.of(8)))); + private static final HivePrivilegeInfo PRIVILEGE_INFO = new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.SELECT, true, new HivePrincipal(PrincipalType.USER, "grantor"), new HivePrincipal(PrincipalType.USER, "grantee")); + private static final RoleGrant ROLE_GRANT = new RoleGrant(new PrestoPrincipal(USER, 
"grantee"), "role", true); + private static final HiveIdentity HIVE_CONTEXT = new HiveIdentity(SESSION); + + @Test + public void testRecordingHiveMetastore() + throws IOException + { + HiveConfig recordingHiveConfig = new HiveConfig() + .setRecordingPath(File.createTempFile("recording_test", "json").getAbsolutePath()) + .setRecordingDuration(new Duration(10, TimeUnit.MINUTES)); + + RecordingHiveMetastore recordingHiveMetastore = new RecordingHiveMetastore(new TestingHiveMetastore(), recordingHiveConfig); + validateMetadata(recordingHiveMetastore); + recordingHiveMetastore.dropDatabase(HIVE_CONTEXT, "other_database"); + recordingHiveMetastore.writeRecording(); + + HiveConfig replayingHiveConfig = recordingHiveConfig + .setReplay(true); + + recordingHiveMetastore = new RecordingHiveMetastore(new UnimplementedHiveMetastore(), replayingHiveConfig); + recordingHiveMetastore.loadRecording(); + validateMetadata(recordingHiveMetastore); + } + + private void validateMetadata(HiveMetastore hiveMetastore) + { + assertEquals(hiveMetastore.getDatabase("database"), Optional.of(DATABASE)); + assertEquals(hiveMetastore.getAllDatabases(), ImmutableList.of("database")); + assertEquals(hiveMetastore.getTable(HIVE_CONTEXT, "database", "table"), Optional.of(TABLE)); + assertEquals(hiveMetastore.getSupportedColumnStatistics(createVarcharType(123)), ImmutableSet.of(MIN_VALUE, MAX_VALUE)); + assertEquals(hiveMetastore.getTableStatistics(HIVE_CONTEXT, TABLE), PARTITION_STATISTICS); + assertEquals(hiveMetastore.getPartitionStatistics(HIVE_CONTEXT, TABLE, ImmutableList.of(PARTITION)), ImmutableMap.of("value", PARTITION_STATISTICS)); + assertEquals(hiveMetastore.getAllTables("database"), Optional.of(ImmutableList.of("table"))); + assertEquals(hiveMetastore.getAllViews("database"), Optional.empty()); + assertEquals(hiveMetastore.getPartition(HIVE_CONTEXT, "database", "table", ImmutableList.of("value")), Optional.of(PARTITION)); + assertEquals(hiveMetastore.getPartitionNames(HIVE_CONTEXT, "database", "table"), Optional.of(ImmutableList.of("value"))); + assertEquals(hiveMetastore.getPartitionNamesByParts(HIVE_CONTEXT, "database", "table", ImmutableList.of("value")), Optional.of(ImmutableList.of("value"))); + assertEquals(hiveMetastore.getPartitionsByNames(HIVE_CONTEXT, "database", "table", ImmutableList.of("value")), ImmutableMap.of("value", Optional.of(PARTITION))); + assertEquals(hiveMetastore.listTablePrivileges("database", "table", new HivePrincipal(USER, "user")), ImmutableSet.of(PRIVILEGE_INFO)); + assertEquals(hiveMetastore.listRoles(), ImmutableSet.of("role")); + assertEquals(hiveMetastore.listRoleGrants(new HivePrincipal(USER, "user")), ImmutableSet.of(ROLE_GRANT)); + } + + private static class TestingHiveMetastore + extends UnimplementedHiveMetastore + { + @Override + public Optional getDatabase(String databaseName) + { + if (databaseName.equals("database")) { + return Optional.of(DATABASE); + } + + return Optional.empty(); + } + + @Override + public List getAllDatabases() + { + return ImmutableList.of("database"); + } + + @Override + public Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + if (databaseName.equals("database") && tableName.equals("table")) { + return Optional.of(TABLE); + } + + return Optional.empty(); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + if (type.equals(createVarcharType(123))) { + return ImmutableSet.of(MIN_VALUE, MAX_VALUE); + } + + return ImmutableSet.of(); + } + + @Override + public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + if (table.getDatabaseName().equals("database") && table.getTableName().equals("table")) { + return PARTITION_STATISTICS; + } + + return new PartitionStatistics(createEmptyStatistics(), ImmutableMap.of()); + } + + @Override + public Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + boolean partitionMatches = partitions.stream() + .anyMatch(partition -> partition.getValues().get(0).equals("value")); + if (table.getDatabaseName().equals("database") && table.getTableName().equals("table") && partitionMatches) { + return ImmutableMap.of("value", PARTITION_STATISTICS); + } + + return ImmutableMap.of(); + } + + @Override + public Optional> getAllTables(String databaseName) + { + if (databaseName.equals("database")) { + return Optional.of(ImmutableList.of("table")); + } + + return Optional.empty(); + } + + @Override + public Optional> getAllViews(String databaseName) + { + return Optional.empty(); + } + + @Override + public void dropDatabase(HiveIdentity identity, String databaseName) + { + // noop for test purpose + } + + @Override + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + if (databaseName.equals("database") && tableName.equals("table") && partitionValues.equals(ImmutableList.of("value"))) { + return Optional.of(PARTITION); + } + + return Optional.empty(); + } + + @Override + public Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + if (databaseName.equals("database") && tableName.equals("table")) { + return Optional.of(ImmutableList.of("value")); + } + + return Optional.empty(); + } + + @Override + public Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + if (databaseName.equals("database") && tableName.equals("table") && parts.equals(ImmutableList.of("value"))) { + return Optional.of(ImmutableList.of("value")); + } + + return Optional.empty(); + } + + @Override + public Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + if (databaseName.equals("database") && tableName.equals("table") && partitionNames.contains("value")) { + return ImmutableMap.of("value", Optional.of(PARTITION)); + } + + return ImmutableMap.of(); + } + + @Override + public Set listTablePrivileges(String database, String table, HivePrincipal prestoPrincipal) + { + if (database.equals("database") && table.equals("table") && prestoPrincipal.getType() == USER && prestoPrincipal.getName().equals("user")) { + return ImmutableSet.of(PRIVILEGE_INFO); + } + + return ImmutableSet.of(); + } + + @Override + public Set listRoles() + { + return ImmutableSet.of("role"); + } + + @Override + public Set listRoleGrants(HivePrincipal principal) + { + return ImmutableSet.of(ROLE_GRANT); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestSemiTransactionalHiveMetastore.java 
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestSemiTransactionalHiveMetastore.java new file mode 100644 index 00000000..2191b890 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestSemiTransactionalHiveMetastore.java @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.OrcFileWriterConfig; +import io.prestosql.plugin.hive.ParquetFileWriterConfig; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.thrift.MetastoreLocator; +import io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastore; +import io.prestosql.plugin.hive.metastore.thrift.ThriftHiveMetastoreConfig; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreClient; +import io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreStats; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.testing.TestingConnectorSession; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.OptionalLong; +import java.util.function.Function; + +import static io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient.TEST_DATABASE; +import static io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient.TEST_PARTITION_UP1; +import static io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient.TEST_PARTITION_UP2; +import static io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient.TEST_TABLE_UP; +import static io.prestosql.plugin.hive.metastore.thrift.MockThriftMetastoreClient.TEST_TABLE_UP_NAME; +import static io.prestosql.plugin.hive.util.Statistics.merge; +import static io.prestosql.testing.TestingConnectorSession.SESSION; +import static org.testng.Assert.assertEquals; + +@Test(singleThreaded = true) +public class TestSemiTransactionalHiveMetastore +{ + private static final HiveIdentity IDENTITY = new HiveIdentity(SESSION); + private MockThriftMetastoreClient mockClient; + private ThriftHiveMetastore thriftHiveMetastore; + private ThriftMetastoreStats stats; + + protected String database; + protected HdfsEnvironment hdfsEnvironment; + + protected static final PartitionStatistics BASIC_STATISTICS_1 = new PartitionStatistics(new 
HiveBasicStatistics(OptionalLong.of(2), OptionalLong.of(10), OptionalLong.empty(), OptionalLong.empty()), ImmutableMap.of()); + + private static final PartitionStatistics STATISTICS_1 = + new PartitionStatistics( + BASIC_STATISTICS_1.getBasicStatistics(), + ImmutableMap.builder() + .put("t_bigint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(2), OptionalLong.of(5), OptionalLong.of(0), OptionalLong.of(4))) + .build()); + + protected static final PartitionStatistics BASIC_STATISTICS_2 = new PartitionStatistics(new HiveBasicStatistics(OptionalLong.of(2), OptionalLong.of(10), OptionalLong.empty(), OptionalLong.empty()), ImmutableMap.of()); + + private static final PartitionStatistics STATISTICS_2 = + new PartitionStatistics( + BASIC_STATISTICS_2.getBasicStatistics(), + ImmutableMap.builder() + .put("t_bigint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1), OptionalLong.of(4), OptionalLong.of(0), OptionalLong.of(4))) + .build()); + + protected static final PartitionStatistics BASIC_STATISTICS_3 = new PartitionStatistics(new HiveBasicStatistics(OptionalLong.of(4), OptionalLong.of(20), OptionalLong.empty(), OptionalLong.empty()), ImmutableMap.of()); + private static final PartitionStatistics STATISTICS_3 = + new PartitionStatistics( + BASIC_STATISTICS_3.getBasicStatistics(), + ImmutableMap.builder() + .put("t_bigint", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1), OptionalLong.of(5), OptionalLong.of(0), OptionalLong.of(4))) + .build()); + + private static List partitions = ImmutableList.of(MockThriftMetastoreClient.TEST_PARTITION_UP1, MockThriftMetastoreClient.TEST_PARTITION_UP2); + + private static final Map PARTITION_STATISTICS_MAP = new HashMap(){{ + put(TEST_PARTITION_UP1, STATISTICS_1); + put(TEST_PARTITION_UP2, STATISTICS_1); + }}; + + @BeforeMethod + public void setUp() + { + mockClient = new MockThriftMetastoreClient(); + MetastoreLocator metastoreLocator = new MockMetastoreLocator(mockClient); + thriftHiveMetastore = new ThriftHiveMetastore(metastoreLocator, new ThriftHiveMetastoreConfig()); + stats = thriftHiveMetastore.getStats(); + } + + private void updatePartitionsStatistics() + { + Map> partNamesUpdateMap = new HashMap<>(); + List statistics = ImmutableList.of(STATISTICS_1, STATISTICS_1); + for (int index = 0; index < partitions.size(); index++) { + PartitionStatistics stats = statistics.get(index); + partNamesUpdateMap.put(partitions.get(index), actualStatistics -> stats); + } + thriftHiveMetastore.updatePartitionsStatistics(IDENTITY, MockThriftMetastoreClient.TEST_DATABASE, MockThriftMetastoreClient.TEST_TABLE_UP_NAME, partNamesUpdateMap); + } + + private PartitionStatistics skipStats(PartitionStatistics currentStatistics, PartitionStatistics updatedStatistics, boolean isCollectColumnStatisticsOnWrite) + { + if (isCollectColumnStatisticsOnWrite) { + return merge(currentStatistics, updatedStatistics); + } + else { + return updatedStatistics; + } + } + + protected ConnectorSession newSession(Map propertyValues) + { + HiveSessionProperties properties = new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()); + return new TestingConnectorSession(properties.getSessionProperties(), propertyValues); + } + + @Test + public void testIsCollectColumnStatisticsOnWriteTrue() + { + ConnectorSession session = newSession(ImmutableMap.of("collect_column_statistics_on_write", true)); + assertEquals(STATISTICS_3, skipStats(STATISTICS_1, STATISTICS_2, 
HiveSessionProperties.isCollectColumnStatisticsOnWrite(session))); + } + + @Test + public void testIsCollectColumnStatisticsOnWriteFalse() + { + ConnectorSession session = newSession(ImmutableMap.of("collect_column_statistics_on_write", false)); + assertEquals(STATISTICS_2, skipStats(STATISTICS_1, STATISTICS_2, HiveSessionProperties.isCollectColumnStatisticsOnWrite(session))); + } + + @Test + public void testGetPartitionStatistics() + { + updatePartitionsStatistics(); + assertEquals(PARTITION_STATISTICS_MAP, thriftHiveMetastore.getPartitionStatistics(IDENTITY, TEST_TABLE_UP, thriftHiveMetastore.getPartitionsByNames(IDENTITY, TEST_DATABASE, TEST_TABLE_UP_NAME, partitions))); + } + + @Test + public void testUpdatePartitionsStatistics() + { + updatePartitionsStatistics(); + assertEquals(STATISTICS_1, thriftHiveMetastore.getPartitionStatistics(IDENTITY, TEST_TABLE_UP, thriftHiveMetastore.getPartitionsByNames(IDENTITY, TEST_DATABASE, TEST_TABLE_UP_NAME, partitions)).get(TEST_PARTITION_UP1)); + assertEquals(STATISTICS_1, thriftHiveMetastore.getPartitionStatistics(IDENTITY, TEST_TABLE_UP, thriftHiveMetastore.getPartitionsByNames(IDENTITY, TEST_DATABASE, TEST_TABLE_UP_NAME, partitions)).get(TEST_PARTITION_UP2)); + } + + @Test + public void testAlterPartitions() + { + updatePartitionsStatistics(); + assertEquals(mockClient.getAlterPartitionCount(), 1); + } + + private static class MockMetastoreLocator + implements MetastoreLocator + { + private final ThriftMetastoreClient client; + + private MockMetastoreLocator(ThriftMetastoreClient client) + { + this.client = client; + } + + @Override + public ThriftMetastoreClient createMetastoreClient() + { + return client; + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestStorage.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestStorage.java new file mode 100644 index 00000000..c3f9af02 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/TestStorage.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive.metastore;
+
+import io.airlift.json.JsonCodec;
+import org.testng.annotations.Test;
+
+import static io.airlift.json.JsonCodec.jsonCodec;
+import static org.testng.Assert.assertEquals;
+
+public class TestStorage
+{
+    private static final JsonCodec<Storage> CODEC = jsonCodec(Storage.class);
+
+    @Test
+    public void testRoundTrip()
+    {
+        Storage storage = Storage.builder()
+                .setStorageFormat(StorageFormat.create("abc", "in", "out"))
+                .setLocation("/test")
+                .build();
+
+        assertEquals(CODEC.fromJson(CODEC.toJson(storage)), storage);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/UnimplementedHiveMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/UnimplementedHiveMetastore.java
new file mode 100644
index 00000000..43ec6acb
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/UnimplementedHiveMetastore.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.metastore;
+
+import io.prestosql.plugin.hive.HiveType;
+import io.prestosql.plugin.hive.PartitionStatistics;
+import io.prestosql.plugin.hive.authentication.HiveIdentity;
+import io.prestosql.spi.security.RoleGrant;
+import io.prestosql.spi.statistics.ColumnStatisticType;
+import io.prestosql.spi.type.Type;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.function.Function;
+
+class UnimplementedHiveMetastore
+        implements HiveMetastore
+{
+    @Override
+    public Optional<Database> getDatabase(String databaseName)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public List<String> getAllDatabases()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Optional<Table>
getTable(HiveIdentity identity, String databaseName, String tableName) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + throw new UnsupportedOperationException(); + } + + @Override + public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + throw new UnsupportedOperationException(); + } + + @Override + public Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + throw new UnsupportedOperationException(); + } + + @Override + public void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + throw new UnsupportedOperationException(); + } + + @Override + public void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + throw new UnsupportedOperationException(); + } + + @Override + public void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + throw new UnsupportedOperationException(); + } + + @Override + public Optional> getAllTables(String databaseName) + { + throw new UnsupportedOperationException(); + } + + @Override + public Optional> getAllViews(String databaseName) + { + throw new UnsupportedOperationException(); + } + + @Override + public void createDatabase(HiveIdentity identity, Database database) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropDatabase(HiveIdentity identity, String databaseName) + { + throw new UnsupportedOperationException(); + } + + @Override + public void renameDatabase(HiveIdentity identity, String databaseName, String newDatabaseName) + { + throw new UnsupportedOperationException(); + } + + @Override + public void createTable(HiveIdentity identity, Table table, PrincipalPrivileges principalPrivileges) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + throw new UnsupportedOperationException(); + } + + @Override + public void replaceTable(HiveIdentity identity, String databaseName, String tableName, Table newTable, PrincipalPrivileges principalPrivileges) + { + throw new UnsupportedOperationException(); + } + + @Override + public void renameTable(HiveIdentity identity, String databaseName, String tableName, String newDatabaseName, String newTableName) + { + throw new UnsupportedOperationException(); + } + + @Override + public void commentTable(HiveIdentity identity, String databaseName, String tableName, Optional comment) + { + throw new UnsupportedOperationException(); + } + + @Override + public void addColumn(HiveIdentity identity, String databaseName, String tableName, String columnName, HiveType columnType, String columnComment) + { + throw new UnsupportedOperationException(); + } + + @Override + public void renameColumn(HiveIdentity identity, String databaseName, String tableName, String oldColumnName, String newColumnName) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropColumn(HiveIdentity identity, String databaseName, String tableName, String columnName) + { + throw new UnsupportedOperationException(); + } + + @Override + public Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + throw new UnsupportedOperationException(); + } + + @Override + public 
Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + throw new UnsupportedOperationException(); + } + + @Override + public Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + throw new UnsupportedOperationException(); + } + + @Override + public Map> getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + throw new UnsupportedOperationException(); + } + + @Override + public void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitions) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partition) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listTablePrivileges(String databaseName, String tableName, HivePrincipal prestoPrincipal) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listColumnPrivileges(String databaseName, String tableName, String columnName, HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listSchemaPrivileges(String databaseName, String tableName, HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + @Override + public void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + throw new UnsupportedOperationException(); + } + + @Override + public void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + throw new UnsupportedOperationException(); + } + + @Override + public void createRole(String role, String grantor) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropRole(String role) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listRoles() + { + throw new UnsupportedOperationException(); + } + + @Override + public void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { + throw new UnsupportedOperationException(); + } + + @Override + public void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listRoleGrants(HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isImpersonationEnabled() + { + throw new UnsupportedOperationException(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueExpressionUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueExpressionUtil.java new file mode 100644 index 00000000..a9908d04 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueExpressionUtil.java @@ -0,0 +1,85 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.metastore.glue;
+
+import com.google.common.collect.ImmutableList;
+import io.prestosql.plugin.hive.HiveType;
+import io.prestosql.plugin.hive.metastore.Column;
+import io.prestosql.spi.PrestoException;
+import org.testng.annotations.Test;
+
+import java.util.List;
+import java.util.Optional;
+
+import static io.prestosql.plugin.hive.metastore.glue.GlueExpressionUtil.buildGlueExpression;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertNull;
+
+public class TestGlueExpressionUtil
+{
+    private static final List<Column> PARTITION_KEYS = ImmutableList.of(
+            getColumn("name", "string"),
+            getColumn("birthday", "date"),
+            getColumn("age", "int"));
+
+    private static Column getColumn(String name, String type)
+    {
+        return new Column(name, HiveType.valueOf(type), Optional.empty());
+    }
+
+    @Test
+    public void testBuildExpression()
+    {
+        List<String> partitionValues = ImmutableList.of("foo", "2018-01-02", "99");
+        String expression = buildGlueExpression(PARTITION_KEYS, partitionValues);
+        assertEquals(expression, "(name='foo') AND (birthday='2018-01-02') AND (age=99)");
+
+        partitionValues = ImmutableList.of("foo", "2018-01-02", "");
+        expression = buildGlueExpression(PARTITION_KEYS, partitionValues);
+        assertEquals(expression, "(name='foo') AND (birthday='2018-01-02')");
+    }
+
+    @Test
+    public void testBuildExpressionFromPartialSpecification()
+    {
+        List<String> partitionValues = ImmutableList.of("", "2018-01-02", "");
+        String expression = buildGlueExpression(PARTITION_KEYS, partitionValues);
+        assertEquals(expression, "(birthday='2018-01-02')");
+
+        partitionValues = ImmutableList.of("foo", "", "99");
+        expression = buildGlueExpression(PARTITION_KEYS, partitionValues);
+        assertEquals(expression, "(name='foo') AND (age=99)");
+    }
+
+    @Test
+    public void testBuildExpressionNullOrEmptyValues()
+    {
+        assertNull(buildGlueExpression(PARTITION_KEYS, ImmutableList.of()));
+        assertNull(buildGlueExpression(PARTITION_KEYS, null));
+    }
+
+    @Test(expectedExceptions = PrestoException.class)
+    public void testBuildExpressionInvalidPartitionValueListSize()
+    {
+        List<String> partitionValues = ImmutableList.of("foo", "2017-01-02", "99", "extra");
+        buildGlueExpression(PARTITION_KEYS, partitionValues);
+    }
+
+    @Test(expectedExceptions = PrestoException.class)
+    public void testBuildExpressionNullPartitionKeys()
+    {
+        List<String> partitionValues = ImmutableList.of("foo", "2018-01-02", "99");
+        buildGlueExpression(null, partitionValues);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueHiveMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueHiveMetastoreConfig.java
new file mode 100644
index 00000000..2766a7ec
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueHiveMetastoreConfig.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.google.common.collect.ImmutableMap; +import org.testng.annotations.Test; + +import java.util.Map; + +import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; +import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; +import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; + +public class TestGlueHiveMetastoreConfig +{ + @Test + public void testDefaults() + { + assertRecordedDefaults(recordDefaults(GlueHiveMetastoreConfig.class) + .setGlueRegion(null) + .setPinGlueClientToCurrentRegion(false) + .setMaxGlueConnections(5) + .setDefaultWarehouseDir(null) + .setIamRole(null) + .setAwsAccessKey(null) + .setAwsSecretKey(null) + .setCatalogId(null)); + } + + @Test + public void testExplicitPropertyMapping() + { + Map properties = new ImmutableMap.Builder() + .put("hive.metastore.glue.region", "us-east-1") + .put("hive.metastore.glue.pin-client-to-current-region", "true") + .put("hive.metastore.glue.max-connections", "10") + .put("hive.metastore.glue.default-warehouse-dir", "/location") + .put("hive.metastore.glue.iam-role", "role") + .put("hive.metastore.glue.aws-access-key", "ABC") + .put("hive.metastore.glue.aws-secret-key", "DEF") + .put("hive.metastore.glue.catalogid", "0123456789") + .build(); + + GlueHiveMetastoreConfig expected = new GlueHiveMetastoreConfig() + .setGlueRegion("us-east-1") + .setPinGlueClientToCurrentRegion(true) + .setMaxGlueConnections(10) + .setDefaultWarehouseDir("/location") + .setIamRole("role") + .setAwsAccessKey("ABC") + .setAwsSecretKey("DEF") + .setCatalogId("0123456789"); + + assertFullMapping(properties, expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueInputConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueInputConverter.java new file mode 100644 index 00000000..05771e05 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueInputConverter.java @@ -0,0 +1,110 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.amazonaws.services.glue.model.DatabaseInput; +import com.amazonaws.services.glue.model.PartitionInput; +import com.amazonaws.services.glue.model.StorageDescriptor; +import com.amazonaws.services.glue.model.TableInput; +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveBucketProperty; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Database; +import io.prestosql.plugin.hive.metastore.Partition; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.plugin.hive.metastore.glue.converter.GlueInputConverter; +import org.testng.annotations.Test; + +import java.util.List; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNull; + +public class TestGlueInputConverter +{ + private final Database testDb = TestingMetastoreObjects.getPrestoTestDatabase(); + private final Table testTbl = TestingMetastoreObjects.getPrestoTestTable(testDb.getDatabaseName()); + private final Partition testPartition = TestingMetastoreObjects.getPrestoTestPartition(testDb.getDatabaseName(), testTbl.getTableName(), ImmutableList.of("val1")); + + @Test + public void testConvertDatabase() + { + DatabaseInput dbInput = GlueInputConverter.convertDatabase(testDb); + + assertEquals(dbInput.getName(), testDb.getDatabaseName()); + assertEquals(dbInput.getDescription(), testDb.getComment().get()); + assertEquals(dbInput.getLocationUri(), testDb.getLocation().get()); + assertEquals(dbInput.getParameters(), testDb.getParameters()); + } + + @Test + public void testConvertTable() + { + TableInput tblInput = GlueInputConverter.convertTable(testTbl); + + assertEquals(tblInput.getName(), testTbl.getTableName()); + assertEquals(tblInput.getOwner(), testTbl.getOwner()); + assertEquals(tblInput.getTableType(), testTbl.getTableType()); + assertEquals(tblInput.getParameters(), testTbl.getParameters()); + assertColumnList(tblInput.getStorageDescriptor().getColumns(), testTbl.getDataColumns()); + assertColumnList(tblInput.getPartitionKeys(), testTbl.getPartitionColumns()); + assertStorage(tblInput.getStorageDescriptor(), testTbl.getStorage()); + assertEquals(tblInput.getViewExpandedText(), testTbl.getViewExpandedText().get()); + assertEquals(tblInput.getViewOriginalText(), testTbl.getViewOriginalText().get()); + } + + @Test + public void testConvertPartition() + { + PartitionInput partitionInput = GlueInputConverter.convertPartition(testPartition); + + assertEquals(partitionInput.getParameters(), testPartition.getParameters()); + assertStorage(partitionInput.getStorageDescriptor(), testPartition.getStorage()); + assertEquals(partitionInput.getValues(), testPartition.getValues()); + } + + private static void assertColumnList(List actual, List expected) + { + if (expected == null) { + assertNull(actual); + } + assertEquals(actual.size(), expected.size()); + + for (int i = 0; i < expected.size(); i++) { + assertColumn(actual.get(i), expected.get(i)); + } + } + + private static void assertColumn(com.amazonaws.services.glue.model.Column actual, Column expected) + { + assertEquals(actual.getName(), expected.getName()); + assertEquals(actual.getType(), expected.getType().getHiveTypeName().toString()); + assertEquals(actual.getComment(), expected.getComment().get()); + } + + private static void assertStorage(StorageDescriptor actual, Storage expected) + { + assertEquals(actual.getLocation(), 
expected.getLocation()); + assertEquals(actual.getSerdeInfo().getSerializationLibrary(), expected.getStorageFormat().getSerDe()); + assertEquals(actual.getInputFormat(), expected.getStorageFormat().getInputFormat()); + assertEquals(actual.getOutputFormat(), expected.getStorageFormat().getOutputFormat()); + + if (expected.getBucketProperty().isPresent()) { + HiveBucketProperty bucketProperty = expected.getBucketProperty().get(); + assertEquals(actual.getBucketColumns(), bucketProperty.getBucketedBy()); + assertEquals(actual.getNumberOfBuckets().intValue(), bucketProperty.getBucketCount()); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueToPrestoConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueToPrestoConverter.java new file mode 100644 index 00000000..819aa691 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestGlueToPrestoConverter.java @@ -0,0 +1,169 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.amazonaws.services.glue.model.Database; +import com.amazonaws.services.glue.model.Partition; +import com.amazonaws.services.glue.model.StorageDescriptor; +import com.amazonaws.services.glue.model.Table; +import com.google.common.collect.ImmutableList; +import io.prestosql.plugin.hive.HiveBucketProperty; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.glue.converter.GlueToPrestoConverter; +import io.prestosql.spi.security.PrincipalType; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.List; + +import static com.amazonaws.util.CollectionUtils.isNullOrEmpty; +import static io.prestosql.plugin.hive.metastore.glue.TestingMetastoreObjects.getGlueTestColumn; +import static io.prestosql.plugin.hive.metastore.glue.TestingMetastoreObjects.getGlueTestDatabase; +import static io.prestosql.plugin.hive.metastore.glue.TestingMetastoreObjects.getGlueTestPartition; +import static io.prestosql.plugin.hive.metastore.glue.TestingMetastoreObjects.getGlueTestTable; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +@Test(singleThreaded = true) +public class TestGlueToPrestoConverter +{ + private static final String PUBLIC_OWNER = "PUBLIC"; + + private Database testDatabase; + private Table testTable; + private Partition testPartition; + + @BeforeMethod + public void setup() + { + testDatabase = getGlueTestDatabase(); + testTable = getGlueTestTable(testDatabase.getName()); + testPartition = getGlueTestPartition(testDatabase.getName(), testTable.getName(), ImmutableList.of("val1")); + } + + @Test + public void 
testConvertDatabase() + { + io.prestosql.plugin.hive.metastore.Database prestoDatabase = GlueToPrestoConverter.convertDatabase(testDatabase); + assertEquals(prestoDatabase.getDatabaseName(), testDatabase.getName()); + assertEquals(prestoDatabase.getLocation().get(), testDatabase.getLocationUri()); + assertEquals(prestoDatabase.getComment().get(), testDatabase.getDescription()); + assertEquals(prestoDatabase.getParameters(), testDatabase.getParameters()); + assertEquals(prestoDatabase.getOwnerName(), PUBLIC_OWNER); + assertEquals(prestoDatabase.getOwnerType(), PrincipalType.ROLE); + } + + @Test + public void testConvertTable() + { + io.prestosql.plugin.hive.metastore.Table prestoTable = GlueToPrestoConverter.convertTable(testTable, testDatabase.getName()); + assertEquals(prestoTable.getTableName(), testTable.getName()); + assertEquals(prestoTable.getDatabaseName(), testDatabase.getName()); + assertEquals(prestoTable.getTableType(), testTable.getTableType()); + assertEquals(prestoTable.getOwner(), testTable.getOwner()); + assertEquals(prestoTable.getParameters(), testTable.getParameters()); + assertColumnList(prestoTable.getDataColumns(), testTable.getStorageDescriptor().getColumns()); + assertColumnList(prestoTable.getPartitionColumns(), testTable.getPartitionKeys()); + assertStorage(prestoTable.getStorage(), testTable.getStorageDescriptor()); + assertEquals(prestoTable.getViewOriginalText().get(), testTable.getViewOriginalText()); + assertEquals(prestoTable.getViewExpandedText().get(), testTable.getViewExpandedText()); + } + + @Test + public void testConvertTableNullPartitions() + { + testTable.setPartitionKeys(null); + io.prestosql.plugin.hive.metastore.Table prestoTable = GlueToPrestoConverter.convertTable(testTable, testDatabase.getName()); + assertTrue(prestoTable.getPartitionColumns().isEmpty()); + } + + @Test + public void testConvertTableUppercaseColumnType() + { + com.amazonaws.services.glue.model.Column uppercaseColumn = getGlueTestColumn().withType("String"); + testTable.getStorageDescriptor().setColumns(ImmutableList.of(uppercaseColumn)); + GlueToPrestoConverter.convertTable(testTable, testDatabase.getName()); + } + + @Test + public void testConvertPartition() + { + io.prestosql.plugin.hive.metastore.Partition prestoPartition = GlueToPrestoConverter.convertPartition(testPartition); + assertEquals(prestoPartition.getDatabaseName(), testPartition.getDatabaseName()); + assertEquals(prestoPartition.getTableName(), testPartition.getTableName()); + assertColumnList(prestoPartition.getColumns(), testPartition.getStorageDescriptor().getColumns()); + assertEquals(prestoPartition.getValues(), testPartition.getValues()); + assertStorage(prestoPartition.getStorage(), testPartition.getStorageDescriptor()); + assertEquals(prestoPartition.getParameters(), testPartition.getParameters()); + } + + @Test + public void testDatabaseNullParameters() + { + testDatabase.setParameters(null); + assertNotNull(GlueToPrestoConverter.convertDatabase(testDatabase).getParameters()); + } + + @Test + public void testTableNullParameters() + { + testTable.setParameters(null); + testTable.getStorageDescriptor().getSerdeInfo().setParameters(null); + io.prestosql.plugin.hive.metastore.Table prestoTable = GlueToPrestoConverter.convertTable(testTable, testDatabase.getName()); + assertNotNull(prestoTable.getParameters()); + assertNotNull(prestoTable.getStorage().getSerdeParameters()); + } + + @Test + public void testPartitionNullParameters() + { + testPartition.setParameters(null); + 
assertNotNull(GlueToPrestoConverter.convertPartition(testPartition).getParameters()); + } + + private static void assertColumnList(List actual, List expected) + { + if (expected == null) { + assertNull(actual); + } + assertEquals(actual.size(), expected.size()); + + for (int i = 0; i < expected.size(); i++) { + assertColumn(actual.get(i), expected.get(i)); + } + } + + private static void assertColumn(Column actual, com.amazonaws.services.glue.model.Column expected) + { + assertEquals(actual.getName(), expected.getName()); + assertEquals(actual.getType().getHiveTypeName().toString(), expected.getType()); + assertEquals(actual.getComment().get(), expected.getComment()); + } + + private static void assertStorage(Storage actual, StorageDescriptor expected) + { + assertEquals(actual.getLocation(), expected.getLocation()); + assertEquals(actual.getStorageFormat().getSerDe(), expected.getSerdeInfo().getSerializationLibrary()); + assertEquals(actual.getStorageFormat().getInputFormat(), expected.getInputFormat()); + assertEquals(actual.getStorageFormat().getOutputFormat(), expected.getOutputFormat()); + if (!isNullOrEmpty(expected.getBucketColumns())) { + HiveBucketProperty bucketProperty = actual.getBucketProperty().get(); + assertEquals(bucketProperty.getBucketedBy(), expected.getBucketColumns()); + assertEquals(bucketProperty.getBucketCount(), expected.getNumberOfBuckets().intValue()); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestHiveGlueMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestHiveGlueMetastore.java new file mode 100644 index 00000000..371e7b2d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestHiveGlueMetastore.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.AbstractTestHiveLocal; +import io.prestosql.plugin.hive.HdfsConfiguration; +import io.prestosql.plugin.hive.HdfsConfigurationInitializer; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveHdfsConfiguration; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.plugin.hive.metastore.HiveMetastore; + +import java.io.File; + +import static java.util.Locale.ENGLISH; +import static java.util.UUID.randomUUID; + +public class TestHiveGlueMetastore + extends AbstractTestHiveLocal +{ + public TestHiveGlueMetastore() + { + super("test_glue" + randomUUID().toString().toLowerCase(ENGLISH).replace("-", "")); + } + + /** + * GlueHiveMetastore currently uses AWS Default Credential Provider Chain, + * See https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default + * on ways to set your AWS credentials which will be needed to run this test. + */ + @Override + protected HiveMetastore createMetastore(File tempDir) + { + HiveConfig hiveConfig = new HiveConfig(); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hiveConfig), ImmutableSet.of()); + HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hiveConfig, new NoHdfsAuthentication()); + GlueHiveMetastoreConfig glueConfig = new GlueHiveMetastoreConfig(); + glueConfig.setDefaultWarehouseDir(tempDir.toURI().toString()); + + return new GlueHiveMetastore(hdfsEnvironment, glueConfig); + } + + @Override + public void testRenameTable() + { + // rename table is not yet supported by Glue + } + + @Override + public void testPartitionStatisticsSampling() + throws Exception + { + // Glue metastore does not support column level statistics + } + + @Override + public void testUpdateTableColumnStatistics() + { + // column statistics are not supported by Glue + } + + @Override + public void testUpdateTableColumnStatisticsEmptyOptionalFields() + { + // column statistics are not supported by Glue + } + + @Override + public void testUpdatePartitionColumnStatistics() + { + // column statistics are not supported by Glue + } + + @Override + public void testUpdatePartitionColumnStatisticsEmptyOptionalFields() + { + // column statistics are not supported by Glue + } + + @Override + public void testStorePartitionWithStatistics() + throws Exception + { + testStorePartitionWithStatistics(STATISTICS_PARTITIONED_TABLE_COLUMNS, BASIC_STATISTICS_1, BASIC_STATISTICS_2, BASIC_STATISTICS_1, EMPTY_TABLE_STATISTICS); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestingMetastoreObjects.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestingMetastoreObjects.java new file mode 100644 index 00000000..d464d722 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/glue/TestingMetastoreObjects.java @@ -0,0 +1,155 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.metastore.glue; + +import com.amazonaws.services.glue.model.Column; +import com.amazonaws.services.glue.model.Database; +import com.amazonaws.services.glue.model.Partition; +import com.amazonaws.services.glue.model.SerDeInfo; +import com.amazonaws.services.glue.model.StorageDescriptor; +import com.amazonaws.services.glue.model.Table; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.spi.security.PrincipalType; +import org.apache.hadoop.hive.metastore.TableType; + +import java.util.List; +import java.util.Optional; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.Consumer; + +import static java.lang.String.format; + +public final class TestingMetastoreObjects +{ + private TestingMetastoreObjects() {} + + // --------------- Glue Objects --------------- + + public static Database getGlueTestDatabase() + { + return new Database() + .withName("test-db" + generateRandom()) + .withDescription("database desc") + .withLocationUri("/db") + .withParameters(ImmutableMap.of()); + } + + public static Table getGlueTestTable(String dbName) + { + return new Table() + .withDatabaseName(dbName) + .withName("test-tbl" + generateRandom()) + .withOwner("owner") + .withParameters(ImmutableMap.of()) + .withPartitionKeys(ImmutableList.of(getGlueTestColumn())) + .withStorageDescriptor(getGlueTestStorageDescriptor()) + .withTableType(TableType.EXTERNAL_TABLE.name()) + .withViewOriginalText("originalText") + .withViewExpandedText("expandedText"); + } + + public static Column getGlueTestColumn() + { + return new Column() + .withName("test-col" + generateRandom()) + .withType("string") + .withComment("column comment"); + } + + public static StorageDescriptor getGlueTestStorageDescriptor() + { + return new StorageDescriptor() + .withBucketColumns(ImmutableList.of("test-bucket-col")) + .withColumns(ImmutableList.of(getGlueTestColumn())) + .withParameters(ImmutableMap.of()) + .withSerdeInfo(new SerDeInfo() + .withSerializationLibrary("SerdeLib") + .withParameters(ImmutableMap.of())) + .withInputFormat("InputFormat") + .withOutputFormat("OutputFormat") + .withLocation("/test-tbl") + .withNumberOfBuckets(1); + } + + public static Partition getGlueTestPartition(String dbName, String tblName, List values) + { + return new Partition() + .withDatabaseName(dbName) + .withTableName(tblName) + .withValues(values) + .withParameters(ImmutableMap.of()) + .withStorageDescriptor(getGlueTestStorageDescriptor()); + } + + // --------------- Presto Objects --------------- + + public static io.prestosql.plugin.hive.metastore.Database getPrestoTestDatabase() + { + return io.prestosql.plugin.hive.metastore.Database.builder() + .setDatabaseName("test-db" + generateRandom()) + .setComment(Optional.of("database desc")) + .setLocation(Optional.of("/db")) + .setParameters(ImmutableMap.of()) + .setOwnerName("PUBLIC") + 
.setOwnerType(PrincipalType.ROLE).build(); + } + + public static io.prestosql.plugin.hive.metastore.Table getPrestoTestTable(String dbName) + { + return io.prestosql.plugin.hive.metastore.Table.builder() + .setDatabaseName(dbName) + .setTableName("test-tbl" + generateRandom()) + .setOwner("owner") + .setParameters(ImmutableMap.of()) + .setTableType(TableType.EXTERNAL_TABLE.name()) + .setDataColumns(ImmutableList.of(getPrestoTestColumn())) + .setPartitionColumns(ImmutableList.of(getPrestoTestColumn())) + .setViewOriginalText(Optional.of("originalText")) + .setViewExpandedText(Optional.of("expandedText")) + .withStorage(STORAGE_CONSUMER).build(); + } + + public static io.prestosql.plugin.hive.metastore.Partition getPrestoTestPartition(String dbName, String tblName, List values) + { + return io.prestosql.plugin.hive.metastore.Partition.builder() + .setDatabaseName(dbName) + .setTableName(tblName) + .setValues(values) + .setColumns(ImmutableList.of(getPrestoTestColumn())) + .setParameters(ImmutableMap.of()) + .withStorage(STORAGE_CONSUMER).build(); + } + + public static io.prestosql.plugin.hive.metastore.Column getPrestoTestColumn() + { + return new io.prestosql.plugin.hive.metastore.Column("test-col" + generateRandom(), HiveType.HIVE_STRING, Optional.of("column comment")); + } + + private static final Consumer STORAGE_CONSUMER = storage -> + { + storage.setStorageFormat(StorageFormat.create("SerdeLib", "InputFormat", "OutputFormat")) + .setLocation("/test-tbl") + .setBucketProperty(Optional.empty()) + .setSerdeParameters(ImmutableMap.of()); + }; + + private static String generateRandom() + { + return format("%04x", ThreadLocalRandom.current().nextInt()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/InMemoryThriftMetastore.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/InMemoryThriftMetastore.java new file mode 100644 index 00000000..ddf1eb6f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/InMemoryThriftMetastore.java @@ -0,0 +1,705 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.authentication.HiveIdentity; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.HivePrivilegeInfo; +import io.prestosql.plugin.hive.metastore.PartitionWithStatistics; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaAlreadyExistsException; +import io.prestosql.spi.connector.SchemaNotFoundException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.connector.TableAlreadyExistsException; +import io.prestosql.spi.connector.TableNotFoundException; +import io.prestosql.spi.security.RoleGrant; +import io.prestosql.spi.statistics.ColumnStatisticType; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.PrincipalPrivilegeSet; +import org.apache.hadoop.hive.metastore.api.PrincipalType; +import org.apache.hadoop.hive.metastore.api.Table; + +import javax.annotation.concurrent.GuardedBy; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.URI; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.prestosql.plugin.hive.HiveBasicStatistics.createEmptyStatistics; +import static io.prestosql.plugin.hive.HiveUtil.toPartitionValues; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.toMetastoreApiPartition; +import static io.prestosql.spi.StandardErrorCode.SCHEMA_NOT_EMPTY; +import static java.util.Locale.US; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.hive.common.FileUtils.makePartName; +import static org.apache.hadoop.hive.metastore.TableType.EXTERNAL_TABLE; +import static org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE; +import static org.apache.hadoop.hive.metastore.TableType.VIRTUAL_VIEW; + +public class InMemoryThriftMetastore + implements ThriftMetastore +{ + @GuardedBy("this") + private final Map databases = new HashMap<>(); + @GuardedBy("this") + private final Map relations = new HashMap<>(); + @GuardedBy("this") + private final Map views = new HashMap<>(); + @GuardedBy("this") + private final Map partitions = new HashMap<>(); + @GuardedBy("this") + private final Map columnStatistics = new HashMap<>(); + @GuardedBy("this") + private final Map partitionColumnStatistics = new HashMap<>(); + @GuardedBy("this") + private final Map> tablePrivileges = new HashMap<>(); + 
+ private final File baseDirectory; + + public InMemoryThriftMetastore(File baseDirectory) + { + this.baseDirectory = requireNonNull(baseDirectory, "baseDirectory is null"); + checkArgument(!baseDirectory.exists(), "Base directory already exists"); + checkArgument(baseDirectory.mkdirs(), "Could not create base directory"); + } + + @Override + public synchronized void createDatabase(HiveIdentity identity, Database database) + { + requireNonNull(database, "database is null"); + + File directory; + if (database.getLocationUri() != null) { + directory = new File(URI.create(database.getLocationUri())); + } + else { + // use Hive default naming convention + directory = new File(baseDirectory, database.getName() + ".db"); + database = database.deepCopy(); + database.setLocationUri(directory.toURI().toString()); + } + + checkArgument(!directory.exists(), "Database directory already exists"); + checkArgument(isParentDir(directory, baseDirectory), "Database directory must be inside of the metastore base directory"); + checkArgument(directory.mkdirs(), "Could not create database directory"); + + if (databases.putIfAbsent(database.getName(), database) != null) { + throw new SchemaAlreadyExistsException(database.getName()); + } + } + + @Override + public synchronized void dropDatabase(HiveIdentity identity, String databaseName) + { + if (!databases.containsKey(databaseName)) { + throw new SchemaNotFoundException(databaseName); + } + if (!getAllTables(databaseName).orElse(ImmutableList.of()).isEmpty()) { + throw new PrestoException(SCHEMA_NOT_EMPTY, "Schema not empty: " + databaseName); + } + databases.remove(databaseName); + } + + @Override + public synchronized void alterDatabase(HiveIdentity identity, String databaseName, Database newDatabase) + { + String newDatabaseName = newDatabase.getName(); + + if (databaseName.equals(newDatabaseName)) { + if (databases.replace(databaseName, newDatabase) == null) { + throw new SchemaNotFoundException(databaseName); + } + return; + } + + Database database = databases.get(databaseName); + if (database == null) { + throw new SchemaNotFoundException(databaseName); + } + if (databases.putIfAbsent(newDatabaseName, database) != null) { + throw new SchemaAlreadyExistsException(newDatabaseName); + } + databases.remove(databaseName); + + rewriteKeys(relations, name -> new SchemaTableName(newDatabaseName, name.getTableName())); + rewriteKeys(views, name -> new SchemaTableName(newDatabaseName, name.getTableName())); + rewriteKeys(partitions, name -> name.withSchemaName(newDatabaseName)); + rewriteKeys(tablePrivileges, name -> name.withDatabase(newDatabaseName)); + } + + @Override + public synchronized List getAllDatabases() + { + return ImmutableList.copyOf(databases.keySet()); + } + + @Override + public synchronized void createTable(HiveIdentity identity, Table table) + { + TableType tableType = TableType.valueOf(table.getTableType()); + checkArgument(EnumSet.of(MANAGED_TABLE, EXTERNAL_TABLE, VIRTUAL_VIEW).contains(tableType), "Invalid table type: %s", tableType); + + if (tableType == VIRTUAL_VIEW) { + checkArgument(table.getSd().getLocation() == null, "Storage location for view must be null"); + } + else { + File directory = new File(new Path(table.getSd().getLocation()).toUri()); + checkArgument(directory.exists(), "Table directory does not exist"); + if (tableType == MANAGED_TABLE) { + checkArgument(isParentDir(directory, baseDirectory), "Table directory must be inside of the metastore base directory"); + } + } + + SchemaTableName schemaTableName = new 
SchemaTableName(table.getDbName(), table.getTableName()); + Table tableCopy = table.deepCopy(); + + if (relations.putIfAbsent(schemaTableName, tableCopy) != null) { + throw new TableAlreadyExistsException(schemaTableName); + } + + if (tableType == VIRTUAL_VIEW) { + views.put(schemaTableName, tableCopy); + } + + PrincipalPrivilegeSet privileges = table.getPrivileges(); + if (privileges != null) { + throw new UnsupportedOperationException(); + } + } + + @Override + public synchronized void dropTable(HiveIdentity identity, String databaseName, String tableName, boolean deleteData) + { + List locations = listAllDataPaths(identity, this, databaseName, tableName); + + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + Table table = relations.remove(schemaTableName); + if (table == null) { + throw new TableNotFoundException(schemaTableName); + } + views.remove(schemaTableName); + partitions.keySet().removeIf(partitionName -> partitionName.matches(databaseName, tableName)); + + // remove data + if (deleteData && table.getTableType().equals(MANAGED_TABLE.name())) { + for (String location : locations) { + if (location != null) { + File directory = new File(new Path(location).toUri()); + checkArgument(isParentDir(directory, baseDirectory), "Table directory must be inside of the metastore base directory"); + deleteDirectory(directory); + } + } + } + } + + private static List listAllDataPaths(HiveIdentity identity, ThriftMetastore metastore, String schemaName, String tableName) + { + ImmutableList.Builder locations = ImmutableList.builder(); + Table table = metastore.getTable(identity, schemaName, tableName).get(); + if (table.getSd().getLocation() != null) { + // For unpartitioned table, there should be nothing directly under this directory. + // But including this location in the set makes the directory content assert more + // extensive, which is desirable. 
+ locations.add(table.getSd().getLocation()); + } + + Optional> partitionNames = metastore.getPartitionNames(identity, schemaName, tableName); + if (partitionNames.isPresent()) { + metastore.getPartitionsByNames(identity, schemaName, tableName, partitionNames.get()).stream() + .map(partition -> partition.getSd().getLocation()) + .filter(location -> !location.startsWith(table.getSd().getLocation())) + .forEach(locations::add); + } + + return locations.build(); + } + + @Override + public synchronized void alterTable(HiveIdentity identity, String databaseName, String tableName, Table newTable) + { + SchemaTableName oldName = new SchemaTableName(databaseName, tableName); + SchemaTableName newName = new SchemaTableName(newTable.getDbName(), newTable.getTableName()); + + // if the name did not change, this is a simple schema change + if (oldName.equals(newName)) { + if (relations.replace(oldName, newTable) == null) { + throw new TableNotFoundException(oldName); + } + return; + } + + // remove old table definition and add the new one + Table table = relations.get(oldName); + if (table == null) { + throw new TableNotFoundException(oldName); + } + + if (relations.putIfAbsent(newName, newTable) != null) { + throw new TableAlreadyExistsException(newName); + } + relations.remove(oldName); + } + + @Override + public synchronized Optional> getAllTables(String databaseName) + { + ImmutableList.Builder tables = ImmutableList.builder(); + for (SchemaTableName schemaTableName : this.relations.keySet()) { + if (schemaTableName.getSchemaName().equals(databaseName)) { + tables.add(schemaTableName.getTableName()); + } + } + return Optional.of(tables.build()); + } + + @Override + public synchronized Optional> getAllViews(String databaseName) + { + ImmutableList.Builder tables = ImmutableList.builder(); + for (SchemaTableName schemaTableName : this.views.keySet()) { + if (schemaTableName.getSchemaName().equals(databaseName)) { + tables.add(schemaTableName.getTableName()); + } + } + return Optional.of(tables.build()); + } + + @Override + public synchronized Optional getDatabase(String databaseName) + { + return Optional.ofNullable(databases.get(databaseName)); + } + + @Override + public synchronized void addPartitions(HiveIdentity identity, String databaseName, String tableName, List partitionsWithStatistics) + { + for (PartitionWithStatistics partitionWithStatistics : partitionsWithStatistics) { + Partition partition = toMetastoreApiPartition(partitionWithStatistics.getPartition()); + if (partition.getParameters() == null) { + partition.setParameters(ImmutableMap.of()); + } + PartitionName partitionKey = PartitionName.partition(databaseName, tableName, partitionWithStatistics.getPartitionName()); + partitions.put(partitionKey, partition); + partitionColumnStatistics.put(partitionKey, partitionWithStatistics.getStatistics()); + } + } + + @Override + public synchronized void dropPartition(HiveIdentity identity, String databaseName, String tableName, List parts, boolean deleteData) + { + partitions.entrySet().removeIf(entry -> + entry.getKey().matches(databaseName, tableName) && entry.getValue().getValues().equals(parts)); + } + + @Override + public synchronized void alterPartition(HiveIdentity identity, String databaseName, String tableName, PartitionWithStatistics partitionWithStatistics) + { + Partition partition = toMetastoreApiPartition(partitionWithStatistics.getPartition()); + if (partition.getParameters() == null) { + partition.setParameters(ImmutableMap.of()); + } + PartitionName partitionKey = 
PartitionName.partition(databaseName, tableName, partitionWithStatistics.getPartitionName()); + partitions.put(partitionKey, partition); + partitionColumnStatistics.put(partitionKey, partitionWithStatistics.getStatistics()); + } + + @Override + public synchronized Optional> getPartitionNames(HiveIdentity identity, String databaseName, String tableName) + { + return Optional.of(ImmutableList.copyOf(partitions.entrySet().stream() + .filter(entry -> entry.getKey().matches(databaseName, tableName)) + .map(entry -> entry.getKey().getPartitionName()) + .collect(toList()))); + } + + @Override + public synchronized Optional getPartition(HiveIdentity identity, String databaseName, String tableName, List partitionValues) + { + PartitionName name = PartitionName.partition(databaseName, tableName, partitionValues); + Partition partition = partitions.get(name); + if (partition == null) { + return Optional.empty(); + } + return Optional.of(partition.deepCopy()); + } + + @Override + public synchronized Optional> getPartitionNamesByParts(HiveIdentity identity, String databaseName, String tableName, List parts) + { + return Optional.of(partitions.entrySet().stream() + .filter(entry -> partitionMatches(entry.getValue(), databaseName, tableName, parts)) + .map(entry -> entry.getKey().getPartitionName()) + .collect(toList())); + } + + private static boolean partitionMatches(Partition partition, String databaseName, String tableName, List parts) + { + if (!partition.getDbName().equals(databaseName) || + !partition.getTableName().equals(tableName)) { + return false; + } + List values = partition.getValues(); + if (values.size() != parts.size()) { + return false; + } + for (int i = 0; i < values.size(); i++) { + String part = parts.get(i); + if (!part.isEmpty() && !values.get(i).equals(part)) { + return false; + } + } + return true; + } + + @Override + public synchronized List getPartitionsByNames(HiveIdentity identity, String databaseName, String tableName, List partitionNames) + { + ImmutableList.Builder builder = ImmutableList.builder(); + for (String name : partitionNames) { + PartitionName partitionName = PartitionName.partition(databaseName, tableName, name); + Partition partition = partitions.get(partitionName); + if (partition == null) { + return ImmutableList.of(); + } + builder.add(partition.deepCopy()); + } + return builder.build(); + } + + @Override + public synchronized Optional
getTable(HiveIdentity identity, String databaseName, String tableName) + { + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + return Optional.ofNullable(relations.get(schemaTableName)); + } + + @Override + public Set getSupportedColumnStatistics(Type type) + { + return ThriftMetastoreUtil.getSupportedColumnStatistics(type); + } + + @Override + public synchronized PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) + { + return getTableStatistics(identity, table.getDbName(), table.getTableName()); + } + + private PartitionStatistics getTableStatistics(HiveIdentity identity, String databaseName, String tableName) + { + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + PartitionStatistics statistics = columnStatistics.get(schemaTableName); + if (statistics == null) { + statistics = new PartitionStatistics(createEmptyStatistics(), ImmutableMap.of()); + } + return statistics; + } + + @Override + public synchronized Map getPartitionStatistics(HiveIdentity identity, Table table, List partitions) + { + List partitionColumns = table.getPartitionKeys().stream() + .map(FieldSchema::getName) + .collect(toImmutableList()); + Set partitionNames = partitions.stream() + .map(partition -> makePartName(partitionColumns, partition.getValues())) + .collect(toImmutableSet()); + return getPartitionStatistics(identity, table.getDbName(), table.getTableName(), partitionNames); + } + + private ImmutableMap getPartitionStatistics(HiveIdentity identity, String databaseName, String tableName, Set partitionNames) + { + ImmutableMap.Builder result = ImmutableMap.builder(); + for (String partitionName : partitionNames) { + PartitionName partitionKey = PartitionName.partition(databaseName, tableName, partitionName); + PartitionStatistics statistics = partitionColumnStatistics.get(partitionKey); + if (statistics == null) { + statistics = new PartitionStatistics(createEmptyStatistics(), ImmutableMap.of()); + } + result.put(partitionName, statistics); + } + return result.build(); + } + + @Override + public synchronized void updateTableStatistics(HiveIdentity identity, String databaseName, String tableName, Function update) + { + columnStatistics.put(new SchemaTableName(databaseName, tableName), update.apply(getTableStatistics(identity, databaseName, tableName))); + } + + @Override + public synchronized void updatePartitionStatistics(HiveIdentity identity, String databaseName, String tableName, String partitionName, Function update) + { + PartitionName partitionKey = PartitionName.partition(databaseName, tableName, partitionName); + partitionColumnStatistics.put(partitionKey, update.apply(getPartitionStatistics(identity, databaseName, tableName, ImmutableSet.of(partitionName)).get(partitionName))); + } + + @Override + public void updatePartitionsStatistics(HiveIdentity identity, String databaseName, String tableName, Map> partNamesUpdateFunctionMap) + { + partNamesUpdateFunctionMap.entrySet().stream().forEach(e -> { + updatePartitionStatistics(identity, databaseName, tableName, e.getKey(), e.getValue()); + }); + } + + @Override + public void createRole(String role, String grantor) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropRole(String role) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listRoles() + { + throw new UnsupportedOperationException(); + } + + @Override + public void grantRoles(Set roles, Set grantees, boolean withAdminOption, HivePrincipal grantor) + { 
+ throw new UnsupportedOperationException(); + } + + @Override + public void revokeRoles(Set roles, Set grantees, boolean adminOptionFor, HivePrincipal grantor) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listRoleGrants(HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listTablePrivileges(String databaseName, String tableName, HivePrincipal principal) + { + return ImmutableSet.of(); + } + + @Override + public Set listColumnPrivileges(String databaseName, String tableName, String columnName, HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + @Override + public Set listSchemaPrivileges(String databaseName, String tableName, HivePrincipal principal) + { + throw new UnsupportedOperationException(); + } + + @Override + public void grantTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + throw new UnsupportedOperationException(); + } + + @Override + public void revokeTablePrivileges(String databaseName, String tableName, HivePrincipal grantee, Set privileges) + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isImpersonationEnabled() + { + return false; + } + + private static boolean isParentDir(File directory, File baseDirectory) + { + for (File parent = directory.getParentFile(); parent != null; parent = parent.getParentFile()) { + if (parent.equals(baseDirectory)) { + return true; + } + } + return false; + } + + private static void deleteDirectory(File dir) + { + try { + deleteRecursively(dir.toPath(), ALLOW_INSECURE); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static class PartitionName + { + private final String schemaName; + private final String tableName; + private final List partitionValues; + private final String partitionName; // does not participate in equals and hashValue + + private PartitionName(String schemaName, String tableName, List partitionValues, String partitionName) + { + this.schemaName = requireNonNull(schemaName, "schemaName is null").toLowerCase(US); + this.tableName = requireNonNull(tableName, "tableName is null").toLowerCase(US); + this.partitionValues = requireNonNull(partitionValues, "partitionValues is null"); + this.partitionName = partitionName; + } + + public static PartitionName partition(String schemaName, String tableName, String partitionName) + { + return new PartitionName(schemaName.toLowerCase(US), tableName.toLowerCase(US), toPartitionValues(partitionName), partitionName); + } + + public static PartitionName partition(String schemaName, String tableName, List partitionValues) + { + return new PartitionName(schemaName.toLowerCase(US), tableName.toLowerCase(US), partitionValues, null); + } + + public String getPartitionName() + { + return requireNonNull(partitionName, "partitionName is null"); + } + + public boolean matches(String schemaName, String tableName) + { + return this.schemaName.equals(schemaName) && + this.tableName.equals(tableName); + } + + public PartitionName withSchemaName(String schemaName) + { + return new PartitionName(schemaName, tableName, partitionValues, partitionName); + } + + @Override + public int hashCode() + { + return Objects.hash(schemaName, tableName, partitionValues); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + PartitionName other = (PartitionName) obj; + 
return Objects.equals(this.schemaName, other.schemaName) + && Objects.equals(this.tableName, other.tableName) + && Objects.equals(this.partitionValues, other.partitionValues); + } + + @Override + public String toString() + { + return schemaName + "/" + tableName + "/" + partitionName; + } + } + + private static class PrincipalTableKey + { + private final String principalName; + private final PrincipalType principalType; + private final String database; + private final String table; + + public PrincipalTableKey(String principalName, PrincipalType principalType, String table, String database) + { + this.principalName = requireNonNull(principalName, "principalName is null"); + this.principalType = requireNonNull(principalType, "principalType is null"); + this.table = requireNonNull(table, "table is null"); + this.database = requireNonNull(database, "database is null"); + } + + public PrincipalTableKey withDatabase(String database) + { + return new PrincipalTableKey(principalName, principalType, table, database); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + PrincipalTableKey that = (PrincipalTableKey) o; + return Objects.equals(principalName, that.principalName) && + Objects.equals(principalType, that.principalType) && + Objects.equals(table, that.table) && + Objects.equals(database, that.database); + } + + @Override + public int hashCode() + { + return Objects.hash(principalName, principalType, table, database); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("principalName", principalName) + .add("principalType", principalType) + .add("table", table) + .add("database", database) + .toString(); + } + } + + private static void rewriteKeys(Map map, Function keyRewriter) + { + for (K key : ImmutableSet.copyOf(map.keySet())) { + K newKey = keyRewriter.apply(key); + if (!newKey.equals(key)) { + map.put(newKey, map.remove(key)); + } + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClient.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClient.java new file mode 100644 index 00000000..fbf8ad78 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClient.java @@ -0,0 +1,574 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.HiveObjectPrivilege; +import org.apache.hadoop.hive.metastore.api.HiveObjectRef; +import org.apache.hadoop.hive.metastore.api.LockRequest; +import org.apache.hadoop.hive.metastore.api.LockResponse; +import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.PrincipalType; +import org.apache.hadoop.hive.metastore.api.PrivilegeBag; +import org.apache.hadoop.hive.metastore.api.Role; +import org.apache.hadoop.hive.metastore.api.RolePrincipalGrant; +import org.apache.hadoop.hive.metastore.api.SerDeInfo; +import org.apache.hadoop.hive.metastore.api.ShowLocksRequest; +import org.apache.hadoop.hive.metastore.api.ShowLocksResponse; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.thrift.TException; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.apache.hadoop.hive.metastore.api.PrincipalType.ROLE; +import static org.apache.hadoop.hive.metastore.api.PrincipalType.USER; + +public class MockThriftMetastoreClient + implements ThriftMetastoreClient +{ + public static final String TEST_DATABASE = "testdb"; + public static final String BAD_DATABASE = "baddb"; + public static final String TEST_TABLE = "testtbl"; + public static final String TEST_TABLE_UP_NAME = "testtblup"; + public static final String TEST_PARTITION1 = "key=testpartition1"; + public static final String TEST_PARTITION2 = "key=testpartition2"; + public static final String TEST_PARTITION_UP1 = "key=testpartitionup1"; + public static final String TEST_PARTITION_UP2 = "key=testpartitionup2"; + public static final List TEST_PARTITION_VALUES1 = ImmutableList.of("testpartition1"); + public static final List TEST_PARTITION_VALUES2 = ImmutableList.of("testpartition2"); + public static final List TEST_PARTITION_UP_VALUES1 = ImmutableList.of("testpartitionup1"); + public static final List TEST_PARTITION_UP_VALUES2 = ImmutableList.of("testpartitionup2"); + public static final List TEST_ROLES = ImmutableList.of("testrole"); + public static final List TEST_ROLE_GRANTS = ImmutableList.of( + new RolePrincipalGrant("role1", "user", USER, false, 0, "grantor1", USER), + new RolePrincipalGrant("role2", "role1", ROLE, true, 0, "grantor2", ROLE)); + + private static final StorageDescriptor DEFAULT_STORAGE_DESCRIPTOR = + new StorageDescriptor(ImmutableList.of(), "", null, null, false, 0, new SerDeInfo(TEST_TABLE, null, ImmutableMap.of()), null, null, ImmutableMap.of()); + private static final StorageDescriptor UPDATE_STORAGE_DESCRIPTOR = + new StorageDescriptor(ImmutableList.of(new FieldSchema("t_bigint", "int", null)), 
"", null, null, false, 0, new SerDeInfo(TEST_TABLE_UP_NAME, null, ImmutableMap.of()), null, null, ImmutableMap.of()); + + private static final Map parameters = new HashMap(){{ + put("numFiles", "4"); + put("numRows", "10"); + }}; + public static final Table TEST_TABLE_UP = new Table(TEST_TABLE_UP_NAME, + TEST_DATABASE, + "user", + 0, + 0, + 0, + UPDATE_STORAGE_DESCRIPTOR, + ImmutableList.of( + new FieldSchema("key", "string", null)), + parameters, + "view original text", + "view extended text", + "MANAGED_TABLE"); + private static Map> partitionColumnStatistics = new HashMap>(){{ + put(TEST_PARTITION_UP1, ImmutableList.of(new ColumnStatisticsObj("t_bigint", "int", new ColumnStatisticsData(ColumnStatisticsData._Fields.LONG_STATS, new LongColumnStatsData(0, 4))))); + put(TEST_PARTITION_UP2, ImmutableList.of(new ColumnStatisticsObj("t_bigint", "int", new ColumnStatisticsData(ColumnStatisticsData._Fields.LONG_STATS, new LongColumnStatsData(0, 4))))); + }}; + private static Partition testPartitionUp1 = new Partition(TEST_PARTITION_UP_VALUES1, TEST_DATABASE, TEST_TABLE_UP_NAME, 0, 0, UPDATE_STORAGE_DESCRIPTOR, parameters); + private static Partition testPartitionUp2 = new Partition(TEST_PARTITION_UP_VALUES2, TEST_DATABASE, TEST_TABLE_UP_NAME, 0, 0, UPDATE_STORAGE_DESCRIPTOR, parameters); + + private final AtomicInteger accessCount = new AtomicInteger(); + private final AtomicInteger alterPartitionsCount = new AtomicInteger(); + private boolean throwException; + + private String hostAddress; + + public void setThrowException(boolean throwException) + { + this.throwException = throwException; + } + + public int getAccessCount() + { + return accessCount.get(); + } + + public int getAlterPartitionCount() + { + return alterPartitionsCount.get(); + } + + public void setHostAddress(String hostAddress) + { + this.hostAddress = hostAddress; + } + + public String getHostAddress() + { + return hostAddress; + } + + @Override + public List getAllDatabases() + { + accessCount.incrementAndGet(); + if (throwException) { + throw new IllegalStateException(); + } + return ImmutableList.of(TEST_DATABASE); + } + + @Override + public List getAllTables(String dbName) + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!dbName.equals(TEST_DATABASE)) { + return ImmutableList.of(); // As specified by Hive specification + } + return ImmutableList.of(TEST_TABLE); + } + + @Override + public Database getDatabase(String name) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!name.equals(TEST_DATABASE)) { + throw new NoSuchObjectException(); + } + return new Database(TEST_DATABASE, null, null, null); + } + + @Override + public Table getTable(String dbName, String tableName) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!dbName.equals(TEST_DATABASE) || !ImmutableList.of(TEST_TABLE, TEST_TABLE_UP_NAME).contains(tableName)) { + throw new NoSuchObjectException(); + } + if (tableName.equals(TEST_TABLE_UP_NAME)) { + return TEST_TABLE_UP; + } + else { + return new Table( + TEST_TABLE, + TEST_DATABASE, + "", + 0, + 0, + 0, + DEFAULT_STORAGE_DESCRIPTOR, + ImmutableList.of(new FieldSchema("key", "string", null)), + null, + "", + "", + TableType.MANAGED_TABLE.name()); + } + } + + @Override + public Table getTableWithCapabilities(String databaseName, String tableName) + { + throw new UnsupportedOperationException(); + } + + @Override + 
public List getFields(String databaseName, String tableName) + throws TException + { + return ImmutableList.of(new FieldSchema("key", "string", null)); + } + + @Override + public List getTableColumnStatistics(String databaseName, String tableName, List columnNames) + { + throw new UnsupportedOperationException(); + } + + @Override + public void setTableColumnStatistics(String databaseName, String tableName, List statistics) + { + throw new UnsupportedOperationException(); + } + + @Override + public void deleteTableColumnStatistics(String databaseName, String tableName, String columnName) + { + throw new UnsupportedOperationException(); + } + + @Override + public Map> getPartitionColumnStatistics(String databaseName, String tableName, List partitionNames, List columnNames) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!databaseName.equals(TEST_DATABASE) || !tableName.equals(TEST_TABLE_UP_NAME) || !ImmutableList.of(TEST_PARTITION1, TEST_PARTITION2, TEST_PARTITION_UP1, TEST_PARTITION_UP2).containsAll(partitionNames)) { + throw new NoSuchObjectException(); + } + return partitionColumnStatistics; + } + + @Override + public void setPartitionColumnStatistics(String databaseName, String tableName, String partitionName, List statistics) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!databaseName.equals(TEST_DATABASE) || !ImmutableList.of(TEST_TABLE, TEST_TABLE_UP_NAME).contains(tableName) || !ImmutableList.of(TEST_PARTITION_UP1, TEST_PARTITION_UP2).contains(partitionName)) { + throw new NoSuchObjectException(); + } + if (partitionColumnStatistics.containsKey(partitionName)) { + partitionColumnStatistics.replace(partitionName, statistics); + } + else { + partitionColumnStatistics.put(partitionName, statistics); + } + } + + @Override + public void deletePartitionColumnStatistics(String databaseName, String tableName, String partitionName, String columnName) + { + throw new UnsupportedOperationException(); + } + + @Override + public List getTableNamesByFilter(String databaseName, String filter) + { + throw new UnsupportedOperationException(); + } + + @Override + public List getPartitionNames(String dbName, String tableName) + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!dbName.equals(TEST_DATABASE) || !tableName.equals(TEST_TABLE)) { + return ImmutableList.of(); + } + return ImmutableList.of(TEST_PARTITION1, TEST_PARTITION2); + } + + @Override + public List getPartitionNamesFiltered(String dbName, String tableName, List partValues) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!dbName.equals(TEST_DATABASE) || !tableName.equals(TEST_TABLE)) { + throw new NoSuchObjectException(); + } + return ImmutableList.of(TEST_PARTITION1, TEST_PARTITION2); + } + + @Override + public Partition getPartition(String dbName, String tableName, List partitionValues) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!dbName.equals(TEST_DATABASE) || !tableName.equals(TEST_TABLE) || !ImmutableSet.of(TEST_PARTITION_VALUES1, TEST_PARTITION_VALUES2).contains(partitionValues)) { + throw new NoSuchObjectException(); + } + return new Partition(null, TEST_DATABASE, TEST_TABLE, 0, 0, DEFAULT_STORAGE_DESCRIPTOR, ImmutableMap.of()); + } + + @Override + public List 
getPartitionsByNames(String dbName, String tableName, List names) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new RuntimeException(); + } + if (!dbName.equals(TEST_DATABASE) || !ImmutableList.of(TEST_TABLE, TEST_TABLE_UP_NAME).contains(tableName) || !ImmutableSet.of(TEST_PARTITION1, TEST_PARTITION2, TEST_PARTITION_UP1, TEST_PARTITION_UP2).containsAll(names)) { + throw new NoSuchObjectException(); + } + if (names.containsAll(ImmutableList.of(TEST_PARTITION_UP1, TEST_PARTITION_UP2))) { + return ImmutableList.of(testPartitionUp1, testPartitionUp2); + } + else { + return Lists.transform(names, name -> { + try { + return new Partition(ImmutableList.copyOf(Warehouse.getPartValuesFromPartName(name)), TEST_DATABASE, TEST_TABLE, 0, 0, DEFAULT_STORAGE_DESCRIPTOR, ImmutableMap.of()); + } + catch (MetaException e) { + throw new RuntimeException(e); + } + }); + } + } + + @Override + public void createDatabase(Database database) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropDatabase(String databaseName, boolean deleteData, boolean cascade) + { + throw new UnsupportedOperationException(); + } + + @Override + public void alterDatabase(String databaseName, Database database) + { + throw new UnsupportedOperationException(); + } + + @Override + public void createTable(Table table) + { + throw new UnsupportedOperationException(); + } + + @Override + public void dropTable(String databaseName, String name, boolean deleteData) + { + throw new UnsupportedOperationException(); + } + + @Override + public void alterTable(String databaseName, String tableName, Table newTable) + { + throw new UnsupportedOperationException(); + } + + @Override + public int addPartitions(List newPartitions) + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean dropPartition(String databaseName, String tableName, List partitionValues, boolean deleteData) + { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartition(String databaseName, String tableName, Partition partition) + { + throw new UnsupportedOperationException(); + } + + @Override + public void alterPartitions(String databaseName, String tableName, List partitions) + throws TException + { + accessCount.incrementAndGet(); + alterPartitionsCount.incrementAndGet(); + if (throwException) { + throw new IllegalStateException(); + } + if (!databaseName.equals(TEST_DATABASE) || !tableName.equals(TEST_TABLE_UP_NAME)) { + throw new NoSuchObjectException(); + } + testPartitionUp1 = partitions.get(0); + testPartitionUp2 = partitions.get(1); + } + + @Override + public List listRoles(String principalName, PrincipalType principalType) + { + throw new UnsupportedOperationException(); + } + + @Override + public List listPrivileges(String principalName, PrincipalType principalType, HiveObjectRef hiveObjectRef) + { + throw new UnsupportedOperationException(); + } + + @Override + public List getRoleNames() + { + accessCount.incrementAndGet(); + if (throwException) { + throw new IllegalStateException(); + } + return TEST_ROLES; + } + + @Override + public void createRole(String role, String grantor) + throws TException + { + // No-op + } + + @Override + public void dropRole(String role) + throws TException + { + // No-op + } + + @Override + public boolean grantPrivileges(PrivilegeBag privilegeBag) + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean revokePrivileges(PrivilegeBag privilegeBag) + { + throw new 
UnsupportedOperationException(); + } + + @Override + public void grantRole(String role, String granteeName, PrincipalType granteeType, String grantorName, PrincipalType grantorType, boolean grantOption) + throws TException + { + // No-op + } + + @Override + public void revokeRole(String role, String granteeName, PrincipalType granteeType, boolean grantOption) + throws TException + { + // No-op + } + + @Override + public List listRoleGrants(String name, PrincipalType principalType) + throws TException + { + accessCount.incrementAndGet(); + if (throwException) { + throw new IllegalStateException(); + } + return TEST_ROLE_GRANTS; + } + + @Override + public void close() + { + // No-op + } + + @Override + public void setUGI(String userName) + { + // No-op + } + + @Override + public long openTransaction(String user) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public void commitTransaction(long transactionId) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public void abortTransaction(long transactionId) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public void sendTransactionHeartbeat(long transactionId) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public LockResponse acquireLock(LockRequest lockRequest) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public LockResponse checkLock(long lockId) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public String getValidWriteIds(List tableList, long currentTransactionId, boolean isVacuum) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public long getTableWriteId(String dbName, String tableName, long transactionId) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public ShowLocksResponse showLocks(ShowLocksRequest rqst) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public String get_config_value(String name, String defaultValue) + throws TException + { + throw new UnsupportedOperationException(); + } + + @Override + public String getDelegationToken(String userName) + { + throw new UnsupportedOperationException(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClientFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClientFactory.java new file mode 100644 index 00000000..da05c70d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/MockThriftMetastoreClientFactory.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive.metastore.thrift;
+
+import com.google.common.net.HostAndPort;
+import io.airlift.units.Duration;
+import io.prestosql.plugin.hive.authentication.NoHiveMetastoreAuthentication;
+import org.apache.thrift.transport.TTransportException;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+import static com.google.common.base.Preconditions.checkState;
+import static java.util.Objects.requireNonNull;
+
+public class MockThriftMetastoreClientFactory
+        extends ThriftMetastoreClientFactory
+{
+    private final List<ThriftMetastoreClient> clients;
+
+    public MockThriftMetastoreClientFactory(Optional<HostAndPort> socksProxy, Duration timeout, List<ThriftMetastoreClient> clients)
+    {
+        super(Optional.empty(), socksProxy, timeout, new NoHiveMetastoreAuthentication(), "localhost");
+        this.clients = new ArrayList<>(requireNonNull(clients, "clients is null"));
+    }
+
+    @Override
+    public ThriftMetastoreClient create(HostAndPort address)
+            throws TTransportException
+    {
+        checkState(!clients.isEmpty(), "mock not given enough clients");
+        ThriftMetastoreClient client = clients.remove(0);
+        if (client == null) {
+            throw new TTransportException(TTransportException.TIMED_OUT);
+        }
+        if (client instanceof MockThriftMetastoreClient) {
+            ((MockThriftMetastoreClient) client).setHostAddress(address.getHost() + ":" + address.getPortOrDefault(8080));
+        }
+        return client;
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreConfig.java
new file mode 100644
index 00000000..d43ab633
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreConfig.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.metastore.thrift;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import org.testng.annotations.Test;
+
+import java.net.URI;
+import java.util.Map;
+
+import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping;
+import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults;
+import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults;
+import static org.testng.Assert.assertEquals;
+
+public class TestStaticMetastoreConfig
+{
+    @Test
+    public void testDefaults()
+    {
+        assertRecordedDefaults(recordDefaults(StaticMetastoreConfig.class)
+                .setMetastoreUris(null)
+                .setMetastoreUsername(null));
+    }
+
+    @Test
+    public void testExplicitPropertyMappingsSingleMetastore()
+    {
+        Map<String, String> properties = new ImmutableMap.Builder<String, String>()
+                .put("hive.metastore.uri", "thrift://localhost:9083")
+                .put("hive.metastore.username", "presto")
+                .build();
+
+        StaticMetastoreConfig expected = new StaticMetastoreConfig()
+                .setMetastoreUris("thrift://localhost:9083")
+                .setMetastoreUsername("presto");
+
+        assertFullMapping(properties, expected);
+        assertEquals(expected.getMetastoreUris(), ImmutableList.of(URI.create("thrift://localhost:9083")));
+        assertEquals(expected.getMetastoreUsername(), "presto");
+    }
+
+    @Test
+    public void testExplicitPropertyMappingsMultipleMetastores()
+    {
+        Map<String, String> properties = new ImmutableMap.Builder<String, String>()
+                .put("hive.metastore.uri", "thrift://localhost:9083,thrift://192.0.2.3:8932")
+                .put("hive.metastore.username", "presto")
+                .build();
+
+        StaticMetastoreConfig expected = new StaticMetastoreConfig()
+                .setMetastoreUris("thrift://localhost:9083,thrift://192.0.2.3:8932")
+                .setMetastoreUsername("presto");
+
+        assertFullMapping(properties, expected);
+        assertEquals(expected.getMetastoreUris(), ImmutableList.of(
+                URI.create("thrift://localhost:9083"),
+                URI.create("thrift://192.0.2.3:8932")));
+        assertEquals(expected.getMetastoreUsername(), "presto");
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreLocator.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreLocator.java
new file mode 100644
index 00000000..4b59ce65
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestStaticMetastoreLocator.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import io.airlift.units.Duration; +import org.apache.thrift.TException; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import static java.util.Arrays.asList; +import static java.util.Collections.singletonList; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.testng.Assert.assertEquals; + +public class TestStaticMetastoreLocator +{ + private static final ThriftMetastoreClient DEFAULT_CLIENT = createFakeMetastoreClient(); + private static final ThriftMetastoreClient FALLBACK_CLIENT = createFakeMetastoreClient(); + private static final ThriftMetastoreClient FALLBACK_CLIENT2 = createFakeMetastoreClient(); + + private static final StaticMetastoreConfig CONFIG_WITH_FALLBACK = new StaticMetastoreConfig() + .setMetastoreUris("thrift://default:8080,thrift://fallback:8090,thrift://fallback2:8090"); + + private static final StaticMetastoreConfig CONFIG_WITHOUT_FALLBACK = new StaticMetastoreConfig() + .setMetastoreUris("thrift://default:8080"); + + private static final StaticMetastoreConfig CONFIG_WITH_FALLBACK_WITH_USER = new StaticMetastoreConfig() + .setMetastoreUris("thrift://default:8080,thrift://fallback:8090,thrift://fallback2:8090") + .setMetastoreUsername("presto"); + + private static final StaticMetastoreConfig CONFIG_WITHOUT_FALLBACK_WITH_USER = new StaticMetastoreConfig() + .setMetastoreUris("thrift://default:8080") + .setMetastoreUsername("presto"); + + @Test + public void testDefaultHiveMetastore() + throws TException + { + MetastoreLocator locator = createMetastoreLocator(CONFIG_WITH_FALLBACK, singletonList(DEFAULT_CLIENT)); + assertEquals(locator.createMetastoreClient(), DEFAULT_CLIENT); + } + + @Test + public void testFallbackHiveMetastore() + throws TException + { + MetastoreLocator locator = createMetastoreLocator(CONFIG_WITH_FALLBACK, asList(null, null, FALLBACK_CLIENT)); + assertEquals(locator.createMetastoreClient(), FALLBACK_CLIENT); + } + + @Test + public void testFallbackHiveMetastoreFails() + { + MetastoreLocator locator = createMetastoreLocator(CONFIG_WITH_FALLBACK, asList(null, null, null)); + assertCreateClientFails(locator, "Failed connecting to Hive metastore:"); + } + + @Test + public void testMetastoreFailedWithoutFallback() + { + MetastoreLocator locator = createMetastoreLocator(CONFIG_WITHOUT_FALLBACK, singletonList(null)); + assertCreateClientFails(locator, "Failed connecting to Hive metastore: [default:8080]"); + } + + @Test + public void testFallbackHiveMetastoreWithHiveUser() + throws TException + { + MetastoreLocator locator = createMetastoreLocator(CONFIG_WITH_FALLBACK_WITH_USER, asList(null, null, FALLBACK_CLIENT)); + assertEquals(locator.createMetastoreClient(), FALLBACK_CLIENT); + } + + @Test + public void testMetastoreFailedWithoutFallbackWithHiveUser() + { + MetastoreLocator locator = createMetastoreLocator(CONFIG_WITHOUT_FALLBACK_WITH_USER, singletonList(null)); + assertCreateClientFails(locator, "Failed connecting to Hive metastore: [default:8080]"); + } + + @Test + public void testRoundRobinHiveMetastore() + throws TException + { + MetastoreLocator locator = createMetastoreLocator(CONFIG_WITH_FALLBACK, + asList(DEFAULT_CLIENT, FALLBACK_CLIENT, FALLBACK_CLIENT2, + DEFAULT_CLIENT, FALLBACK_CLIENT, FALLBACK_CLIENT2)); + MockThriftMetastoreClient client; + List pass1 = new ArrayList(){{ + add("default:8080"); + add("fallback:8090"); + 
add("fallback2:8090"); + }}; + List pass2 = new ArrayList<>(); + + /* PASS-1 */ + client = (MockThriftMetastoreClient) locator.createMetastoreClient(); + pass2.add(client.getHostAddress()); + assertEquals(pass1.contains(client.getHostAddress()), true); + pass1.remove(client.getHostAddress()); + client = (MockThriftMetastoreClient) locator.createMetastoreClient(); + pass2.add(client.getHostAddress()); + assertEquals(pass1.contains(client.getHostAddress()), true); + pass1.remove(client.getHostAddress()); + client = (MockThriftMetastoreClient) locator.createMetastoreClient(); + pass2.add(client.getHostAddress()); + assertEquals(pass1.contains(client.getHostAddress()), true); + pass1.remove(client.getHostAddress()); + + /* PASS-2 */ + client = (MockThriftMetastoreClient) locator.createMetastoreClient(); + assertEquals(pass2.get(0), client.getHostAddress()); + client = (MockThriftMetastoreClient) locator.createMetastoreClient(); + assertEquals(pass2.get(1), client.getHostAddress()); + client = (MockThriftMetastoreClient) locator.createMetastoreClient(); + assertEquals(pass2.get(2), client.getHostAddress()); + } + + private static void assertCreateClientFails(MetastoreLocator locator, String message) + { + assertThatThrownBy(locator::createMetastoreClient) + .hasCauseInstanceOf(TException.class) + .hasMessageStartingWith(message); + } + + private static MetastoreLocator createMetastoreLocator(StaticMetastoreConfig config, List clients) + { + return new StaticMetastoreLocator(config, new MockThriftMetastoreClientFactory(Optional.empty(), new Duration(1, SECONDS), clients)); + } + + private static ThriftMetastoreClient createFakeMetastoreClient() + { + return new MockThriftMetastoreClient(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftHiveMetastoreConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftHiveMetastoreConfig.java new file mode 100644 index 00000000..8f088ee6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftHiveMetastoreConfig.java @@ -0,0 +1,70 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.prestosql.plugin.hive.metastore.thrift;
+
+import com.google.common.collect.ImmutableMap;
+import io.airlift.units.Duration;
+import org.testng.annotations.Test;
+
+import java.util.Map;
+
+import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping;
+import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults;
+import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults;
+import static java.util.concurrent.TimeUnit.MINUTES;
+import static java.util.concurrent.TimeUnit.SECONDS;
+
+public class TestThriftHiveMetastoreConfig
+{
+    @Test
+    public void testDefaults()
+    {
+        assertRecordedDefaults(recordDefaults(ThriftHiveMetastoreConfig.class)
+                .setMaxRetries(9)
+                .setBackoffScaleFactor(2.0)
+                .setMinBackoffDelay(new Duration(1, SECONDS))
+                .setMaxBackoffDelay(new Duration(1, SECONDS))
+                .setMaxRetryTime(new Duration(30, SECONDS))
+                .setMaxWaitForTransactionLock(new Duration(10, MINUTES))
+                .setRoleNameCaseSensitive(false)
+                .setImpersonationEnabled(false));
+    }
+
+    @Test
+    public void testExplicitPropertyMappings()
+    {
+        Map<String, String> properties = new ImmutableMap.Builder<String, String>()
+                .put("hive.metastore.thrift.client.max-retries", "15")
+                .put("hive.metastore.thrift.client.backoff-scale-factor", "3.0")
+                .put("hive.metastore.thrift.client.min-backoff-delay", "2s")
+                .put("hive.metastore.thrift.client.max-backoff-delay", "4s")
+                .put("hive.metastore.thrift.client.max-retry-time", "60s")
+                .put("hive.metastore.thrift.txn-lock-max-wait", "5m")
+                .put("hive.metastore.thrift.is-role-name-case-sensitive", "true")
+                .put("hive.metastore.thrift.impersonation.enabled", "true")
+                .build();
+
+        ThriftHiveMetastoreConfig expected = new ThriftHiveMetastoreConfig()
+                .setMaxRetries(15)
+                .setBackoffScaleFactor(3.0)
+                .setMinBackoffDelay(new Duration(2, SECONDS))
+                .setMaxBackoffDelay(new Duration(4, SECONDS))
+                .setMaxRetryTime(new Duration(60, SECONDS))
+                .setMaxWaitForTransactionLock(new Duration(5, MINUTES))
+                .setRoleNameCaseSensitive(true)
+                .setImpersonationEnabled(true);
+
+        assertFullMapping(properties, expected);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftMetastoreUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftMetastoreUtil.java
new file mode 100644
index 00000000..edd8268d
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestThriftMetastoreUtil.java
@@ -0,0 +1,414 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.metastore.BooleanStatistics; +import io.prestosql.plugin.hive.metastore.DateStatistics; +import io.prestosql.plugin.hive.metastore.DecimalStatistics; +import io.prestosql.plugin.hive.metastore.DoubleStatistics; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.HivePrincipal; +import io.prestosql.plugin.hive.metastore.IntegerStatistics; +import io.prestosql.spi.security.PrincipalType; +import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; +import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Date; +import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; +import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; +import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; +import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; +import org.testng.annotations.Test; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.Locale; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; + +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.applyRoleNameCaseSensitive; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.fromMetastoreApiColumnStatistics; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.toMetastoreDecimal; +import static io.prestosql.plugin.hive.metastore.thrift.ThriftMetastoreUtil.updateStatisticsParameters; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.binaryStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.booleanStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.dateStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.decimalStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.doubleStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.longStats; +import static org.apache.hadoop.hive.metastore.api.ColumnStatisticsData.stringStats; +import static org.apache.hadoop.hive.serde.serdeConstants.BIGINT_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.BINARY_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.BOOLEAN_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.DATE_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.DECIMAL_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.DOUBLE_TYPE_NAME; +import static org.apache.hadoop.hive.serde.serdeConstants.STRING_TYPE_NAME; +import static org.testng.Assert.assertEquals; + +public class TestThriftMetastoreUtil +{ + @Test + public void testLongStatsToColumnStatistics() + { + LongColumnStatsData longColumnStatsData = new LongColumnStatsData(); + longColumnStatsData.setLowValue(0); + longColumnStatsData.setHighValue(100); + longColumnStatsData.setNumNulls(1); + longColumnStatsData.setNumDVs(20); + ColumnStatisticsObj columnStatisticsObj = new 
ColumnStatisticsObj("my_col", BIGINT_TYPE_NAME, longStats(longColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); + + assertEquals(actual.getIntegerStatistics(), Optional.of(new IntegerStatistics(OptionalLong.of(0), OptionalLong.of(100)))); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.of(1)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(19)); + } + + @Test + public void testEmptyLongStatsToColumnStatistics() + { + LongColumnStatsData emptyLongColumnStatsData = new LongColumnStatsData(); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BIGINT_TYPE_NAME, longStats(emptyLongColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), Optional.of(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty()))); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.empty()); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testDoubleStatsToColumnStatistics() + { + DoubleColumnStatsData doubleColumnStatsData = new DoubleColumnStatsData(); + doubleColumnStatsData.setLowValue(0); + doubleColumnStatsData.setHighValue(100); + doubleColumnStatsData.setNumNulls(1); + doubleColumnStatsData.setNumDVs(20); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(doubleColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.of(new DoubleStatistics(OptionalDouble.of(0), OptionalDouble.of(100)))); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.of(1)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(19)); + } + + @Test + public void testEmptyDoubleStatsToColumnStatistics() + { + DoubleColumnStatsData emptyDoubleColumnStatsData = new DoubleColumnStatsData(); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(emptyDoubleColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), 
Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.of(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty()))); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.empty()); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testDecimalStatsToColumnStatistics() + { + DecimalColumnStatsData decimalColumnStatsData = new DecimalColumnStatsData(); + BigDecimal low = new BigDecimal("0"); + decimalColumnStatsData.setLowValue(toMetastoreDecimal(low)); + BigDecimal high = new BigDecimal("100"); + decimalColumnStatsData.setHighValue(toMetastoreDecimal(high)); + decimalColumnStatsData.setNumNulls(1); + decimalColumnStatsData.setNumDVs(20); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DECIMAL_TYPE_NAME, decimalStats(decimalColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.of(new DecimalStatistics(Optional.of(low), Optional.of(high)))); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.of(1)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(19)); + } + + @Test + public void testEmptyDecimalStatsToColumnStatistics() + { + DecimalColumnStatsData emptyDecimalColumnStatsData = new DecimalColumnStatsData(); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DECIMAL_TYPE_NAME, decimalStats(emptyDecimalColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.of(new DecimalStatistics(Optional.empty(), Optional.empty()))); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.empty()); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testBooleanStatsToColumnStatistics() + { + BooleanColumnStatsData booleanColumnStatsData = new BooleanColumnStatsData(); + booleanColumnStatsData.setNumTrues(100); + booleanColumnStatsData.setNumFalses(10); + booleanColumnStatsData.setNumNulls(0); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(booleanColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + 
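+ // only the boolean bucket should be populated (numTrues=100, numFalses=10); the numeric, decimal, date and size statistics stay empty, the null count is zero, and no distinct-value count is derived for booleans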
assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.of(100), OptionalLong.of(10)))); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.of(0)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testImpalaGeneratedBooleanStatistics() + { + BooleanColumnStatsData statsData = new BooleanColumnStatsData(1L, -1L, 2L); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(statsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.of(2)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty()))); + } + + @Test + public void testEmptyBooleanStatsToColumnStatistics() + { + BooleanColumnStatsData emptyBooleanColumnStatsData = new BooleanColumnStatsData(); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BOOLEAN_TYPE_NAME, booleanStats(emptyBooleanColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.of(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty()))); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.empty()); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testDateStatsToColumnStatistics() + { + DateColumnStatsData dateColumnStatsData = new DateColumnStatsData(); + dateColumnStatsData.setLowValue(new Date(1000)); + dateColumnStatsData.setHighValue(new Date(2000)); + dateColumnStatsData.setNumNulls(1); + dateColumnStatsData.setNumDVs(20); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DATE_TYPE_NAME, dateStats(dateColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(1000)); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), 
Optional.of(new DateStatistics(Optional.of(LocalDate.ofEpochDay(1000)), Optional.of(LocalDate.ofEpochDay(2000))))); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.of(1)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(19)); + } + + @Test + public void testEmptyDateStatsToColumnStatistics() + { + DateColumnStatsData emptyDateColumnStatsData = new DateColumnStatsData(); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DATE_TYPE_NAME, dateStats(emptyDateColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.of(new DateStatistics(Optional.empty(), Optional.empty()))); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.empty()); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testStringStatsToColumnStatistics() + { + StringColumnStatsData stringColumnStatsData = new StringColumnStatsData(); + stringColumnStatsData.setMaxColLen(100); + stringColumnStatsData.setAvgColLen(23.333); + stringColumnStatsData.setNumNulls(1); + stringColumnStatsData.setNumDVs(20); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", STRING_TYPE_NAME, stringStats(stringColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(2)); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.of(100)); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.of(23)); + assertEquals(actual.getNullsCount(), OptionalLong.of(1)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(1)); + } + + @Test + public void testEmptyStringColumnStatsData() + { + StringColumnStatsData emptyStringColumnStatsData = new StringColumnStatsData(); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", STRING_TYPE_NAME, stringStats(emptyStringColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.empty()); + 
assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testBinaryStatsToColumnStatistics() + { + BinaryColumnStatsData binaryColumnStatsData = new BinaryColumnStatsData(); + binaryColumnStatsData.setMaxColLen(100); + binaryColumnStatsData.setAvgColLen(22.2); + binaryColumnStatsData.setNumNulls(2); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BINARY_TYPE_NAME, binaryStats(binaryColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(4)); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.of(100)); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.of(44)); + assertEquals(actual.getNullsCount(), OptionalLong.of(2)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testEmptyBinaryStatsToColumnStatistics() + { + BinaryColumnStatsData emptyBinaryColumnStatsData = new BinaryColumnStatsData(); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", BINARY_TYPE_NAME, binaryStats(emptyBinaryColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.empty()); + + assertEquals(actual.getIntegerStatistics(), Optional.empty()); + assertEquals(actual.getDoubleStatistics(), Optional.empty()); + assertEquals(actual.getDecimalStatistics(), Optional.empty()); + assertEquals(actual.getDateStatistics(), Optional.empty()); + assertEquals(actual.getBooleanStatistics(), Optional.empty()); + assertEquals(actual.getMaxValueSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getTotalSizeInBytes(), OptionalLong.empty()); + assertEquals(actual.getNullsCount(), OptionalLong.empty()); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.empty()); + } + + @Test + public void testSingleDistinctValue() + { + DoubleColumnStatsData doubleColumnStatsData = new DoubleColumnStatsData(); + doubleColumnStatsData.setNumNulls(10); + doubleColumnStatsData.setNumDVs(1); + ColumnStatisticsObj columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(doubleColumnStatsData)); + HiveColumnStatistics actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(10)); + + assertEquals(actual.getNullsCount(), OptionalLong.of(10)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(0)); + + doubleColumnStatsData = new DoubleColumnStatsData(); + doubleColumnStatsData.setNumNulls(10); + doubleColumnStatsData.setNumDVs(1); + columnStatisticsObj = new ColumnStatisticsObj("my_col", DOUBLE_TYPE_NAME, doubleStats(doubleColumnStatsData)); + actual = fromMetastoreApiColumnStatistics(columnStatisticsObj, OptionalLong.of(11)); + + assertEquals(actual.getNullsCount(), OptionalLong.of(10)); + assertEquals(actual.getDistinctValuesCount(), OptionalLong.of(1)); + } + + @Test + public void testBasicStatisticsRoundTrip() + { + testBasicStatisticsRoundTrip(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.empty(), OptionalLong.empty())); + testBasicStatisticsRoundTrip(new HiveBasicStatistics(OptionalLong.of(1), OptionalLong.empty(), OptionalLong.of(2), 
OptionalLong.empty())); + testBasicStatisticsRoundTrip(new HiveBasicStatistics(OptionalLong.of(1), OptionalLong.of(2), OptionalLong.of(3), OptionalLong.of(4))); + } + + private static void testBasicStatisticsRoundTrip(HiveBasicStatistics expected) + { + assertEquals(getHiveBasicStatistics(updateStatisticsParameters(ImmutableMap.of(), expected)), expected); + } + + @Test + public void testApplyRoleNameCaseSensitive() + { + assertEquals(applyRoleNameCaseSensitive(null, true), null); + + String userName = "TestString"; + HivePrincipal caseSensitivePrincipalUser = new HivePrincipal(PrincipalType.USER, userName); + HivePrincipal caseSensitivePrincipalRole = new HivePrincipal(PrincipalType.ROLE, userName); + assertEquals(applyRoleNameCaseSensitive(caseSensitivePrincipalUser, false).getName(), userName); + assertEquals(applyRoleNameCaseSensitive(caseSensitivePrincipalUser, true).getName(), userName); + assertEquals(applyRoleNameCaseSensitive(caseSensitivePrincipalRole, true).getName(), userName); + // only impact the role principal and isRoleNameCaseSensitive=false + assertEquals(applyRoleNameCaseSensitive(caseSensitivePrincipalRole, false).getName(), userName.toLowerCase(Locale.ENGLISH)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestingMetastoreLocator.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestingMetastoreLocator.java new file mode 100644 index 00000000..900ce951 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/metastore/thrift/TestingMetastoreLocator.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.metastore.thrift; + +import com.google.common.net.HostAndPort; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.authentication.NoHiveMetastoreAuthentication; +import org.apache.thrift.TException; + +import static java.util.Objects.requireNonNull; + +public class TestingMetastoreLocator + implements MetastoreLocator +{ + private final HiveConfig config; + private final HostAndPort address; + + public TestingMetastoreLocator(HiveConfig config, String host, int port) + { + this.config = requireNonNull(config, "config is null"); + this.address = HostAndPort.fromParts(requireNonNull(host, "host is null"), port); + } + + @Override + public ThriftMetastoreClient createMetastoreClient() + throws TException + { + return new ThriftMetastoreClientFactory(config, new NoHiveMetastoreAuthentication()).create(address); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcAcidPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcAcidPageSource.java new file mode 100644 index 00000000..aca0f525 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcAcidPageSource.java @@ -0,0 +1,270 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.tpch.Nation; +import io.airlift.tpch.NationColumn; +import io.airlift.tpch.NationGenerator; +import io.prestosql.metadata.Metadata; +import io.prestosql.orc.OrcCacheStore; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HivePageSourceFactory; +import io.prestosql.plugin.hive.HiveTestUtils; +import io.prestosql.plugin.hive.HiveTypeTranslator; +import io.prestosql.spi.Page; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeManager; +import io.prestosql.type.InternalTypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; +import org.testng.annotations.Test; + +import java.io.File; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.Properties; +import java.util.Set; +import java.util.function.LongPredicate; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.tpch.NationColumn.COMMENT; +import static io.airlift.tpch.NationColumn.NAME; +import static io.airlift.tpch.NationColumn.NATION_KEY; +import static io.airlift.tpch.NationColumn.REGION_KEY; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HiveStorageFormat.ORC; +import static io.prestosql.plugin.hive.HiveType.toHiveType; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static java.util.Collections.nCopies; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_IS_TRANSACTIONAL; +import static org.apache.hadoop.hive.ql.io.AcidUtils.deleteDeltaSubdir; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.testng.Assert.assertEquals; + +public class TestOrcAcidPageSource +{ + private static final Metadata METADATA = createTestMetadataManager(); + public static final TypeManager TYPE_MANAGER = new InternalTypeManager(METADATA.getFunctionAndTypeManager()); + + private static final HivePageSourceFactory PAGE_SOURCE_FACTORY = new OrcPageSourceFactory( + TYPE_MANAGER, + new HiveConfig().setUseOrcColumnNames(false), + HiveTestUtils.createTestHdfsEnvironment(new HiveConfig()), + new FileFormatDataSourceStats(), OrcCacheStore.builder().newCacheStore( + new HiveConfig().getOrcFileTailCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcFileTailCacheTtl().toMillis()), + new HiveConfig().getOrcStripeFooterCacheLimit(), + Duration.ofMillis(new HiveConfig().getOrcStripeFooterCacheTtl().toMillis()), + new HiveConfig().getOrcRowIndexCacheLimit(), Duration.ofMillis(new HiveConfig().getOrcRowIndexCacheTtl().toMillis()), + new HiveConfig().getOrcBloomFiltersCacheLimit(), + 
Duration.ofMillis(new HiveConfig().getOrcBloomFiltersCacheTtl().toMillis()), + new HiveConfig().getOrcRowDataCacheMaximumWeight(), Duration.ofMillis(new HiveConfig().getOrcRowDataCacheTtl().toMillis()), + new HiveConfig().isOrcCacheStatsMetricCollectionEnabled())); + + @Test + public void testFullFileRead() + { + assertRead(ImmutableSet.copyOf(NationColumn.values()), OptionalLong.empty(), Optional.empty(), nationKey -> false); + } + + @Test + public void testSingleColumnRead() + { + assertRead(ImmutableSet.of(REGION_KEY), OptionalLong.empty(), Optional.empty(), nationKey -> false); + } + + /** + * tests file stats based pruning works fine + */ + @Test + public void testFullFileSkipped() + { + assertRead(ImmutableSet.copyOf(NationColumn.values()), OptionalLong.of(100L), Optional.empty(), nationKey -> false); + } + + /** + * Tests stripe stats and row groups stats based pruning works fine + */ + @Test + public void testSomeStripesAndRowGroupRead() + { + assertRead(ImmutableSet.copyOf(NationColumn.values()), OptionalLong.of(5L), Optional.empty(), nationKey -> false); + } + + @Test + public void testDeletedRows() + { + Path partitionLocation = new Path(getClass().getClassLoader().getResource("nation_delete_deltas") + "/"); + Optional deleteDeltaLocations = DeleteDeltaLocations.builder(partitionLocation) + .addDeleteDelta(new Path(partitionLocation, deleteDeltaSubdir(3L, 3L, 0)), 3L, 3L, 0) + .addDeleteDelta(new Path(partitionLocation, deleteDeltaSubdir(4L, 4L, 0)), 4L, 4L, 0) + .build(); + + assertRead(ImmutableSet.copyOf(NationColumn.values()), OptionalLong.empty(), deleteDeltaLocations, nationKey -> nationKey == 5 || nationKey == 19); + } + + private static void assertRead(Set columns, OptionalLong nationKeyPredicate, Optional deleteDeltaLocations, LongPredicate deletedRows) + { + TupleDomain tupleDomain = TupleDomain.all(); + if (nationKeyPredicate.isPresent()) { + tupleDomain = TupleDomain.withColumnDomains(ImmutableMap.of(toHiveColumnHandle(NATION_KEY), Domain.singleValue(BIGINT, nationKeyPredicate.getAsLong()))); + } + + List actual = readFile(columns, tupleDomain, deleteDeltaLocations); + + List expected = new ArrayList<>(); + for (Nation nation : ImmutableList.copyOf(new NationGenerator().iterator())) { + if (nationKeyPredicate.isPresent() && nationKeyPredicate.getAsLong() != nation.getNationKey()) { + continue; + } + if (deletedRows.test(nation.getNationKey())) { + continue; + } + expected.addAll(nCopies(1000, nation)); + } + + assertEqualsByColumns(columns, actual, expected); + } + + private static List readFile(Set columns, TupleDomain tupleDomain, Optional deleteDeltaLocations) + { + List columnHandles = columns.stream() + .map(TestOrcAcidPageSource::toHiveColumnHandle) + .collect(toImmutableList()); + + List columnNames = columnHandles.stream() + .map(HiveColumnHandle::getName) + .collect(toImmutableList()); + + // This file has the contains the TPC-H nation table which each row repeated 1000 times + File nationFileWithReplicatedRows = new File(TestOrcAcidPageSource.class.getClassLoader().getResource("nationFile25kRowsSortedOnNationKey/bucket_00000").getPath()); + + ConnectorPageSource pageSource = PAGE_SOURCE_FACTORY.createPageSource( + new JobConf(new Configuration(false)), + HiveTestUtils.SESSION, + new Path(nationFileWithReplicatedRows.getAbsoluteFile().toURI()), + 0, + nationFileWithReplicatedRows.length(), + nationFileWithReplicatedRows.length(), + createSchema(), + columnHandles, + tupleDomain, + Optional.empty(), + deleteDeltaLocations, + Optional.empty(), + 
Optional.empty(), + null, + false, -1L).get(); + + int nationKeyColumn = columnNames.indexOf("n_nationkey"); + int nameColumn = columnNames.indexOf("n_name"); + int regionKeyColumn = columnNames.indexOf("n_regionkey"); + int commentColumn = columnNames.indexOf("n_comment"); + + ImmutableList.Builder rows = ImmutableList.builder(); + while (!pageSource.isFinished()) { + Page page = pageSource.getNextPage(); + if (page == null) { + continue; + } + + page = page.getLoadedPage(); + for (int position = 0; position < page.getPositionCount(); position++) { + long nationKey = -42; + if (nationKeyColumn >= 0) { + nationKey = BIGINT.getLong(page.getBlock(nationKeyColumn), position); + } + + String name = ""; + if (nameColumn >= 0) { + name = VARCHAR.getSlice(page.getBlock(nameColumn), position).toStringUtf8(); + } + + long regionKey = -42; + if (regionKeyColumn >= 0) { + regionKey = BIGINT.getLong(page.getBlock(regionKeyColumn), position); + } + + String comment = ""; + if (commentColumn >= 0) { + comment = VARCHAR.getSlice(page.getBlock(commentColumn), position).toStringUtf8(); + } + + rows.add(new Nation(position, nationKey, name, regionKey, comment)); + } + } + return rows.build(); + } + + private static HiveColumnHandle toHiveColumnHandle(NationColumn nationColumn) + { + Type prestoType; + switch (nationColumn.getType().getBase()) { + case IDENTIFIER: + case INTEGER: + prestoType = BIGINT; + break; + case VARCHAR: + prestoType = VARCHAR; + break; + default: + throw new IllegalStateException("Unexpected value: " + nationColumn.getType().getBase()); + } + + return new HiveColumnHandle( + nationColumn.getColumnName(), + toHiveType(new HiveTypeTranslator(), prestoType), + prestoType.getTypeSignature(), + 0, + REGULAR, + Optional.empty()); + } + + private static Properties createSchema() + { + Properties schema = new Properties(); + schema.setProperty(SERIALIZATION_LIB, ORC.getSerDe()); + schema.setProperty(FILE_INPUT_FORMAT, ORC.getInputFormat()); + schema.setProperty(TABLE_IS_TRANSACTIONAL, "true"); + return schema; + } + + private static void assertEqualsByColumns(Set columns, List actualRows, List expectedRows) + { + assertEquals(actualRows.size(), expectedRows.size(), "row count"); + for (int i = 0; i < actualRows.size(); i++) { + Nation actual = actualRows.get(i); + Nation expected = expectedRows.get(i); + assertEquals(actual.getNationKey(), columns.contains(NATION_KEY) ? expected.getNationKey() : -42); + assertEquals(actual.getName(), columns.contains(NAME) ? expected.getName() : ""); + assertEquals(actual.getRegionKey(), columns.contains(REGION_KEY) ? expected.getRegionKey() : -42); + assertEquals(actual.getComment(), columns.contains(COMMENT) ? expected.getComment() : ""); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeleteDeltaPageSource.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeleteDeltaPageSource.java new file mode 100644 index 00000000..3f02c3cd --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeleteDeltaPageSource.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableList; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HiveTestUtils; +import io.prestosql.testing.MaterializedResult; +import io.prestosql.testing.MaterializedRow; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; +import org.testng.annotations.Test; + +import java.io.File; + +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static org.testng.Assert.assertEquals; + +public class TestOrcDeleteDeltaPageSource +{ + private static final File DELETE_FILE = new File(TestOrcDeleteDeltaPageSource.class.getClassLoader().getResource("fullacid_delete_delta_test/delete_delta_0000004_0000004_0000/bucket_00000").getPath()); + + @Test + public void testReadingDeletedRows() + { + OrcDeleteDeltaPageSourceFactory pageSourceFactory = new OrcDeleteDeltaPageSourceFactory( + "test", + new JobConf(new Configuration(false)), + HiveTestUtils.HDFS_ENVIRONMENT, + new DataSize(1, MEGABYTE), + new DataSize(8, MEGABYTE), + new DataSize(8, MEGABYTE), + new DataSize(16, MEGABYTE), + new DataSize(8, MEGABYTE), + true, + false, + new FileFormatDataSourceStats()); + + OrcDeleteDeltaPageSource pageSource = pageSourceFactory.createPageSource(new Path(DELETE_FILE.toURI()), DELETE_FILE.length(), DELETE_FILE.lastModified()); + MaterializedResult materializedRows = MaterializedResult.materializeSourceDataStream(HiveTestUtils.SESSION, pageSource, ImmutableList.of(BIGINT, INTEGER, BIGINT)); + + assertEquals(materializedRows.getRowCount(), 1); + assertEquals(materializedRows.getMaterializedRows().get(0), new MaterializedRow(5, 2L, 536870912, 0L)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeletedRows.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeletedRows.java new file mode 100644 index 00000000..6575a298 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/orc/TestOrcDeletedRows.java @@ -0,0 +1,141 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.orc; + +import com.google.common.collect.ImmutableSet; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.DeleteDeltaLocations; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HiveTestUtils; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.RunLengthEncodedBlock; +import io.prestosql.testing.MaterializedResult; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.mapred.JobConf; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.Optional; +import java.util.Set; + +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static org.testng.Assert.assertEquals; + +public class TestOrcDeletedRows +{ + private Path partitionDirectory; + private Block bucketBlock; + private Block rowIdBlock; + + @BeforeClass + public void setUp() + { + partitionDirectory = new Path(TestOrcDeletedRows.class.getClassLoader().getResource("fullacid_delete_delta_test") + "/"); + bucketBlock = INTEGER.createFixedSizeBlockBuilder(1) + .writeInt(536870912) + .build(); + rowIdBlock = BIGINT.createFixedSizeBlockBuilder(1) + .writeLong(0) + .build(); + } + + @Test + public void testEmptyDeleteLocations() + { + OrcDeletedRows deletedRows = createOrcDeletedRows(Optional.empty()); + + Page testPage = createTestPage(0, 10); + Block block = deletedRows.getMaskDeletedRowsFunction(testPage, Optional.empty()).apply(testPage.getBlock(2)); + assertEquals(block.getPositionCount(), 10); + } + + @Test + public void testDeleteLocations() + { + DeleteDeltaLocations.Builder deleteDeltaLocationsBuilder = DeleteDeltaLocations.builder(partitionDirectory); + addDeleteDelta(deleteDeltaLocationsBuilder, 4L, 4L, 0); + addDeleteDelta(deleteDeltaLocationsBuilder, 7L, 7L, 0); + + OrcDeletedRows deletedRows = createOrcDeletedRows(deleteDeltaLocationsBuilder.build()); + + // page with deleted rows + Page testPage = createTestPage(0, 10); + Block block = deletedRows.getMaskDeletedRowsFunction(testPage, Optional.empty()).apply(testPage.getBlock(0)); + Set validRows = MaterializedResult.resultBuilder(HiveTestUtils.SESSION, BIGINT) + .page(new Page(block)) + .build() + .getOnlyColumnAsSet(); + + assertEquals(validRows.size(), 8); + assertEquals(validRows, ImmutableSet.of(0L, 1L, 3L, 4L, 5L, 7L, 8L, 9L)); + + // page with no deleted rows + testPage = createTestPage(10, 20); + block = deletedRows.getMaskDeletedRowsFunction(testPage, Optional.empty()).apply(testPage.getBlock(2)); + assertEquals(block.getPositionCount(), 10); + } + + private void addDeleteDelta(DeleteDeltaLocations.Builder deleteDeltaLocationsBuilder, long minWriteId, long maxWriteId, int statementId) + { + Path deleteDeltaPath = new Path(partitionDirectory, AcidUtils.deleteDeltaSubdir(minWriteId, maxWriteId, statementId)); + deleteDeltaLocationsBuilder.addDeleteDelta(deleteDeltaPath, minWriteId, maxWriteId, statementId); + } + + private OrcDeletedRows createOrcDeletedRows(Optional deleteDeltaLocations) + { + JobConf configuration = new JobConf(new Configuration(false)); + OrcDeleteDeltaPageSourceFactory pageSourceFactory = new OrcDeleteDeltaPageSourceFactory( + "test", + configuration, + HiveTestUtils.HDFS_ENVIRONMENT, + new 
DataSize(1, MEGABYTE), + new DataSize(8, MEGABYTE), + new DataSize(8, MEGABYTE), + new DataSize(16, MEGABYTE), + new DataSize(8, MEGABYTE), + true, + false, + new FileFormatDataSourceStats()); + + return new OrcDeletedRows( + "bucket_00000", + deleteDeltaLocations, + pageSourceFactory, + "test", + configuration, + HiveTestUtils.HDFS_ENVIRONMENT, + Optional.empty()); + } + + private Page createTestPage(int originalTransactionStart, int originalTransactionEnd) + { + int size = originalTransactionEnd - originalTransactionStart; + BlockBuilder originalTransaction = BIGINT.createFixedSizeBlockBuilder(size); + for (long i = originalTransactionStart; i < originalTransactionEnd; i++) { + originalTransaction.writeLong(i); + } + + return new Page( + size, + originalTransaction.build(), + new RunLengthEncodedBlock(bucketBlock, size), + new RunLengthEncodedBlock(rowIdBlock, size)); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/AbstractTestParquetReader.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/AbstractTestParquetReader.java new file mode 100644 index 00000000..61cb16b5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/AbstractTestParquetReader.java @@ -0,0 +1,1815 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet; + +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.AbstractSequentialIterator; +import com.google.common.collect.ContiguousSet; +import com.google.common.collect.DiscreteDomain; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Range; +import com.google.common.primitives.Shorts; +import io.airlift.units.DataSize; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.SqlDate; +import io.prestosql.spi.type.SqlDecimal; +import io.prestosql.spi.type.SqlTimestamp; +import io.prestosql.spi.type.SqlVarbinary; +import io.prestosql.spi.type.Type; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.parquet.schema.MessageType; +import org.joda.time.DateTimeZone; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.ThreadLocalRandom; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.google.common.base.Functions.compose; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.Iterables.concat; +import static com.google.common.collect.Iterables.cycle; +import static com.google.common.collect.Iterables.limit; +import static com.google.common.collect.Iterables.transform; +import static io.prestosql.plugin.hive.parquet.ParquetTester.insertNullEvery; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DecimalType.createDecimalType; +import static io.prestosql.spi.type.Decimals.MAX_PRECISION; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.RowType.field; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.testing.DateTimeTestingUtils.sqlTimestampOf; +import static io.prestosql.tests.StructuralTestUtil.mapType; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.lang.String.join; +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Arrays.asList; +import static java.util.Collections.singletonList; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardListObjectInspector; +import static 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardMapObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaByteObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDateObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaFloatObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaIntObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaLongObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaShortObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector; +import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; +import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; +import static org.testng.Assert.assertEquals; + +public abstract class AbstractTestParquetReader +{ + private static final int MAX_PRECISION_INT32 = toIntExact(maxPrecision(4)); + private static final int MAX_PRECISION_INT64 = toIntExact(maxPrecision(8)); + + private Logger parquetLogger; + + private final ParquetTester tester; + + public AbstractTestParquetReader(ParquetTester tester) + { + this.tester = tester; + } + + @BeforeClass + public void setUp() + { + assertEquals(DateTimeZone.getDefault(), DateTimeZone.forID("America/Bahia_Banderas")); + + // Parquet has excessive logging at INFO level + parquetLogger = Logger.getLogger("org.apache.parquet.hadoop"); + parquetLogger.setLevel(Level.WARNING); + } + + @Test + public void testArray() + throws Exception + { + Iterable> values = createTestArrays(limit(cycle(asList(1, null, 3, 5, null, null, null, 7, 11, null, 13, 17)), 30_000)); + tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), values, values, new ArrayType(INTEGER)); + } + + @Test + public void testEmptyArrays() + throws Exception + { + Iterable> values = limit(cycle(singletonList(Collections.emptyList())), 30_000); + tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), values, values, new ArrayType(INTEGER)); + } + + @Test + public void testNestedArrays() + throws Exception + { + int nestingLevel = ThreadLocalRandom.current().nextInt(1, 15); + ObjectInspector objectInspector = getStandardListObjectInspector(javaIntObjectInspector); + Type type = new ArrayType(INTEGER); + Iterable values = limit(cycle(asList(1, null, 3, null, 5, null, 7, null, null, null, 11, null, 13)), 3_210); + for (int i = 0; i < nestingLevel; i++) { + values = createNullableTestArrays(values); + objectInspector = getStandardListObjectInspector(objectInspector); + type = 
new ArrayType(type); + } + values = createTestArrays(values); + tester.testRoundTrip(objectInspector, values, values, type); + } + + @Test + public void testSingleLevelSchemaNestedArrays() + throws Exception + { + int nestingLevel = ThreadLocalRandom.current().nextInt(1, 15); + ObjectInspector objectInspector = getStandardListObjectInspector(javaIntObjectInspector); + Type type = new ArrayType(INTEGER); + Iterable values = intsBetween(0, 31_234); + for (int i = 0; i < nestingLevel; i++) { + values = createTestArrays(values); + objectInspector = getStandardListObjectInspector(objectInspector); + type = new ArrayType(type); + } + values = createTestArrays(values); + tester.testSingleLevelArraySchemaRoundTrip(objectInspector, values, values, type); + } + + @Test + public void testArrayOfStructs() + throws Exception + { + Iterable> structs = createNullableTestStructs(transform(intsBetween(0, 31_234), Object::toString), longsBetween(0, 31_234)); + Iterable>> values = createTestArrays(structs); + List structFieldNames = asList("stringField", "longField"); + Type structType = RowType.from(asList(field("stringField", VARCHAR), field("longField", BIGINT))); + tester.testRoundTrip( + getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, asList(javaStringObjectInspector, javaLongObjectInspector))), + values, values, new ArrayType(structType)); + } + + @Test + public void testCustomSchemaArrayOfStructs() + throws Exception + { + MessageType customSchemaArrayOfStructs = parseMessageType("message ParquetSchema { " + + " optional group self (LIST) { " + + " repeated group self_tuple { " + + " optional int64 a; " + + " optional boolean b; " + + " required binary c (UTF8); " + + " } " + + " } " + + "}"); + Iterable aValues = limit(cycle(asList(1L, null, 3L, 5L, null, null, null, 7L, 11L, null, 13L, 17L)), 30_000); + Iterable bValues = limit(cycle(asList(null, true, false, null, null, true, false)), 30_000); + Iterable cValues = transform(intsBetween(0, 31_234), Object::toString); + + Iterable> structs = createTestStructs(aValues, bValues, cValues); + Iterable>> values = createTestArrays(structs); + List structFieldNames = asList("a", "b", "c"); + Type structType = RowType.from(asList(field("a", BIGINT), field("b", BOOLEAN), field("c", VARCHAR))); + tester.testSingleLevelArrayRoundTrip( + getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, asList(javaLongObjectInspector, javaBooleanObjectInspector, javaStringObjectInspector))), + values, values, "self", new ArrayType(structType), Optional.of(customSchemaArrayOfStructs)); + } + + @Test + public void testSingleLevelSchemaArrayOfStructs() + throws Exception + { + Iterable aValues = limit(cycle(asList(1L, null, 3L, 5L, null, null, null, 7L, 11L, null, 13L, 17L)), 30_000); + Iterable bValues = limit(cycle(asList(null, true, false, null, null, true, false)), 30_000); + Iterable cValues = transform(intsBetween(0, 31_234), Object::toString); + + Iterable> structs = createTestStructs(aValues, bValues, cValues); + Iterable>> values = createTestArrays(structs); + List structFieldNames = asList("a", "b", "c"); + Type structType = RowType.from(asList(field("a", BIGINT), field("b", BOOLEAN), field("c", VARCHAR))); + ObjectInspector objectInspector = getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, asList(javaLongObjectInspector, javaBooleanObjectInspector, javaStringObjectInspector))); + tester.testSingleLevelArraySchemaRoundTrip(objectInspector, values, values, new 
ArrayType(structType)); + } + + @Test + public void testArrayOfArrayOfStructOfArray() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + Iterable> structs = createNullableTestStructs(stringArrayField, limit(cycle(asList(1, null, 3, 5, null, 7, 11, null, 17)), 31_234)); + List structFieldNames = asList("stringArrayField", "intField"); + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(VARCHAR)), field("intField", INTEGER))); + Iterable>> arrays = createNullableTestArrays(structs); + Iterable>>> values = createTestArrays(arrays); + tester.testRoundTrip( + getStandardListObjectInspector( + getStandardListObjectInspector( + getStandardStructObjectInspector( + structFieldNames, + asList(getStandardListObjectInspector(javaStringObjectInspector), javaIntObjectInspector)))), + values, values, new ArrayType(new ArrayType(structType))); + } + + @Test + public void testSingleLevelSchemaArrayOfArrayOfStructOfArray() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + Iterable> structs = createTestStructs(stringArrayField, limit(cycle(asList(1, null, 3, 5, null, 7, 11, null, 17)), 31_234)); + List structFieldNames = asList("stringArrayField", "intField"); + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(VARCHAR)), field("intField", INTEGER))); + Iterable>> arrays = createTestArrays(structs); + Iterable>>> values = createTestArrays(arrays); + tester.testSingleLevelArraySchemaRoundTrip( + getStandardListObjectInspector( + getStandardListObjectInspector( + getStandardStructObjectInspector( + structFieldNames, + asList(getStandardListObjectInspector(javaStringObjectInspector), javaIntObjectInspector)))), + values, values, new ArrayType(new ArrayType(structType))); + } + + @Test + public void testArrayOfStructOfArray() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + Iterable> structs = createNullableTestStructs(stringArrayField, limit(cycle(asList(1, 3, null, 5, 7, null, 11, 13, null, 17)), 31_234)); + List structFieldNames = asList("stringArrayField", "intField"); + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(VARCHAR)), field("intField", INTEGER))); + Iterable>> values = createTestArrays(structs); + tester.testRoundTrip( + getStandardListObjectInspector( + getStandardStructObjectInspector( + structFieldNames, + asList(getStandardListObjectInspector(javaStringObjectInspector), javaIntObjectInspector))), + values, values, new ArrayType(structType)); + } + + @Test + public void testSingleLevelSchemaArrayOfStructOfArray() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + Iterable> structs = createTestStructs(stringArrayField, limit(cycle(asList(1, 3, null, 5, 7, null, 11, 13, null, 17)), 31_234)); + List structFieldNames = asList("stringArrayField", "intField"); + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(VARCHAR)), field("intField", INTEGER))); + Iterable>> values = createTestArrays(structs); + tester.testSingleLevelArraySchemaRoundTrip( + getStandardListObjectInspector( + getStandardStructObjectInspector( + structFieldNames, + asList(getStandardListObjectInspector(javaStringObjectInspector), javaIntObjectInspector))), + values, values, new 
ArrayType(structType)); + } + + @Test + public void testMap() + throws Exception + { + Iterable> values = createTestMaps(transform(intsBetween(0, 100_000), Object::toString), longsBetween(0, 10_000)); + tester.testRoundTrip(getStandardMapObjectInspector(javaStringObjectInspector, javaLongObjectInspector), values, values, mapType(VARCHAR, BIGINT)); + } + + @Test + public void testNestedMaps() + throws Exception + { + int nestingLevel = ThreadLocalRandom.current().nextInt(1, 15); + Iterable keys = intsBetween(0, 3_210); + Iterable maps = limit(cycle(asList(null, "value2", "value3", null, null, "value6", "value7")), 3_210); + ObjectInspector objectInspector = getStandardMapObjectInspector(javaIntObjectInspector, javaStringObjectInspector); + Type type = mapType(INTEGER, VARCHAR); + for (int i = 0; i < nestingLevel; i++) { + maps = createNullableTestMaps(keys, maps); + objectInspector = getStandardMapObjectInspector(javaIntObjectInspector, objectInspector); + type = mapType(INTEGER, type); + } + maps = createTestMaps(keys, maps); + tester.testRoundTrip(objectInspector, maps, maps, type); + } + + @Test + public void testArrayOfMaps() + throws Exception + { + Iterable> maps = createNullableTestMaps(transform(intsBetween(0, 10), Object::toString), longsBetween(0, 10)); + List>> values = createTestArrays(maps); + tester.testRoundTrip(getStandardListObjectInspector(getStandardMapObjectInspector(javaStringObjectInspector, javaLongObjectInspector)), + values, values, new ArrayType(mapType(VARCHAR, BIGINT))); + } + + @Test + public void testSingleLevelSchemaArrayOfMaps() + throws Exception + { + Iterable> maps = createTestMaps(transform(intsBetween(0, 10), Object::toString), longsBetween(0, 10)); + List>> values = createTestArrays(maps); + ObjectInspector objectInspector = getStandardListObjectInspector(getStandardMapObjectInspector(javaStringObjectInspector, javaLongObjectInspector)); + tester.testSingleLevelArraySchemaRoundTrip(objectInspector, values, values, new ArrayType(mapType(VARCHAR, BIGINT))); + } + + @Test + public void testArrayOfMapOfStruct() + throws Exception + { + Iterable keys = intsBetween(0, 10_000); + Iterable> structs = createNullableTestStructs(transform(intsBetween(0, 10_000), Object::toString), longsBetween(0, 10_000)); + List structFieldNames = asList("stringField", "longField"); + Type structType = RowType.from(asList(field("stringField", VARCHAR), field("longField", BIGINT))); + Iterable>> maps = createNullableTestMaps(keys, structs); + List>>> values = createTestArrays(maps); + tester.testRoundTrip(getStandardListObjectInspector( + getStandardMapObjectInspector( + javaIntObjectInspector, + getStandardStructObjectInspector(structFieldNames, asList(javaStringObjectInspector, javaLongObjectInspector)))), + values, values, new ArrayType(mapType(INTEGER, structType))); + } + + @Test + public void testSingleLevelArrayOfMapOfStruct() + throws Exception + { + Iterable keys = intsBetween(0, 10_000); + Iterable> structs = createNullableTestStructs(transform(intsBetween(0, 10_000), Object::toString), longsBetween(0, 10_000)); + List structFieldNames = asList("stringField", "longField"); + Type structType = RowType.from(asList(field("stringField", VARCHAR), field("longField", BIGINT))); + Iterable>> maps = createTestMaps(keys, structs); + List>>> values = createTestArrays(maps); + tester.testSingleLevelArraySchemaRoundTrip(getStandardListObjectInspector( + getStandardMapObjectInspector( + javaIntObjectInspector, + getStandardStructObjectInspector(structFieldNames, 
asList(javaStringObjectInspector, javaLongObjectInspector)))), + values, values, new ArrayType(mapType(INTEGER, structType))); + } + + @Test + public void testSingleLevelArrayOfStructOfSingleElement() + throws Exception + { + Iterable> structs = createTestStructs(transform(intsBetween(0, 31_234), Object::toString)); + Iterable>> values = createTestArrays(structs); + List structFieldNames = singletonList("test"); + Type structType = RowType.from(singletonList(field("test", VARCHAR))); + tester.testRoundTrip( + getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, singletonList(javaStringObjectInspector))), + values, values, new ArrayType(structType)); + tester.testSingleLevelArraySchemaRoundTrip( + getStandardListObjectInspector(getStandardStructObjectInspector(structFieldNames, singletonList(javaStringObjectInspector))), + values, values, new ArrayType(structType)); + } + + @Test + public void testSingleLevelArrayOfStructOfStructOfSingleElement() + throws Exception + { + Iterable> structs = createTestStructs(transform(intsBetween(0, 31_234), Object::toString)); + Iterable> structsOfStructs = createTestStructs(structs); + Iterable>> values = createTestArrays(structsOfStructs); + List structFieldNames = singletonList("test"); + List structsOfStructsFieldNames = singletonList("test"); + Type structType = RowType.from(singletonList(field("test", VARCHAR))); + Type structsOfStructsType = RowType.from(singletonList(field("test", structType))); + ObjectInspector structObjectInspector = getStandardStructObjectInspector(structFieldNames, singletonList(javaStringObjectInspector)); + tester.testRoundTrip( + getStandardListObjectInspector( + getStandardStructObjectInspector(structsOfStructsFieldNames, singletonList(structObjectInspector))), + values, values, new ArrayType(structsOfStructsType)); + tester.testSingleLevelArraySchemaRoundTrip( + getStandardListObjectInspector( + getStandardStructObjectInspector(structsOfStructsFieldNames, singletonList(structObjectInspector))), + values, values, new ArrayType(structsOfStructsType)); + } + + @Test + public void testArrayOfMapOfArray() + throws Exception + { + Iterable> arrays = createNullableTestArrays(limit(cycle(asList(1, null, 3, 5, null, null, null, 7, 11, null, 13, 17)), 10_000)); + Iterable keys = transform(intsBetween(0, 10_000), Object::toString); + Iterable>> maps = createNullableTestMaps(keys, arrays); + List>>> values = createTestArrays(maps); + tester.testRoundTrip(getStandardListObjectInspector( + getStandardMapObjectInspector( + javaStringObjectInspector, + getStandardListObjectInspector(javaIntObjectInspector))), + values, values, new ArrayType(mapType(VARCHAR, new ArrayType(INTEGER)))); + } + + @Test + public void testSingleLevelArrayOfMapOfArray() + throws Exception + { + Iterable> arrays = createNullableTestArrays(intsBetween(0, 10_000)); + Iterable keys = transform(intsBetween(0, 10_000), Object::toString); + Iterable>> maps = createTestMaps(keys, arrays); + List>>> values = createTestArrays(maps); + tester.testSingleLevelArraySchemaRoundTrip(getStandardListObjectInspector( + getStandardMapObjectInspector( + javaStringObjectInspector, + getStandardListObjectInspector(javaIntObjectInspector))), + values, values, new ArrayType(mapType(VARCHAR, new ArrayType(INTEGER)))); + } + + @Test + public void testMapOfArray() + throws Exception + { + Iterable> arrays = createNullableTestArrays(limit(cycle(asList(1, null, 3, 5, null, null, null, 7, 11, null, 13, 17)), 30_000)); + Iterable keys = intsBetween(0, 30_000); + 
Iterable>> values = createTestMaps(keys, arrays); + tester.testRoundTrip(getStandardMapObjectInspector( + javaIntObjectInspector, + getStandardListObjectInspector(javaIntObjectInspector)), + values, values, mapType(INTEGER, new ArrayType(INTEGER))); + } + + @Test + public void testMapOfSingleLevelArray() + throws Exception + { + Iterable> arrays = createNullableTestArrays(intsBetween(0, 30_000)); + Iterable keys = intsBetween(0, 30_000); + Iterable>> values = createTestMaps(keys, arrays); + tester.testSingleLevelArraySchemaRoundTrip(getStandardMapObjectInspector( + javaIntObjectInspector, + getStandardListObjectInspector(javaIntObjectInspector)), + values, values, mapType(INTEGER, new ArrayType(INTEGER))); + } + + @Test + public void testMapOfStruct() + throws Exception + { + Iterable keys = longsBetween(0, 30_000); + Iterable> structs = createNullableTestStructs(transform(intsBetween(0, 30_000), Object::toString), longsBetween(0, 30_000)); + List structFieldNames = asList("stringField", "longField"); + Type structType = RowType.from(asList(field("stringField", VARCHAR), field("longField", BIGINT))); + Iterable>> values = createTestMaps(keys, structs); + tester.testRoundTrip(getStandardMapObjectInspector( + javaLongObjectInspector, + getStandardStructObjectInspector(structFieldNames, asList(javaStringObjectInspector, javaLongObjectInspector))), + values, values, mapType(BIGINT, structType)); + } + + @Test + public void testMapWithNullValues() + throws Exception + { + Iterable mapKeys = intsBetween(0, 31_234); + Iterable mapValues = limit(cycle(asList(null, "value2", "value3", null, null, "value6", "value7")), 31_234); + Iterable> values = createTestMaps(mapKeys, mapValues); + tester.testRoundTrip(getStandardMapObjectInspector(javaIntObjectInspector, javaStringObjectInspector), values, values, mapType(INTEGER, VARCHAR)); + } + + @Test + public void testStruct() + throws Exception + { + List> values = createTestStructs(transform(intsBetween(0, 31_234), Object::toString), longsBetween(0, 31_234)); + List structFieldNames = asList("stringField", "longField"); + Type structType = RowType.from(asList(field("stringField", VARCHAR), field("longField", BIGINT))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, asList(javaStringObjectInspector, javaLongObjectInspector)), values, values, structType); + } + + @Test + public void testNestedStructs() + throws Exception + { + int nestingLevel = ThreadLocalRandom.current().nextInt(1, 15); + Optional> structFieldNames = Optional.of(singletonList("structField")); + Iterable values = limit(cycle(asList(1, null, 3, null, 5, null, 7, null, null, null, 11, null, 13)), 3_210); + ObjectInspector objectInspector = getStandardStructObjectInspector(structFieldNames.get(), singletonList(javaIntObjectInspector)); + Type type = RowType.from(singletonList(field("structField", INTEGER))); + for (int i = 0; i < nestingLevel; i++) { + values = createNullableTestStructs(values); + objectInspector = getStandardStructObjectInspector(structFieldNames.get(), singletonList(objectInspector)); + type = RowType.from(singletonList(field("structField", type))); + } + values = createTestStructs(values); + tester.testRoundTrip(objectInspector, values, values, type); + } + + @Test + public void testComplexNestedStructs() + throws Exception + { + final int n = 30; + Iterable mapKeys = intsBetween(0, n); + Iterable intPrimitives = limit(cycle(asList(1, null, 3, null, 5, null, 7, null, null, null, 11, null, 13)), n); + Iterable stringPrimitives = 
limit(cycle(asList(null, "value2", "value3", null, null, "value6", "value7")), n); + Iterable doublePrimitives = limit(cycle(asList(1.1, null, 3.3, null, 5.5, null, 7.7, null, null, null, 11.11, null, 13.13)), n); + Iterable booleanPrimitives = limit(cycle(asList(null, true, false, null, null, true, false)), n); + Iterable mapStringKeys = Stream.generate(() -> UUID.randomUUID().toString()).limit(n).collect(Collectors.toList()); + Iterable> mapsIntString = createNullableTestMaps(mapKeys, stringPrimitives); + Iterable> arraysString = createNullableTestArrays(stringPrimitives); + Iterable> mapsIntDouble = createNullableTestMaps(mapKeys, doublePrimitives); + Iterable> arraysBoolean = createNullableTestArrays(booleanPrimitives); + Iterable> mapsStringString = createNullableTestMaps(mapStringKeys, stringPrimitives); + + List struct1FieldNames = asList("mapIntStringField", "stringArrayField", "intField"); + Iterable structs1 = createNullableTestStructs(mapsIntString, arraysString, intPrimitives); + ObjectInspector struct1ObjectInspector = getStandardStructObjectInspector(struct1FieldNames, + asList( + getStandardMapObjectInspector(javaIntObjectInspector, javaStringObjectInspector), + getStandardListObjectInspector(javaStringObjectInspector), + javaIntObjectInspector)); + Type struct1Type = RowType.from(asList( + field("mapIntStringField", mapType(INTEGER, VARCHAR)), + field("stringArrayField", new ArrayType(VARCHAR)), + field("intField", INTEGER))); + + List struct2FieldNames = asList("mapIntStringField", "stringArrayField", "structField"); + Iterable structs2 = createNullableTestStructs(mapsIntString, arraysString, structs1); + ObjectInspector struct2ObjectInspector = getStandardStructObjectInspector(struct2FieldNames, + asList( + getStandardMapObjectInspector(javaIntObjectInspector, javaStringObjectInspector), + getStandardListObjectInspector(javaStringObjectInspector), + struct1ObjectInspector)); + Type struct2Type = RowType.from(asList( + field("mapIntStringField", mapType(INTEGER, VARCHAR)), + field("stringArrayField", new ArrayType(VARCHAR)), + field("structField", struct1Type))); + + List struct3FieldNames = asList("mapIntDoubleField", "booleanArrayField", "booleanField"); + Iterable structs3 = createNullableTestStructs(mapsIntDouble, arraysBoolean, booleanPrimitives); + ObjectInspector struct3ObjectInspector = getStandardStructObjectInspector(struct3FieldNames, + asList( + getStandardMapObjectInspector(javaIntObjectInspector, javaDoubleObjectInspector), + getStandardListObjectInspector(javaBooleanObjectInspector), + javaBooleanObjectInspector)); + Type struct3Type = RowType.from(asList( + field("mapIntDoubleField", mapType(INTEGER, DOUBLE)), + field("booleanArrayField", new ArrayType(BOOLEAN)), + field("booleanField", BOOLEAN))); + + List struct4FieldNames = asList("mapIntDoubleField", "booleanArrayField", "structField"); + Iterable structs4 = createNullableTestStructs(mapsIntDouble, arraysBoolean, structs3); + ObjectInspector struct4ObjectInspector = getStandardStructObjectInspector(struct4FieldNames, + asList( + getStandardMapObjectInspector(javaIntObjectInspector, javaDoubleObjectInspector), + getStandardListObjectInspector(javaBooleanObjectInspector), + struct3ObjectInspector)); + Type struct4Type = RowType.from(asList( + field("mapIntDoubleField", mapType(INTEGER, DOUBLE)), + field("booleanArrayField", new ArrayType(BOOLEAN)), + field("structField", struct3Type))); + + List structFieldNames = asList("structField1", "structField2", "structField3", "structField4", "mapIntDoubleField", 
"booleanArrayField", "mapStringStringField"); + List objectInspectors = + asList( + struct1ObjectInspector, + struct2ObjectInspector, + struct3ObjectInspector, + struct4ObjectInspector, + getStandardMapObjectInspector(javaIntObjectInspector, javaDoubleObjectInspector), + getStandardListObjectInspector(javaBooleanObjectInspector), + getStandardMapObjectInspector(javaStringObjectInspector, javaStringObjectInspector)); + List types = ImmutableList.of(struct1Type, struct2Type, struct3Type, struct4Type, mapType(INTEGER, DOUBLE), new ArrayType(BOOLEAN), mapType(VARCHAR, VARCHAR)); + + Iterable[] values = new Iterable[] {structs1, structs2, structs3, structs4, mapsIntDouble, arraysBoolean, mapsStringString}; + tester.assertRoundTrip(objectInspectors, values, values, structFieldNames, types, Optional.empty()); + } + + @Test + public void testStructOfMaps() + throws Exception + { + Iterable mapKeys = Stream.generate(() -> ThreadLocalRandom.current().nextInt(10_000)).limit(10_000).collect(Collectors.toList()); + Iterable intPrimitives = limit(cycle(asList(1, null, 3, null, 5, null, 7, null, null, null, 11, null, 13)), 10_000); + Iterable stringPrimitives = limit(cycle(asList(null, "value2", "value3", null, null, "value6", "value7")), 10_000); + Iterable> maps = createNullableTestMaps(mapKeys, stringPrimitives); + Iterable> stringArrayField = createNullableTestArrays(stringPrimitives); + List> values = createTestStructs(maps, stringArrayField, intPrimitives); + List structFieldNames = asList("mapIntStringField", "stringArrayField", "intField"); + + Type structType = RowType.from(asList(field("mapIntStringField", mapType(INTEGER, VARCHAR)), field("stringArrayField", new ArrayType(VARCHAR)), field("intField", INTEGER))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList( + getStandardMapObjectInspector(javaIntObjectInspector, javaStringObjectInspector), + getStandardListObjectInspector(javaStringObjectInspector), + javaIntObjectInspector)), + values, values, structType); + } + + @Test + public void testStructOfNullableMapBetweenNonNullFields() + throws Exception + { + Iterable intPrimitives = intsBetween(0, 10_000); + Iterable stringPrimitives = limit(cycle(asList(null, "value2", "value3", null, null, "value6", "value7")), 10_000); + Iterable> maps = createNullableTestMaps(intPrimitives, stringPrimitives); + List> values = createTestStructs(intPrimitives, maps, intPrimitives); + List structFieldNames = asList("intField1", "mapIntStringField", "intField2"); + + Type structType = RowType.from(asList(field("intField1", INTEGER), field("mapIntStringField", mapType(INTEGER, VARCHAR)), field("intField2", INTEGER))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList( + javaIntObjectInspector, + getStandardMapObjectInspector(javaIntObjectInspector, javaStringObjectInspector), + javaIntObjectInspector)), + values, values, structType); + } + + @Test + public void testStructOfNullableArrayBetweenNonNullFields() + throws Exception + { + Iterable intPrimitives = intsBetween(0, 10_000); + Iterable stringPrimitives = limit(cycle(asList(null, "value2", "value3", null, null, "value6", "value7")), 10_000); + Iterable> stringArrayField = createNullableTestArrays(stringPrimitives); + List> values = createTestStructs(intPrimitives, stringArrayField, intPrimitives); + List structFieldNames = asList("intField1", "arrayStringField", "intField2"); + + Type structType = RowType.from(asList(field("intField1", INTEGER), field("arrayStringField", new 
ArrayType(VARCHAR)), field("intField2", INTEGER))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList( + javaIntObjectInspector, + getStandardListObjectInspector(javaStringObjectInspector), + javaIntObjectInspector)), + values, values, structType); + } + + @Test + public void testStructOfArrayAndPrimitive() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + List> values = createTestStructs(stringArrayField, limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 31_234)); + List structFieldNames = asList("stringArrayField", "intField"); + + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(VARCHAR)), field("intField", INTEGER))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList(getStandardListObjectInspector(javaStringObjectInspector), javaIntObjectInspector)), values, values, structType); + } + + @Test + public void testStructOfSingleLevelArrayAndPrimitive() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + List> values = createTestStructs(stringArrayField, limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 31_234)); + List structFieldNames = asList("stringArrayField", "intField"); + + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(VARCHAR)), field("intField", INTEGER))); + tester.testSingleLevelArraySchemaRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList(getStandardListObjectInspector(javaStringObjectInspector), javaIntObjectInspector)), values, values, structType); + } + + @Test + public void testStructOfPrimitiveAndArray() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + Iterable intField = limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 31_234); + List> values = createTestStructs(intField, stringArrayField); + List structFieldNames = asList("intField", "stringArrayField"); + + Type structType = RowType.from(asList(field("intField", INTEGER), field("stringArrayField", new ArrayType(VARCHAR)))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList(javaIntObjectInspector, getStandardListObjectInspector(javaStringObjectInspector))), values, values, structType); + } + + @Test + public void testStructOfPrimitiveAndSingleLevelArray() + throws Exception + { + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString)); + Iterable intField = limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 31_234); + List> values = createTestStructs(intField, stringArrayField); + List structFieldNames = asList("intField", "stringArrayField"); + + Type structType = RowType.from(asList(field("intField", INTEGER), field("stringArrayField", new ArrayType(VARCHAR)))); + tester.testSingleLevelArraySchemaRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList(javaIntObjectInspector, getStandardListObjectInspector(javaStringObjectInspector))), values, values, structType); + } + + @Test + public void testStructOfTwoArrays() + throws Exception + { + Iterable> intArrayField = createNullableTestArrays(limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 30_000)); + Iterable> stringArrayField = createNullableTestArrays(transform(intsBetween(0, 30_000), Object::toString)); + List> values = 
createTestStructs(stringArrayField, intArrayField); + List structFieldNames = asList("stringArrayField", "intArrayField"); + + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(VARCHAR)), field("intArrayField", new ArrayType(INTEGER)))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList(getStandardListObjectInspector(javaStringObjectInspector), getStandardListObjectInspector(javaIntObjectInspector))), values, values, structType); + } + + @Test + public void testStructOfTwoNestedArrays() + throws Exception + { + Iterable>> intArrayField = createNullableTestArrays(createNullableTestArrays(limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 30_000))); + Iterable>> stringArrayField = createNullableTestArrays(createNullableTestArrays(transform(intsBetween(0, 31_234), Object::toString))); + List> values = createTestStructs(stringArrayField, intArrayField); + List structFieldNames = asList("stringArrayField", "intArrayField"); + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(new ArrayType(VARCHAR))), field("intArrayField", new ArrayType(new ArrayType(INTEGER))))); + tester.testRoundTrip(getStandardStructObjectInspector(structFieldNames, + asList( + getStandardListObjectInspector(getStandardListObjectInspector(javaStringObjectInspector)), + getStandardListObjectInspector(getStandardListObjectInspector(javaIntObjectInspector)))), + values, values, structType); + } + + @Test + public void testStructOfTwoNestedSingleLevelSchemaArrays() + throws Exception + { + Iterable>> intArrayField = createNullableTestArrays(createTestArrays(limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 30_000))); + Iterable>> stringArrayField = createNullableTestArrays(createTestArrays(transform(intsBetween(0, 31_234), Object::toString))); + List> values = createTestStructs(stringArrayField, intArrayField); + List structFieldNames = asList("stringArrayField", "intArrayField"); + + Type structType = RowType.from(asList(field("stringArrayField", new ArrayType(new ArrayType(VARCHAR))), field("intArrayField", new ArrayType(new ArrayType(INTEGER))))); + ObjectInspector objectInspector = getStandardStructObjectInspector(structFieldNames, + asList( + getStandardListObjectInspector(getStandardListObjectInspector(javaStringObjectInspector)), + getStandardListObjectInspector(getStandardListObjectInspector(javaIntObjectInspector)))); + tester.testSingleLevelArraySchemaRoundTrip(objectInspector, values, values, structType); + } + + @Test + public void testBooleanSequence() + throws Exception + { + tester.testRoundTrip(javaBooleanObjectInspector, limit(cycle(ImmutableList.of(true, false, false)), 30_000), BOOLEAN); + } + + @Test + public void testLongSequence() + throws Exception + { + testRoundTripNumeric(intsBetween(0, 31_234)); + } + + @Test + public void testLongSequenceWithHoles() + throws Exception + { + testRoundTripNumeric(skipEvery(5, intsBetween(0, 31_234))); + } + + @Test + public void testLongDirect() + throws Exception + { + testRoundTripNumeric(limit(cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17)), 30_000)); + } + + @Test + public void testLongDirect2() + throws Exception + { + List values = new ArrayList<>(31_234); + for (int i = 0; i < 31_234; i++) { + values.add(i); + } + Collections.shuffle(values, new Random(0)); + testRoundTripNumeric(values); + } + + @Test + public void testLongShortRepeat() + throws Exception + { + testRoundTripNumeric(limit(repeatEach(4, cycle(ImmutableList.of(1, 3, 5, 7, 11, 13, 17))), 30_000)); + } 
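+
+    // The numeric cases above and below delegate to testRoundTripNumeric (defined further down in
+    // this class), which writes the same values as byte, short, int, long, timestamp, and date
+    // columns and verifies each variant against the reader.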
+
+    @Test
+    public void testLongPatchedBase()
+            throws Exception
+    {
+        testRoundTripNumeric(limit(cycle(concat(intsBetween(0, 18), ImmutableList.of(30_000, 20_000))), 30_000));
+    }
+
+    // copied from Parquet code to determine the max decimal precision supported by INT32/INT64
+    // (e.g. maxPrecision(4) = 9 and maxPrecision(8) = 18, the Parquet precision limits for INT32- and INT64-backed decimals)
+    private static long maxPrecision(int numBytes)
+    {
+        return Math.round(Math.floor(Math.log10(Math.pow(2, 8 * numBytes - 1) - 1)));
+    }
+
+    @Test
+    public void testDecimalBackedByINT32()
+            throws Exception
+    {
+        for (int precision = 1; precision <= MAX_PRECISION_INT32; precision++) {
+            int scale = ThreadLocalRandom.current().nextInt(precision);
+            MessageType parquetSchema = parseMessageType(format("message hive_decimal { optional INT32 test (DECIMAL(%d, %d)); }", precision, scale));
+            ContiguousSet<Integer> intValues = intsBetween(1, 1_000);
+            ImmutableList.Builder<SqlDecimal> expectedValues = new ImmutableList.Builder<>();
+            for (Integer value : intValues) {
+                expectedValues.add(SqlDecimal.of(value, precision, scale));
+            }
+            tester.testRoundTrip(javaIntObjectInspector, intValues, expectedValues.build(), createDecimalType(precision, scale), Optional.of(parquetSchema));
+        }
+    }
+
+    @Test
+    public void testDecimalBackedByINT64()
+            throws Exception
+    {
+        for (int precision = MAX_PRECISION_INT32 + 1; precision <= MAX_PRECISION_INT64; precision++) {
+            int scale = ThreadLocalRandom.current().nextInt(precision);
+            MessageType parquetSchema = parseMessageType(format("message hive_decimal { optional INT64 test (DECIMAL(%d, %d)); }", precision, scale));
+            ContiguousSet<Long> longValues = longsBetween(1, 1_000);
+            ImmutableList.Builder<SqlDecimal> expectedValues = new ImmutableList.Builder<>();
+            for (Long value : longValues) {
+                expectedValues.add(SqlDecimal.of(value, precision, scale));
+            }
+            tester.testRoundTrip(javaLongObjectInspector, longValues, expectedValues.build(), createDecimalType(precision, scale), Optional.of(parquetSchema));
+        }
+    }
+
+    @Test
+    public void testDecimalBackedByFixedLenByteArray()
+            throws Exception
+    {
+        for (int precision = MAX_PRECISION_INT64 + 1; precision < MAX_PRECISION; precision++) {
+            int scale = ThreadLocalRandom.current().nextInt(precision);
+            ContiguousSet<BigInteger> values = bigIntegersBetween(BigDecimal.valueOf(Math.pow(10, precision - 1)).toBigInteger(), BigDecimal.valueOf(Math.pow(10, precision)).toBigInteger());
+            ImmutableList.Builder<SqlDecimal> expectedValues = new ImmutableList.Builder<>();
+            ImmutableList.Builder<HiveDecimal> writeValues = new ImmutableList.Builder<>();
+            for (BigInteger value : limit(values, 1_000)) {
+                writeValues.add(HiveDecimal.create(value, scale));
+                expectedValues.add(new SqlDecimal(value, precision, scale));
+            }
+            tester.testRoundTrip(new JavaHiveDecimalObjectInspector(new DecimalTypeInfo(precision, scale)),
+                    writeValues.build(),
+                    expectedValues.build(),
+                    createDecimalType(precision, scale));
+        }
+    }
+
+    @Test
+    public void testSchemaWithRepeatedOptionalRequiredFields()
+            throws Exception
+    {
+        MessageType parquetSchema = parseMessageType("message hive_schema {" +
+                "  optional group address_book {" +
+                "    required binary owner (UTF8);" +
+                "    optional group owner_phone_numbers (LIST) {" +
+                "      repeated group bag {" +
+                "        optional binary array_element (UTF8);" +
+                "      }" +
+                "    }" +
+                "    optional group contacts (LIST) {" +
+                "      repeated group bag {" +
+                "        optional group array_element {" +
+                "          required binary name (UTF8);" +
+                "          optional binary phone_number (UTF8);" +
+                "        }" +
+                "      }" +
+                "    }" +
+                "  }" +
+                "} ");
+
+        Iterable<String> owner = limit(cycle(asList("owner1", "owner2", "owner3")), 50_000);
+        Iterable<List<String>> ownerPhoneNumbers =
limit(cycle(asList(null, asList("phoneNumber2", "phoneNumber3", null), asList(null, "phoneNumber6", "phoneNumber7"))), 50_000); + Iterable name = asList("name1", "name2", "name3", "name4", "name5", "name6", "name7"); + Iterable phoneNumber = asList(null, "phoneNumber2", "phoneNumber3", null, null, "phoneNumber6", "phoneNumber7"); + Iterable> contact = createNullableTestStructs(name, phoneNumber); + Iterable>> contacts = createNullableTestArrays(limit(cycle(contact), 50_000)); + List> values = createTestStructs(owner, ownerPhoneNumbers, contacts); + List addressBookFieldNames = asList("owner", "owner_phone_numbers", "contacts"); + List contactsFieldNames = asList("name", "phone_number"); + Type contactsType = new ArrayType(RowType.from(asList(field("name", VARCHAR), field("phone_number", VARCHAR)))); + Type addressBookType = RowType.from(asList(field("owner", VARCHAR), field("owner_phone_numbers", new ArrayType(VARCHAR)), field("contacts", contactsType))); + tester.testRoundTrip(getStandardStructObjectInspector(addressBookFieldNames, + asList( + javaStringObjectInspector, + getStandardListObjectInspector(javaStringObjectInspector), + getStandardListObjectInspector( + getStandardStructObjectInspector(contactsFieldNames, asList(javaStringObjectInspector, javaStringObjectInspector))))), + values, values, "address_book", addressBookType, Optional.of(parquetSchema)); + } + + @Test + public void testSchemaWithOptionalOptionalRequiredFields() + throws Exception + { + MessageType parquetSchema = parseMessageType("message hive_schema {" + + " optional group a {" + + " optional group b {" + + " optional group c {" + + " required binary d (UTF8);" + + " }" + + " }" + + " }" + + "} "); + Type cType = RowType.from(singletonList(field("d", VARCHAR))); + Type bType = RowType.from(singletonList(field("c", cType))); + Type aType = RowType.from(singletonList(field("b", bType))); + Iterable dValues = asList("d1", "d2", "d3", "d4", "d5", "d6", "d7"); + Iterable> cValues = createNullableTestStructs(dValues); + Iterable> bValues = createNullableTestStructs(cValues); + List> aValues = createTestStructs(bValues); + ObjectInspector cInspector = getStandardStructObjectInspector(singletonList("d"), singletonList(javaStringObjectInspector)); + ObjectInspector bInspector = getStandardStructObjectInspector(singletonList("c"), singletonList(cInspector)); + ObjectInspector aInspector = getStandardStructObjectInspector(singletonList("b"), singletonList(bInspector)); + tester.testRoundTrip(aInspector, aValues, aValues, "a", aType, Optional.of(parquetSchema)); + } + + @Test + public void testSchemaWithOptionalRequiredOptionalFields() + throws Exception + { + MessageType parquetSchema = parseMessageType("message hive_schema {" + + " optional group a {" + + " optional group b {" + + " required group c {" + + " optional int32 d;" + + " }" + + " }" + + " }" + + "} "); + Type cType = RowType.from(singletonList(field("d", INTEGER))); + Type bType = RowType.from(singletonList(field("c", cType))); + Type aType = RowType.from(singletonList(field("b", bType))); + Iterable dValues = asList(111, null, 333, 444, null, 666, 777); + List> cValues = createTestStructs(dValues); + Iterable> bValues = createNullableTestStructs(cValues); + List> aValues = createTestStructs(bValues); + ObjectInspector cInspector = getStandardStructObjectInspector(singletonList("d"), singletonList(javaIntObjectInspector)); + ObjectInspector bInspector = getStandardStructObjectInspector(singletonList("c"), singletonList(cInspector)); + ObjectInspector aInspector = 
getStandardStructObjectInspector(singletonList("b"), singletonList(bInspector)); + tester.testRoundTrip(aInspector, aValues, aValues, "a", aType, Optional.of(parquetSchema)); + } + + @Test + public void testSchemaWithRequiredRequiredOptionalFields() + throws Exception + { + MessageType parquetSchema = parseMessageType("message hive_schema {" + + " optional group a {" + + " required group b {" + + " required group c {" + + " optional int32 d;" + + " }" + + " }" + + " }" + + "} "); + Type cType = RowType.from(singletonList(field("d", INTEGER))); + Type bType = RowType.from(singletonList(field("c", cType))); + Type aType = RowType.from(singletonList(field("b", bType))); + Iterable dValues = asList(111, null, 333, 444, null, 666, 777); + List> cValues = createTestStructs(dValues); + List> bValues = createTestStructs(cValues); + List> aValues = createTestStructs(bValues); + ObjectInspector cInspector = getStandardStructObjectInspector(singletonList("d"), singletonList(javaIntObjectInspector)); + ObjectInspector bInspector = getStandardStructObjectInspector(singletonList("c"), singletonList(cInspector)); + ObjectInspector aInspector = getStandardStructObjectInspector(singletonList("b"), singletonList(bInspector)); + tester.testRoundTrip(aInspector, aValues, aValues, "a", aType, Optional.of(parquetSchema)); + } + + @Test + public void testSchemaWithRequiredOptionalOptionalFields() + throws Exception + { + MessageType parquetSchema = parseMessageType("message hive_schema {" + + " optional group a {" + + " required group b {" + + " optional group c {" + + " optional int32 d;" + + " }" + + " }" + + " }" + + "} "); + Type cType = RowType.from(singletonList(field("d", INTEGER))); + Type bType = RowType.from(singletonList(field("c", cType))); + Type aType = RowType.from(singletonList(field("b", bType))); + Iterable dValues = asList(111, null, 333, 444, null, 666, 777); + Iterable> cValues = createNullableTestStructs(dValues); + List> bValues = createTestStructs(cValues); + List> aValues = createTestStructs(bValues); + ObjectInspector cInspector = getStandardStructObjectInspector(singletonList("d"), singletonList(javaIntObjectInspector)); + ObjectInspector bInspector = getStandardStructObjectInspector(singletonList("c"), singletonList(cInspector)); + ObjectInspector aInspector = getStandardStructObjectInspector(singletonList("b"), singletonList(bInspector)); + tester.testRoundTrip(aInspector, aValues, aValues, "a", aType, Optional.of(parquetSchema)); + } + + @Test + public void testSchemaWithRequiredOptionalRequiredFields() + throws Exception + { + MessageType parquetSchema = parseMessageType("message hive_schema {" + + " optional group a {" + + " required group b {" + + " optional group c {" + + " required binary d (UTF8);" + + " }" + + " }" + + " }" + + "} "); + Type cType = RowType.from(singletonList(field("d", VARCHAR))); + Type bType = RowType.from(singletonList(field("c", cType))); + Type aType = RowType.from(singletonList(field("b", bType))); + Iterable dValues = asList("d1", "d2", "d3", "d4", "d5", "d6", "d7"); + Iterable> cValues = createNullableTestStructs(dValues); + List> bValues = createTestStructs(cValues); + List> aValues = createTestStructs(bValues); + ObjectInspector cInspector = getStandardStructObjectInspector(singletonList("d"), singletonList(javaStringObjectInspector)); + ObjectInspector bInspector = getStandardStructObjectInspector(singletonList("c"), singletonList(cInspector)); + ObjectInspector aInspector = getStandardStructObjectInspector(singletonList("b"), 
singletonList(bInspector)); + tester.testRoundTrip(aInspector, aValues, aValues, "a", aType, Optional.of(parquetSchema)); + } + + @Test + public void testSchemaWithRequiredStruct() + throws Exception + { + MessageType parquetSchema = parseMessageType("message hive_schema {" + + " required group a {" + + " required group b {" + + " required binary c (UTF8);" + + " required int32 d;" + + " }" + + " required binary e (UTF8);" + + " }" + + "} "); + Type bType = RowType.from(asList(field("c", VARCHAR), field("d", INTEGER))); + Type aType = RowType.from(asList(field("b", bType), field("e", VARCHAR))); + Iterable cValues = limit(cycle(asList("c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7")), 30000); + Iterable dValues = intsBetween(0, 30000); + Iterable eValues = limit(cycle(asList("e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7")), 30000); + List> bValues = createTestStructs(cValues, dValues); + List> aValues = createTestStructs(bValues, eValues); + ObjectInspector bInspector = getStandardStructObjectInspector(asList("c", "d"), asList(javaStringObjectInspector, javaIntObjectInspector)); + ObjectInspector aInspector = getStandardStructObjectInspector(asList("b", "e"), asList(bInspector, javaStringObjectInspector)); + tester.assertRoundTrip(singletonList(aInspector), new Iterable[] {aValues}, new Iterable[] { + aValues}, singletonList("a"), singletonList(aType), Optional.of(parquetSchema)); + } + + @Test + public void testSchemaWithRequiredOptionalRequired2Fields() + throws Exception + { + MessageType parquetSchema = parseMessageType("message hive_schema {" + + " optional group a {" + + " required group b {" + + " optional group c {" + + " required binary d (UTF8);" + + " }" + + " }" + + " }" + + " optional group e {" + + " required group f {" + + " optional group g {" + + " required binary h (UTF8);" + + " }" + + " }" + + " }" + + "} "); + + Type cType = RowType.from(singletonList(field("d", VARCHAR))); + Type bType = RowType.from(singletonList(field("c", cType))); + Type aType = RowType.from(singletonList(field("b", bType))); + Iterable dValues = asList("d1", "d2", "d3", "d4", "d5", "d6", "d7"); + Iterable> cValues = createNullableTestStructs(dValues); + List> bValues = createTestStructs(cValues); + List> aValues = createTestStructs(bValues); + + Type gType = RowType.from(singletonList(field("h", VARCHAR))); + Type fType = RowType.from(singletonList(field("g", gType))); + Type eType = RowType.from(singletonList(field("f", fType))); + Iterable hValues = asList("h1", "h2", "h3", "h4", "h5", "h6", "h7"); + Iterable> gValues = createNullableTestStructs(hValues); + List> fValues = createTestStructs(gValues); + List> eValues = createTestStructs(fValues); + + ObjectInspector cInspector = getStandardStructObjectInspector(singletonList("d"), singletonList(javaStringObjectInspector)); + ObjectInspector bInspector = getStandardStructObjectInspector(singletonList("c"), singletonList(cInspector)); + ObjectInspector aInspector = getStandardStructObjectInspector(singletonList("b"), singletonList(bInspector)); + ObjectInspector gInspector = getStandardStructObjectInspector(singletonList("h"), singletonList(javaStringObjectInspector)); + ObjectInspector fInspector = getStandardStructObjectInspector(singletonList("g"), singletonList(gInspector)); + ObjectInspector eInspector = getStandardStructObjectInspector(singletonList("f"), singletonList(fInspector)); + tester.testRoundTrip(asList(aInspector, eInspector), + new Iterable[] {aValues, eValues}, new Iterable[] {aValues, eValues}, + asList("a", "e"), asList(aType, 
eType), Optional.of(parquetSchema), false); + } + + @Test + public void testOldAvroArray() + throws Exception + { + MessageType parquetMrAvroSchema = parseMessageType("message avro_schema_old {" + + " optional group my_list (LIST){" + + " repeated int32 array;" + + " }" + + "} "); + Iterable> nonNullArrayElements = createTestArrays(intsBetween(0, 31_234)); + tester.testSingleLevelArrayRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), nonNullArrayElements, nonNullArrayElements, "my_list", new ArrayType(INTEGER), Optional.of(parquetMrAvroSchema)); + } + + @Test + public void testNewAvroArray() + throws Exception + { + MessageType parquetMrAvroSchema = parseMessageType("message avro_schema_new { " + + " optional group my_list (LIST) { " + + " repeated group list { " + + " optional int32 element; " + + " } " + + " } " + + "}"); + Iterable> values = createTestArrays(limit(cycle(asList(1, null, 3, 5, null, null, null, 7, 11, null, 13, 17)), 30_000)); + tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), values, values, "my_list", new ArrayType(INTEGER), Optional.of(parquetMrAvroSchema)); + } + + /** + * Test reading various arrays schemas compatible with spec + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + */ + @Test + public void testArraySchemas() + throws Exception + { + MessageType parquetMrNullableSpecSchema = parseMessageType("message hive_schema {" + + " optional group my_list (LIST){" + + " repeated group list {" + + " required int32 element;" + + " }" + + " }" + + "} "); + Iterable> nonNullArrayElements = createTestArrays(intsBetween(0, 31_234)); + tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), nonNullArrayElements, nonNullArrayElements, "my_list", new ArrayType(INTEGER), Optional.of(parquetMrNullableSpecSchema)); + + MessageType parquetMrNonNullSpecSchema = parseMessageType("message hive_schema {" + + " required group my_list (LIST){" + + " repeated group list {" + + " optional int32 element;" + + " }" + + " }" + + "} "); + Iterable> values = createTestArrays(limit(cycle(asList(1, null, 3, 5, null, null, null, 7, 11, null, 13, 17)), 30_000)); + tester.assertRoundTrip(singletonList(getStandardListObjectInspector(javaIntObjectInspector)), new Iterable[] {values}, new Iterable[] { + values}, singletonList("my_list"), singletonList(new ArrayType(INTEGER)), Optional.of(parquetMrNonNullSpecSchema)); + + MessageType sparkSchema = parseMessageType("message hive_schema {" + + " optional group my_list (LIST){" + + " repeated group list {" + + " optional int32 element;" + + " }" + + " }" + + "} "); + tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), values, values, "my_list", new ArrayType(INTEGER), Optional.of(sparkSchema)); + + MessageType hiveSchema = parseMessageType("message hive_schema {" + + " optional group my_list (LIST){" + + " repeated group bag {" + + " optional int32 array_element;" + + " }" + + " }" + + "} "); + tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), values, values, "my_list", new ArrayType(INTEGER), Optional.of(hiveSchema)); + + MessageType customNamingSchema = parseMessageType("message hive_schema {" + + " optional group my_list (LIST){" + + " repeated group bag {" + + " optional int32 array;" + + " }" + + " }" + + "} "); + tester.testRoundTrip(getStandardListObjectInspector(javaIntObjectInspector), values, values, "my_list", new ArrayType(INTEGER), Optional.of(customNamingSchema)); + } + + /** + * Test reading 
various maps schemas compatible with spec + * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + */ + @Test + public void testMapSchemas() + throws Exception + { + Iterable> values = createTestMaps(transform(intsBetween(0, 100_000), Object::toString), intsBetween(0, 10_000)); + Iterable> nullableValues = createTestMaps(transform(intsBetween(0, 30_000), Object::toString), limit(cycle(asList(1, null, 3, 5, null, null, null, 7, 11, null, 13, 17)), 30_000)); + tester.testRoundTrip(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector), values, values, mapType(VARCHAR, INTEGER)); + + // Map (nullable map, non-null values) + MessageType map = parseMessageType("message hive_schema {" + + " optional group my_map (MAP) {" + + " repeated group map { " + + " required binary str (UTF8); " + + " required int32 num; " + + " } " + + " }" + + "} "); + tester.testRoundTrip(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector), values, values, "my_map", mapType(VARCHAR, INTEGER), Optional.of(map)); + + // Map (nullable map, non-null values) + map = parseMessageType("message hive_schema {" + + " optional group my_map (MAP_KEY_VALUE) {" + + " repeated group map { " + + " required binary str (UTF8); " + + " required int32 num; " + + " } " + + " }" + + "} "); + tester.testRoundTrip(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector), values, values, "my_map", mapType(VARCHAR, INTEGER), Optional.of(map)); + + // Map (non-null map, nullable values) + map = parseMessageType("message hive_schema {" + + " required group my_map (MAP) { " + + " repeated group map { " + + " required binary key (UTF8); " + + " optional int32 value; " + + " } " + + " }" + + " } "); + tester.assertRoundTrip(singletonList(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector)), new Iterable[] {nullableValues}, + new Iterable[] {nullableValues}, singletonList("my_map"), singletonList(mapType(VARCHAR, INTEGER)), Optional.of(map)); + + // Map (non-null map, nullable values) + map = parseMessageType("message hive_schema {" + + " required group my_map (MAP_KEY_VALUE) { " + + " repeated group map { " + + " required binary key (UTF8); " + + " optional int32 value; " + + " } " + + " }" + + " } "); + tester.assertRoundTrip(singletonList(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector)), new Iterable[] {nullableValues}, + new Iterable[] {nullableValues}, singletonList("my_map"), singletonList(mapType(VARCHAR, INTEGER)), Optional.of(map)); + + // Map (non-null map, nullable values) + map = parseMessageType("message hive_schema {" + + " required group my_map (MAP) { " + + " repeated group map { " + + " required binary key (UTF8); " + + " required int32 value; " + + " } " + + " }" + + " } "); + tester.assertRoundTrip(singletonList(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector)), new Iterable[] {values}, + new Iterable[] {values}, singletonList("my_map"), singletonList(mapType(VARCHAR, INTEGER)), Optional.of(map)); + + // Map (non-null map, nullable values) + map = parseMessageType("message hive_schema {" + + " required group my_map (MAP_KEY_VALUE) { " + + " repeated group map { " + + " required binary key (UTF8); " + + " required int32 value; " + + " } " + + " }" + + " } "); + tester.assertRoundTrip(singletonList(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector)), new Iterable[] {values}, + new Iterable[] {values}, 
singletonList("my_map"), singletonList(mapType(VARCHAR, INTEGER)), Optional.of(map)); + + // Map (nullable map, nullable values) + map = parseMessageType("message hive_schema {" + + " optional group my_map (MAP) { " + + " repeated group map { " + + " required binary key (UTF8); " + + " optional int32 value; " + + " } " + + " }" + + " } "); + tester.testRoundTrip(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector), nullableValues, nullableValues, "my_map", mapType(VARCHAR, INTEGER), Optional.of(map)); + + // Map (nullable map, nullable values) + map = parseMessageType("message hive_schema {" + + " optional group my_map (MAP_KEY_VALUE) { " + + " repeated group map { " + + " required binary key (UTF8); " + + " optional int32 value; " + + " } " + + " }" + + " } "); + tester.testRoundTrip(getStandardMapObjectInspector(javaStringObjectInspector, javaIntObjectInspector), nullableValues, nullableValues, "my_map", mapType(VARCHAR, INTEGER), Optional.of(map)); + } + + @Test + public void testLongStrideDictionary() + throws Exception + { + testRoundTripNumeric(concat(ImmutableList.of(1), Collections.nCopies(9999, 123), ImmutableList.of(2), Collections.nCopies(9999, 123))); + } + + private void testRoundTripNumeric(Iterable writeValues) + throws Exception + { + tester.testRoundTrip(javaByteObjectInspector, + transform(writeValues, AbstractTestParquetReader::intToByte), + AbstractTestParquetReader::byteToInt, + INTEGER); + + tester.testRoundTrip(javaShortObjectInspector, + transform(writeValues, AbstractTestParquetReader::intToShort), + AbstractTestParquetReader::shortToInt, + INTEGER); + + tester.testRoundTrip(javaIntObjectInspector, writeValues, INTEGER); + tester.testRoundTrip(javaLongObjectInspector, transform(writeValues, AbstractTestParquetReader::intToLong), BIGINT); + tester.testRoundTrip(javaTimestampObjectInspector, + transform(writeValues, AbstractTestParquetReader::intToTimestamp), + transform(writeValues, AbstractTestParquetReader::intToSqlTimestamp), + TIMESTAMP); + + tester.testRoundTrip(javaDateObjectInspector, + transform(writeValues, AbstractTestParquetReader::intToDate), + transform(writeValues, AbstractTestParquetReader::intToSqlDate), + DATE); + } + + @Test + public void testFloatSequence() + throws Exception + { + tester.testRoundTrip(javaFloatObjectInspector, floatSequence(0.0f, 0.1f, 30_000), REAL); + } + + @Test + public void testFloatNaNInfinity() + throws Exception + { + tester.testRoundTrip(javaFloatObjectInspector, ImmutableList.of(1000.0f, -1.23f, Float.POSITIVE_INFINITY), REAL); + tester.testRoundTrip(javaFloatObjectInspector, ImmutableList.of(-1000.0f, Float.NEGATIVE_INFINITY, 1.23f), REAL); + tester.testRoundTrip(javaFloatObjectInspector, ImmutableList.of(0.0f, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY), REAL); + + tester.testRoundTrip(javaFloatObjectInspector, ImmutableList.of(Float.NaN, -0.0f, 1.0f), REAL); + tester.testRoundTrip(javaFloatObjectInspector, ImmutableList.of(Float.NaN, -1.0f, Float.POSITIVE_INFINITY), REAL); + tester.testRoundTrip(javaFloatObjectInspector, ImmutableList.of(Float.NaN, Float.NEGATIVE_INFINITY, 1.0f), REAL); + tester.testRoundTrip(javaFloatObjectInspector, ImmutableList.of(Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY), REAL); + } + + @Test + public void testDoubleSequence() + throws Exception + { + tester.testRoundTrip(javaDoubleObjectInspector, doubleSequence(0, 0.1, 30_000), DOUBLE); + } + + @Test + public void testDoubleNaNInfinity() + throws Exception + { + 
+        tester.testRoundTrip(javaDoubleObjectInspector, ImmutableList.of(1000.0, -1.0, Double.POSITIVE_INFINITY), DOUBLE);
+        tester.testRoundTrip(javaDoubleObjectInspector, ImmutableList.of(-1000.0, Double.NEGATIVE_INFINITY, 1.0), DOUBLE);
+        tester.testRoundTrip(javaDoubleObjectInspector, ImmutableList.of(0.0, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY), DOUBLE);
+
+        tester.testRoundTrip(javaDoubleObjectInspector, ImmutableList.of(Double.NaN, -1.0, 1.0), DOUBLE);
+        tester.testRoundTrip(javaDoubleObjectInspector, ImmutableList.of(Double.NaN, -1.0, Double.POSITIVE_INFINITY), DOUBLE);
+        tester.testRoundTrip(javaDoubleObjectInspector, ImmutableList.of(Double.NaN, Double.NEGATIVE_INFINITY, 1.0), DOUBLE);
+        tester.testRoundTrip(javaDoubleObjectInspector, ImmutableList.of(Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY), DOUBLE);
+    }
+
+    @Test
+    public void testStringUnicode()
+            throws Exception
+    {
+        tester.testRoundTrip(javaStringObjectInspector, limit(cycle(ImmutableList.of("apple", "apple pie", "apple\uD835\uDC03", "apple\uFFFD")), 30_000), createUnboundedVarcharType());
+    }
+
+    @Test
+    public void testStringDirectSequence()
+            throws Exception
+    {
+        tester.testRoundTrip(javaStringObjectInspector, transform(intsBetween(0, 30_000), Object::toString), createUnboundedVarcharType());
+    }
+
+    @Test
+    public void testStringDictionarySequence()
+            throws Exception
+    {
+        tester.testRoundTrip(javaStringObjectInspector, limit(cycle(transform(ImmutableList.of(1, 3, 5, 7, 11, 13, 17), Object::toString)), 30_000), createUnboundedVarcharType());
+    }
+
+    @Test
+    public void testStringStrideDictionary()
+            throws Exception
+    {
+        tester.testRoundTrip(javaStringObjectInspector, concat(ImmutableList.of("a"), Collections.nCopies(9999, "123"), ImmutableList.of("b"), Collections.nCopies(9999, "123")), createUnboundedVarcharType());
+    }
+
+    @Test
+    public void testEmptyStringSequence()
+            throws Exception
+    {
+        tester.testRoundTrip(javaStringObjectInspector, limit(cycle(""), 30_000), createUnboundedVarcharType());
+    }
+
+    @Test
+    public void testBinaryDirectSequence()
+            throws Exception
+    {
+        Iterable<byte[]> writeValues = transform(intsBetween(0, 30_000), compose(AbstractTestParquetReader::stringToByteArray, Object::toString));
+        tester.testRoundTrip(javaByteArrayObjectInspector,
+                writeValues,
+                transform(writeValues, AbstractTestParquetReader::byteArrayToVarbinary),
+                VARBINARY);
+    }
+
+    @Test
+    public void testBinaryDictionarySequence()
+            throws Exception
+    {
+        Iterable<byte[]> writeValues = limit(cycle(transform(ImmutableList.of(1, 3, 5, 7, 11, 13, 17), compose(AbstractTestParquetReader::stringToByteArray, Object::toString))), 30_000);
+        tester.testRoundTrip(javaByteArrayObjectInspector,
+                writeValues,
+                transform(writeValues, AbstractTestParquetReader::byteArrayToVarbinary),
+                VARBINARY);
+    }
+
+    @Test
+    public void testEmptyBinarySequence()
+            throws Exception
+    {
+        tester.testRoundTrip(javaByteArrayObjectInspector, limit(cycle(new byte[0]), 30_000), AbstractTestParquetReader::byteArrayToVarbinary, VARBINARY);
+    }
+
+    private static <T> Iterable<T> skipEvery(int n, Iterable<T> iterable)
+    {
+        return () -> new AbstractIterator<T>()
+        {
+            private final Iterator<T> delegate = iterable.iterator();
+            private int position;
+
+            @Override
+            protected T computeNext()
+            {
+                while (true) {
+                    if (!delegate.hasNext()) {
+                        return endOfData();
+                    }
+
+                    T next = delegate.next();
+                    position++;
+                    if (position <= n) {
+                        return next;
+                    }
+                    position = 0;
+                }
+            }
+        };
+    }
+
+    @Test
+    public void testStructMaxReadBytes()
+            throws Exception
+    {
DataSize maxReadBlockSize = new DataSize(1_000, DataSize.Unit.BYTE); + List> structValues = createTestStructs( + Collections.nCopies(500, join("", Collections.nCopies(33, "test"))), + Collections.nCopies(500, join("", Collections.nCopies(1, "test")))); + List structFieldNames = asList("a", "b"); + Type structType = RowType.from(asList(field("a", VARCHAR), field("b", VARCHAR))); + + tester.testMaxReadBytes( + getStandardStructObjectInspector(structFieldNames, asList(javaStringObjectInspector, javaStringObjectInspector)), + structValues, + structValues, + structType, + maxReadBlockSize); + } + + @Test + public void testArrayMaxReadBytes() + throws Exception + { + DataSize maxReadBlockSize = new DataSize(1_000, DataSize.Unit.BYTE); + Iterable> values = createFixedTestArrays(limit(cycle(asList(1, null, 3, 5, null, null, null, 7, 11, null, 13, 17)), 30_000)); + tester.testMaxReadBytes(getStandardListObjectInspector(javaIntObjectInspector), values, values, new ArrayType(INTEGER), maxReadBlockSize); + } + + @Test + public void testMapMaxReadBytes() + throws Exception + { + DataSize maxReadBlockSize = new DataSize(1_000, DataSize.Unit.BYTE); + Iterable> values = createFixedTestMaps(Collections.nCopies(5_000, join("", Collections.nCopies(33, "test"))), longsBetween(0, 5_000)); + tester.testMaxReadBytes(getStandardMapObjectInspector(javaStringObjectInspector, javaLongObjectInspector), values, values, mapType(VARCHAR, BIGINT), maxReadBlockSize); + } + + private static Iterable repeatEach(int n, Iterable iterable) + { + return () -> new AbstractIterator() + { + private final Iterator delegate = iterable.iterator(); + private int position; + private T value; + + @Override + protected T computeNext() + { + if (position == 0) { + if (!delegate.hasNext()) { + return endOfData(); + } + value = delegate.next(); + } + + position++; + if (position >= n) { + position = 0; + } + return value; + } + }; + } + + private static Iterable floatSequence(double start, double step, int items) + { + return transform(doubleSequence(start, step, items), input -> { + if (input == null) { + return null; + } + return input.floatValue(); + }); + } + + private static Iterable doubleSequence(double start, double step, int items) + { + return () -> new AbstractSequentialIterator(start) + { + private int item; + + @Override + protected Double computeNext(Double previous) + { + if (item >= items) { + return null; + } + item++; + return previous + step; + } + }; + } + + private static ContiguousSet intsBetween(int lowerInclusive, int upperExclusive) + { + return ContiguousSet.create(Range.closedOpen(lowerInclusive, upperExclusive), DiscreteDomain.integers()); + } + + private static ContiguousSet longsBetween(long lowerInclusive, long upperExclusive) + { + return ContiguousSet.create(Range.closedOpen(lowerInclusive, upperExclusive), DiscreteDomain.longs()); + } + + private static ContiguousSet bigIntegersBetween(BigInteger lowerInclusive, BigInteger upperExclusive) + { + return ContiguousSet.create(Range.closedOpen(lowerInclusive, upperExclusive), DiscreteDomain.bigIntegers()); + } + + private List> createTestStructs(Iterable fieldValues) + { + checkArgument(fieldValues.iterator().hasNext(), "struct field values cannot be empty"); + List> structs = new ArrayList<>(); + for (F field : fieldValues) { + structs.add(singletonList(field)); + } + return structs; + } + + private List> createTestStructs(Iterable... 
values) + { + List> structs = new ArrayList<>(); + List> iterators = Arrays.stream(values).map(Iterable::iterator).collect(Collectors.toList()); + iterators.forEach(iter -> checkArgument(iter.hasNext(), "struct field values cannot be empty")); + while (iterators.stream().allMatch(Iterator::hasNext)) { + structs.add(iterators.stream().map(Iterator::next).collect(Collectors.toList())); + } + return structs; + } + + private Iterable> createNullableTestStructs(Iterable... values) + { + return insertNullEvery(ThreadLocalRandom.current().nextInt(2, 5), createTestStructs(values)); + } + + private List> createTestArrays(Iterable values) + { + List> arrays = new ArrayList<>(); + Iterator valuesIter = values.iterator(); + List array = new ArrayList<>(); + while (valuesIter.hasNext()) { + if (ThreadLocalRandom.current().nextBoolean()) { + arrays.add(array); + array = new ArrayList<>(); + } + if (ThreadLocalRandom.current().nextInt(10) == 0) { + arrays.add(Collections.emptyList()); + } + array.add(valuesIter.next()); + } + return arrays; + } + + private Iterable> createNullableTestArrays(Iterable values) + { + return insertNullEvery(ThreadLocalRandom.current().nextInt(2, 5), createTestArrays(values)); + } + + private List> createFixedTestArrays(Iterable values) + { + List> arrays = new ArrayList<>(); + Iterator valuesIter = values.iterator(); + List array = new ArrayList<>(); + int count = 1; + while (valuesIter.hasNext()) { + if (count % 10 == 0) { + arrays.add(array); + array = new ArrayList<>(); + } + if (count % 20 == 0) { + arrays.add(Collections.emptyList()); + } + array.add(valuesIter.next()); + ++count; + } + return arrays; + } + + private Iterable> createFixedTestMaps(Iterable keys, Iterable values) + { + List> maps = new ArrayList<>(); + Iterator keysIterator = keys.iterator(); + Iterator valuesIterator = values.iterator(); + Map map = new HashMap<>(); + int count = 1; + while (keysIterator.hasNext() && valuesIterator.hasNext()) { + if (count % 5 == 0) { + maps.add(map); + map = new HashMap<>(); + } + if (count % 10 == 0) { + maps.add(Collections.emptyMap()); + } + map.put(keysIterator.next(), valuesIterator.next()); + ++count; + } + return maps; + } + + private Iterable> createTestMaps(Iterable keys, Iterable values) + { + List> maps = new ArrayList<>(); + Iterator keysIterator = keys.iterator(); + Iterator valuesIterator = values.iterator(); + Map map = new HashMap<>(); + while (keysIterator.hasNext() && valuesIterator.hasNext()) { + if (ThreadLocalRandom.current().nextInt(5) == 0) { + maps.add(map); + map = new HashMap<>(); + } + if (ThreadLocalRandom.current().nextInt(10) == 0) { + maps.add(Collections.emptyMap()); + } + map.put(keysIterator.next(), valuesIterator.next()); + } + return maps; + } + + private Iterable> createNullableTestMaps(Iterable keys, Iterable values) + { + return insertNullEvery(ThreadLocalRandom.current().nextInt(2, 5), createTestMaps(keys, values)); + } + + private static Byte intToByte(Integer input) + { + if (input == null) { + return null; + } + return (byte) (input & 0xFF); + } + + private static Short intToShort(Integer input) + { + if (input == null) { + return null; + } + return Shorts.checkedCast(input); + } + + private static Integer byteToInt(Byte input) + { + return toInteger(input); + } + + private static Integer shortToInt(Short input) + { + return toInteger(input); + } + + private static Long intToLong(Integer input) + { + return toLong(input); + } + + private static Integer toInteger(N input) + { + if (input == null) { + return null; + } + return 
input.intValue(); + } + + private static Long toLong(N input) + { + if (input == null) { + return null; + } + return input.longValue(); + } + + private static byte[] stringToByteArray(String input) + { + return input.getBytes(UTF_8); + } + + private static SqlVarbinary byteArrayToVarbinary(byte[] input) + { + if (input == null) { + return null; + } + return new SqlVarbinary(input); + } + + private static Timestamp intToTimestamp(Integer input) + { + if (input == null) { + return null; + } + + long seconds = (input / 1000); + int nanos = ((input % 1000) * 1_000_000); + + // add some junk nanos to the timestamp, which will be truncated + nanos += 888_888; + + if (nanos < 0) { + nanos += 1_000_000_000; + seconds -= 1; + } + if (nanos > 1_000_000_000) { + nanos -= 1_000_000_000; + seconds += 1; + } + return Timestamp.ofEpochSecond(seconds, nanos); + } + + private static SqlTimestamp intToSqlTimestamp(Integer input) + { + if (input == null) { + return null; + } + return sqlTimestampOf((long) input); + } + + private static Date intToDate(Integer input) + { + if (input == null) { + return null; + } + return Date.ofEpochDay(input); + } + + private static SqlDate intToSqlDate(Integer input) + { + if (input == null) { + return null; + } + return new SqlDate(input); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/ParquetTester.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/ParquetTester.java new file mode 100644 index 00000000..c475d824 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/ParquetTester.java @@ -0,0 +1,652 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet; + +import com.google.common.base.Joiner; +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import io.airlift.slice.Slice; +import io.airlift.units.DataSize; +import io.prestosql.plugin.hive.AbstractTestHiveFileFormats; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveStorageFormat; +import io.prestosql.plugin.hive.HiveTestUtils; +import io.prestosql.plugin.hive.OrcFileWriterConfig; +import io.prestosql.plugin.hive.ParquetFileWriterConfig; +import io.prestosql.plugin.hive.benchmark.FileFormat; +import io.prestosql.plugin.hive.parquet.write.MapKeyValuesSchemaConverter; +import io.prestosql.plugin.hive.parquet.write.SingleLevelArrayMapKeyValuesSchemaConverter; +import io.prestosql.plugin.hive.parquet.write.SingleLevelArraySchemaConverter; +import io.prestosql.plugin.hive.parquet.write.TestMapredParquetOutputFormat; +import io.prestosql.spi.Page; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.RecordCursor; +import io.prestosql.spi.connector.RecordPageSource; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.DateType; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.MapType; +import io.prestosql.spi.type.SqlDate; +import io.prestosql.spi.type.SqlDecimal; +import io.prestosql.spi.type.SqlTimestamp; +import io.prestosql.spi.type.SqlVarbinary; +import io.prestosql.spi.type.TimestampType; +import io.prestosql.spi.type.Type; +import io.prestosql.testing.TestingConnectorSession; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.MessageType; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; +import java.util.function.Function; + +import static com.google.common.base.Functions.constant; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.Iterables.transform; +import static io.prestosql.plugin.hive.HiveSessionProperties.getParquetMaxReadBlockSize; +import static io.prestosql.plugin.hive.HiveUtil.isArrayType; +import static io.prestosql.plugin.hive.HiveUtil.isMapType; +import 
static io.prestosql.plugin.hive.HiveUtil.isRowType; +import static io.prestosql.plugin.hive.HiveUtil.isStructuralType; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.Varchars.isVarcharType; +import static java.util.Arrays.stream; +import static java.util.Collections.singletonList; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfosFromTypeString; +import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; +import static org.apache.parquet.hadoop.ParquetOutputFormat.COMPRESSION; +import static org.apache.parquet.hadoop.ParquetOutputFormat.ENABLE_DICTIONARY; +import static org.apache.parquet.hadoop.ParquetOutputFormat.WRITER_VERSION; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.GZIP; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZ4; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.LZO; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.SNAPPY; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED; +import static org.apache.parquet.hadoop.metadata.CompressionCodecName.ZSTD; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class ParquetTester +{ + private static final boolean OPTIMIZED = true; + private static final HiveConfig HIVE_CLIENT_CONFIG = createHiveConfig(false); + private static final HdfsEnvironment HDFS_ENVIRONMENT = HiveTestUtils.createTestHdfsEnvironment(HIVE_CLIENT_CONFIG); + private static final TestingConnectorSession SESSION = new TestingConnectorSession(new HiveSessionProperties(HIVE_CLIENT_CONFIG, new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + private static final TestingConnectorSession SESSION_USE_NAME = new TestingConnectorSession(new HiveSessionProperties(createHiveConfig(true), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + private static final List TEST_COLUMN = singletonList("test"); + + private Set compressions = ImmutableSet.of(); + + private Set versions = ImmutableSet.of(); + + private Set sessions = ImmutableSet.of(); + + public static ParquetTester quickParquetTester() + { + ParquetTester parquetTester = new ParquetTester(); + parquetTester.compressions = ImmutableSet.of(GZIP); + parquetTester.versions = ImmutableSet.of(PARQUET_1_0); + parquetTester.sessions = ImmutableSet.of(SESSION); + return parquetTester; + } + + public static ParquetTester fullParquetTester() + { + ParquetTester parquetTester = new ParquetTester(); + parquetTester.compressions = ImmutableSet.of(GZIP, UNCOMPRESSED, SNAPPY, LZO, LZ4, ZSTD); + parquetTester.versions = ImmutableSet.copyOf(WriterVersion.values()); + parquetTester.sessions = ImmutableSet.of(SESSION, SESSION_USE_NAME); + return parquetTester; + } + + public void testRoundTrip(PrimitiveObjectInspector columnObjectInspector, Iterable writeValues, Type parameterType) + throws Exception + { + testRoundTrip(columnObjectInspector, writeValues, writeValues, parameterType); + } + + public void testRoundTrip(PrimitiveObjectInspector columnObjectInspector, Iterable writeValues, Function readTransform, Type parameterType) + throws Exception + { + testRoundTrip(columnObjectInspector, writeValues, transform(writeValues, 
readTransform::apply), parameterType); + } + + public void testSingleLevelArraySchemaRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, Type type) + throws Exception + { + List typeInfos = getTypeInfosFromTypeString(objectInspector.getTypeName()); + MessageType schema = SingleLevelArraySchemaConverter.convert(TEST_COLUMN, typeInfos); + testSingleLevelArrayRoundTrip(objectInspector, writeValues, readValues, type, Optional.of(schema)); + if (objectInspector.getTypeName().contains("map<")) { + schema = SingleLevelArrayMapKeyValuesSchemaConverter.convert(TEST_COLUMN, typeInfos); + testSingleLevelArrayRoundTrip(objectInspector, writeValues, readValues, type, Optional.of(schema)); + } + } + + public void testRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, Type type) + throws Exception + { + // just the values + testRoundTripType(singletonList(objectInspector), new Iterable[] {writeValues}, + new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), Optional.empty(), false); + + // all nulls + assertRoundTrip(singletonList(objectInspector), new Iterable[] {transform(writeValues, constant(null))}, + new Iterable[] {transform(writeValues, constant(null))}, TEST_COLUMN, singletonList(type), Optional.empty()); + if (objectInspector.getTypeName().contains("map<")) { + List typeInfos = getTypeInfosFromTypeString(objectInspector.getTypeName()); + MessageType schema = MapKeyValuesSchemaConverter.convert(TEST_COLUMN, typeInfos); + // just the values + testRoundTripType(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] { + readValues}, TEST_COLUMN, singletonList(type), Optional.of(schema), false); + + // all nulls + assertRoundTrip(singletonList(objectInspector), new Iterable[] {transform(writeValues, constant(null))}, + new Iterable[] {transform(writeValues, constant(null))}, TEST_COLUMN, singletonList(type), Optional.of(schema)); + } + } + + public void testRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, Type type, Optional parquetSchema) + throws Exception + { + testRoundTrip(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), parquetSchema, false); + } + + public void testSingleLevelArrayRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, Type type, Optional parquetSchema) + throws Exception + { + testRoundTrip(singletonList(objectInspector), new Iterable[] {writeValues}, new Iterable[] {readValues}, TEST_COLUMN, singletonList(type), parquetSchema, true); + } + + public void testRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, String columnName, Type type, Optional parquetSchema) + throws Exception + { + testRoundTrip( + singletonList(objectInspector), + new Iterable[] {writeValues}, + new Iterable[] {readValues}, + singletonList(columnName), + singletonList(type), + parquetSchema, + false); + } + + public void testSingleLevelArrayRoundTrip(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, String columnName, Type type, Optional parquetSchema) + throws Exception + { + testRoundTrip( + singletonList(objectInspector), + new Iterable[] {writeValues}, + new Iterable[] {readValues}, + singletonList(columnName), + singletonList(type), + parquetSchema, + true); + } + + public void testRoundTrip(List objectInspectors, Iterable[] writeValues, Iterable[] readValues, List columnNames, List columnTypes, 
Optional parquetSchema, boolean singleLevelArray) + throws Exception + { + // just the values + testRoundTripType(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema, singleLevelArray); + + // all nulls + assertRoundTrip(objectInspectors, transformToNulls(writeValues), transformToNulls(readValues), columnNames, columnTypes, parquetSchema, singleLevelArray); + } + + private void testRoundTripType( + List objectInspectors, + Iterable[] writeValues, + Iterable[] readValues, + List columnNames, + List columnTypes, + Optional parquetSchema, + boolean singleLevelArray) + throws Exception + { + // forward order + assertRoundTrip(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema, singleLevelArray); + + // reverse order + assertRoundTrip(objectInspectors, reverse(writeValues), reverse(readValues), columnNames, columnTypes, parquetSchema, singleLevelArray); + + // forward order with nulls + assertRoundTrip(objectInspectors, insertNullEvery(5, writeValues), insertNullEvery(5, readValues), columnNames, columnTypes, parquetSchema, singleLevelArray); + + // reverse order with nulls + assertRoundTrip(objectInspectors, insertNullEvery(5, reverse(writeValues)), insertNullEvery(5, reverse(readValues)), columnNames, columnTypes, parquetSchema, singleLevelArray); + } + + void assertRoundTrip( + List objectInspectors, + Iterable[] writeValues, + Iterable[] readValues, + List columnNames, + List columnTypes, + Optional parquetSchema) + throws Exception + { + assertRoundTrip(objectInspectors, writeValues, readValues, columnNames, columnTypes, parquetSchema, false); + } + + void assertRoundTrip( + List objectInspectors, + Iterable[] writeValues, + Iterable[] readValues, + List columnNames, + List columnTypes, + Optional parquetSchema, + boolean singleLevelArray) + throws Exception + { + for (WriterVersion version : versions) { + for (CompressionCodecName compressionCodecName : compressions) { + for (ConnectorSession session : sessions) { + try (TempFile tempFile = new TempFile("test", "parquet")) { + JobConf jobConf = new JobConf(); + jobConf.setEnum(COMPRESSION, compressionCodecName); + jobConf.setBoolean(ENABLE_DICTIONARY, true); + jobConf.setEnum(WRITER_VERSION, version); + writeParquetColumn( + jobConf, + tempFile.getFile(), + compressionCodecName, + createTableProperties(columnNames, objectInspectors), + getStandardStructObjectInspector(columnNames, objectInspectors), + getIterators(writeValues), + parquetSchema, + singleLevelArray); + assertFileContents( + session, + tempFile.getFile(), + getIterators(readValues), + columnNames, + columnTypes); + } + } + } + } + } + + static void testMaxReadBytes(ObjectInspector objectInspector, Iterable writeValues, Iterable readValues, Type type, DataSize maxReadBlockSize) + throws Exception + { + assertMaxReadBytes( + singletonList(objectInspector), + new Iterable[] {writeValues}, + new Iterable[] {readValues}, + TEST_COLUMN, + singletonList(type), + Optional.empty(), + maxReadBlockSize); + } + + static void assertMaxReadBytes( + List objectInspectors, + Iterable[] writeValues, + Iterable[] readValues, + List columnNames, + List columnTypes, + Optional parquetSchema, + DataSize maxReadBlockSize) + throws Exception + { + CompressionCodecName compressionCodecName = UNCOMPRESSED; + HiveConfig config = new HiveConfig() + .setHiveStorageFormat(HiveStorageFormat.PARQUET) + .setUseParquetColumnNames(false) + .setParquetMaxReadBlockSize(maxReadBlockSize); + ConnectorSession session = new TestingConnectorSession(new 
HiveSessionProperties(config, new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + + try (TempFile tempFile = new TempFile("test", "parquet")) { + JobConf jobConf = new JobConf(); + jobConf.setEnum(COMPRESSION, compressionCodecName); + jobConf.setBoolean(ENABLE_DICTIONARY, true); + jobConf.setEnum(WRITER_VERSION, PARQUET_1_0); + writeParquetColumn( + jobConf, + tempFile.getFile(), + compressionCodecName, + createTableProperties(columnNames, objectInspectors), + getStandardStructObjectInspector(columnNames, objectInspectors), + getIterators(writeValues), + parquetSchema, + false); + + Iterator[] expectedValues = getIterators(readValues); + try (ConnectorPageSource pageSource = getFileFormat().createFileFormatReader( + session, + HDFS_ENVIRONMENT, + tempFile.getFile(), + columnNames, + columnTypes)) { + assertPageSource( + columnTypes, + expectedValues, + pageSource, + Optional.of(getParquetMaxReadBlockSize(session).toBytes())); + assertFalse(stream(expectedValues).allMatch(Iterator::hasNext)); + } + } + } + + private static void assertFileContents( + ConnectorSession session, + File dataFile, + Iterator[] expectedValues, + List columnNames, + List columnTypes) + throws IOException + { + try (ConnectorPageSource pageSource = getFileFormat().createFileFormatReader( + session, + HDFS_ENVIRONMENT, + dataFile, + columnNames, + columnTypes)) { + if (pageSource instanceof RecordPageSource) { + assertRecordCursor(columnTypes, expectedValues, ((RecordPageSource) pageSource).getCursor()); + } + else { + assertPageSource(columnTypes, expectedValues, pageSource); + } + assertFalse(stream(expectedValues).allMatch(Iterator::hasNext)); + } + } + + private static void assertPageSource(List types, Iterator[] valuesByField, ConnectorPageSource pageSource) + { + assertPageSource(types, valuesByField, pageSource, Optional.empty()); + } + + private static void assertPageSource(List types, Iterator[] valuesByField, ConnectorPageSource pageSource, Optional maxReadBlockSize) + { + while (!pageSource.isFinished()) { + Page page = pageSource.getNextPage(); + if (page == null) { + continue; + } + + maxReadBlockSize.ifPresent(max -> + assertTrue(page.getPositionCount() == 1 || page.getSizeInBytes() <= max)); + + for (int field = 0; field < page.getChannelCount(); field++) { + Block block = page.getBlock(field); + for (int i = 0; i < block.getPositionCount(); i++) { + assertTrue(valuesByField[field].hasNext()); + Object expected = valuesByField[field].next(); + Object actual = decodeObject(types.get(field), block, i); + assertEquals(actual, expected); + } + } + } + } + + private static void assertRecordCursor(List types, Iterator[] valuesByField, RecordCursor cursor) + { + while (cursor.advanceNextPosition()) { + for (int field = 0; field < types.size(); field++) { + assertTrue(valuesByField[field].hasNext()); + Object expected = valuesByField[field].next(); + Object actual = getActualCursorValue(cursor, types.get(field), field); + assertEquals(actual, expected); + } + } + } + + private static Object getActualCursorValue(RecordCursor cursor, Type type, int field) + { + Object fieldFromCursor = AbstractTestHiveFileFormats.getFieldFromCursor(cursor, type, field); + if (fieldFromCursor == null) { + return null; + } + if (isStructuralType(type)) { + Block block = (Block) fieldFromCursor; + if (isArrayType(type)) { + Type elementType = ((ArrayType) type).getElementType(); + return toArrayValue(block, elementType); + } + if (isMapType(type)) { + MapType mapType = (MapType) type; + return 
toMapValue(block, mapType.getKeyType(), mapType.getValueType()); + } + if (isRowType(type)) { + return toRowValue(block, type.getTypeParameters()); + } + } + if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType) type; + return new SqlDecimal((BigInteger) fieldFromCursor, decimalType.getPrecision(), decimalType.getScale()); + } + if (isVarcharType(type)) { + return new String(((Slice) fieldFromCursor).getBytes()); + } + if (VARBINARY.equals(type)) { + return new SqlVarbinary(((Slice) fieldFromCursor).getBytes()); + } + if (DateType.DATE.equals(type)) { + return new SqlDate(((Long) fieldFromCursor).intValue()); + } + if (TimestampType.TIMESTAMP.equals(type)) { + return new SqlTimestamp((long) fieldFromCursor); + } + return fieldFromCursor; + } + + private static Map toMapValue(Block mapBlock, Type keyType, Type valueType) + { + Map map = new HashMap<>(mapBlock.getPositionCount() * 2); + for (int i = 0; i < mapBlock.getPositionCount(); i += 2) { + map.put(keyType.getObjectValue(SESSION, mapBlock, i), valueType.getObjectValue(SESSION, mapBlock, i + 1)); + } + return Collections.unmodifiableMap(map); + } + + private static List toArrayValue(Block arrayBlock, Type elementType) + { + List values = new ArrayList<>(); + for (int position = 0; position < arrayBlock.getPositionCount(); position++) { + values.add(elementType.getObjectValue(SESSION, arrayBlock, position)); + } + return Collections.unmodifiableList(values); + } + + private static List toRowValue(Block rowBlock, List fieldTypes) + { + List values = new ArrayList<>(rowBlock.getPositionCount()); + for (int i = 0; i < rowBlock.getPositionCount(); i++) { + values.add(fieldTypes.get(i).getObjectValue(SESSION, rowBlock, i)); + } + return Collections.unmodifiableList(values); + } + + private static HiveConfig createHiveConfig(boolean useParquetColumnNames) + { + return new HiveConfig() + .setHiveStorageFormat(HiveStorageFormat.PARQUET) + .setUseParquetColumnNames(useParquetColumnNames); + } + + private static FileFormat getFileFormat() + { + return OPTIMIZED ? 
FileFormat.PRESTO_PARQUET : FileFormat.HIVE_PARQUET; + } + + private static void writeParquetColumn( + JobConf jobConf, + File outputFile, + CompressionCodecName compressionCodecName, + Properties tableProperties, + SettableStructObjectInspector objectInspector, + Iterator[] valuesByField, + Optional parquetSchema, + boolean singleLevelArray) + throws Exception + { + RecordWriter recordWriter = new TestMapredParquetOutputFormat(parquetSchema, singleLevelArray) + .getHiveRecordWriter( + jobConf, + new Path(outputFile.toURI()), + Text.class, + compressionCodecName != UNCOMPRESSED, + tableProperties, + () -> {}); + Object row = objectInspector.create(); + List fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs()); + while (stream(valuesByField).allMatch(Iterator::hasNext)) { + for (int field = 0; field < fields.size(); field++) { + Object value = valuesByField[field].next(); + objectInspector.setStructFieldData(row, fields.get(field), value); + } + ParquetHiveSerDe serde = new ParquetHiveSerDe(); + serde.initialize(jobConf, tableProperties, null); + Writable record = serde.serialize(row, objectInspector); + recordWriter.write(record); + } + recordWriter.close(false); + } + + private static Properties createTableProperties(List columnNames, List objectInspectors) + { + Properties orderTableProperties = new Properties(); + orderTableProperties.setProperty("columns", Joiner.on(',').join(columnNames)); + orderTableProperties.setProperty("columns.types", Joiner.on(',').join(transform(objectInspectors, ObjectInspector::getTypeName))); + return orderTableProperties; + } + + private static class TempFile + implements Closeable + { + private final File file; + + public TempFile(String prefix, String suffix) + { + try { + file = File.createTempFile(prefix, suffix); + verify(file.delete()); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + public File getFile() + { + return file; + } + + @Override + public void close() + { + if (!file.delete()) { + verify(!file.exists()); + } + } + } + + private static Iterator[] getIterators(Iterable[] values) + { + return stream(values) + .map(Iterable::iterator) + .toArray(Iterator[]::new); + } + + private Iterable[] transformToNulls(Iterable[] values) + { + return stream(values) + .map(v -> transform(v, constant(null))) + .toArray(Iterable[]::new); + } + + private static Iterable[] reverse(Iterable[] iterables) + { + return stream(iterables) + .map(ImmutableList::copyOf) + .map(Lists::reverse) + .toArray(Iterable[]::new); + } + + private static Iterable[] insertNullEvery(int n, Iterable[] iterables) + { + return stream(iterables) + .map(itr -> insertNullEvery(n, itr)) + .toArray(Iterable[]::new); + } + + static Iterable insertNullEvery(int n, Iterable iterable) + { + return () -> new AbstractIterator() + { + private final Iterator delegate = iterable.iterator(); + private int position; + + @Override + protected T computeNext() + { + position++; + if (position > n) { + position = 0; + return null; + } + + if (!delegate.hasNext()) { + return endOfData(); + } + + return delegate.next(); + } + }; + } + + private static Object decodeObject(Type type, Block block, int position) + { + if (block.isNull(position)) { + return null; + } + + return type.getObjectValue(SESSION, block, position); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestFullParquetReader.java 
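ParquetTester above always writes the test data with the Hive Parquet writer and re-reads the file through the connector's page source, asserting value-for-value equality. As a quick orientation, here is a minimal, hypothetical caller (not part of this patch; the class name and values are illustrative) that exercises the quick tester on a single varchar column:

// Hypothetical example, not part of this patch: exercises ParquetTester.quickParquetTester().
package io.prestosql.plugin.hive.parquet;

import com.google.common.collect.ImmutableList;
import org.testng.annotations.Test;

import static io.prestosql.spi.type.VarcharType.VARCHAR;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector;

public class TestParquetVarcharRoundTrip
{
    private final ParquetTester tester = ParquetTester.quickParquetTester();

    @Test
    public void testVarcharRoundTrip()
            throws Exception
    {
        // write the values with the Hive Parquet writer (GZIP, PARQUET_1_0 in the quick tester),
        // then assert the connector's Parquet reader returns the same values for one varchar column
        tester.testRoundTrip(javaStringObjectInspector, ImmutableList.of("alice", "bob", "carol"), VARCHAR);
    }
}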
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestFullParquetReader.java new file mode 100644 index 00000000..124c43de --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestFullParquetReader.java @@ -0,0 +1,26 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.parquet; + +import org.testng.annotations.Test; + +@Test(groups = "ci") +public class TestFullParquetReader + extends AbstractTestParquetReader +{ + public TestFullParquetReader() + { + super(ParquetTester.fullParquetTester()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetPageSourceFactory.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetPageSourceFactory.java new file mode 100644 index 00000000..20fd16af --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetPageSourceFactory.java @@ -0,0 +1,87 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet; + +import com.google.common.collect.ImmutableSet; +import io.prestosql.plugin.hive.FileFormatDataSourceStats; +import io.prestosql.plugin.hive.HdfsConfigurationInitializer; +import io.prestosql.plugin.hive.HdfsEnvironment; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveHdfsConfiguration; +import io.prestosql.plugin.hive.authentication.NoHdfsAuthentication; +import io.prestosql.spi.connector.ConnectorPageSource; +import io.prestosql.spi.type.testing.TestingTypeManager; +import org.apache.hadoop.conf.Configuration; +import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.Optional; +import java.util.Properties; + +import static io.prestosql.plugin.hive.HiveStorageFormat.PARQUET; +import static io.prestosql.plugin.hive.HiveUtil.shouldUseRecordReaderFromInputFormat; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_OUTPUT_FORMAT; +import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_SERDE; +import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestParquetPageSourceFactory +{ + private static final String PARQUET_HIVE_SERDE = "parquet.hive.serde.ParquetHiveSerDe"; + + private ParquetPageSourceFactory parquetPageSourceFactory; + + @BeforeClass + public void setUp() + { + HiveHdfsConfiguration hiveHdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(new HiveConfig(), ImmutableSet.of()), ImmutableSet.of()); + HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hiveHdfsConfiguration, new HiveConfig(), new NoHdfsAuthentication()); + parquetPageSourceFactory = new ParquetPageSourceFactory(new TestingTypeManager(), hdfsEnvironment, new FileFormatDataSourceStats(), new HiveConfig()); + } + + @AfterClass(alwaysRun = true) + public void cleanUp() + { + parquetPageSourceFactory = null; + } + + @Test + public void testCreatePageSourceEmptyWithoutParquetSerDe() + { + Properties schema = new Properties(); + schema.setProperty(META_TABLE_SERDE, PARQUET_HIVE_SERDE); + schema.setProperty(SERIALIZATION_LIB, ""); + schema.setProperty(FILE_INPUT_FORMAT, ""); + schema.setProperty(FILE_OUTPUT_FORMAT, ""); + Optional optionalPageSource = parquetPageSourceFactory.createPageSource(new Configuration(), null, null, 0L, 0L, 0L, schema, null, null, Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), null, false, -1L); + assertFalse(optionalPageSource.isPresent()); + } + + @Test + public void testCreatePageSourceEmptyWithParquetSerDeAndAnnotation() + { + Properties schema = new Properties(); + schema.setProperty(META_TABLE_SERDE, PARQUET_HIVE_SERDE); + schema.setProperty(SERIALIZATION_LIB, PARQUET.getSerDe()); + schema.setProperty(FILE_INPUT_FORMAT, HoodieParquetRealtimeInputFormat.class.getName()); + schema.setProperty(FILE_OUTPUT_FORMAT, ""); + Optional optionalPageSource = parquetPageSourceFactory.createPageSource(new Configuration(), null, null, 0L, 0L, 0L, schema, null, null, Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), null, false, -1L); + assertTrue(shouldUseRecordReaderFromInputFormat(new Configuration(), schema)); + 
assertFalse(optionalPageSource.isPresent()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetReader.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetReader.java new file mode 100644 index 00000000..cd68552b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/TestParquetReader.java @@ -0,0 +1,26 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.parquet; + +import org.testng.annotations.Test; + +@Test +public class TestParquetReader + extends AbstractTestParquetReader +{ + public TestParquetReader() + { + super(ParquetTester.quickParquetTester()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/predicate/TestParquetPredicateUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/predicate/TestParquetPredicateUtils.java new file mode 100644 index 00000000..dcec01eb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/predicate/TestParquetPredicateUtils.java @@ -0,0 +1,157 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet.predicate; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; +import io.prestosql.parquet.RichColumnDescriptor; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveType; +import io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.MapType; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.StandardTypes; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.prestosql.parquet.ParquetTypeUtils.getDescriptors; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.spi.block.MethodHandleUtil.methodHandle; +import static io.prestosql.spi.predicate.TupleDomain.withColumnDomains; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.TypeSignature.parseTypeSignature; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.apache.parquet.schema.Type.Repetition.REPEATED; +import static org.apache.parquet.schema.Type.Repetition.REQUIRED; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class TestParquetPredicateUtils +{ + @Test + public void testParquetTupleDomainPrimitiveArray() + { + HiveColumnHandle columnHandle = new HiveColumnHandle("my_array", HiveType.valueOf("array"), parseTypeSignature(StandardTypes.ARRAY), 0, REGULAR, Optional.empty()); + TupleDomain domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(INTEGER)))); + + MessageType fileSchema = new MessageType("hive_schema", + new GroupType(OPTIONAL, "my_array", + new GroupType(REPEATED, "bag", new PrimitiveType(OPTIONAL, INT32, "array_element")))); + + Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain tupleDomain = ParquetPageSourceFactory.getParquetTupleDomain(descriptorsByPath, domain); + assertTrue(tupleDomain.getDomains().get().isEmpty()); + } + + @Test + public void testParquetTupleDomainStructArray() + { + HiveColumnHandle columnHandle = new HiveColumnHandle("my_array_struct", HiveType.valueOf("array>"), parseTypeSignature(StandardTypes.ARRAY), 0, REGULAR, Optional.empty()); + RowType.Field rowField = new RowType.Field(Optional.of("a"), INTEGER); + RowType rowType = RowType.from(ImmutableList.of(rowField)); + TupleDomain domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(rowType)))); + + MessageType fileSchema = new MessageType("hive_schema", + new GroupType(OPTIONAL, "my_array_struct", + new GroupType(REPEATED, "bag", + new GroupType(OPTIONAL, "array_element", new PrimitiveType(OPTIONAL, INT32, "a"))))); + + Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain 
tupleDomain = ParquetPageSourceFactory.getParquetTupleDomain(descriptorsByPath, domain); + assertTrue(tupleDomain.getDomains().get().isEmpty()); + } + + @Test + public void testParquetTupleDomainPrimitive() + { + HiveColumnHandle columnHandle = new HiveColumnHandle("my_primitive", HiveType.valueOf("bigint"), parseTypeSignature(StandardTypes.BIGINT), 0, REGULAR, Optional.empty()); + Domain singleValueDomain = Domain.singleValue(BIGINT, 123L); + TupleDomain domain = withColumnDomains(ImmutableMap.of(columnHandle, singleValueDomain)); + + MessageType fileSchema = new MessageType("hive_schema", new PrimitiveType(OPTIONAL, INT64, "my_primitive")); + + Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain tupleDomain = ParquetPageSourceFactory.getParquetTupleDomain(descriptorsByPath, domain); + + assertEquals(tupleDomain.getDomains().get().size(), 1); + ColumnDescriptor descriptor = tupleDomain.getDomains().get().keySet().iterator().next(); + assertEquals(descriptor.getPath().length, 1); + assertEquals(descriptor.getPath()[0], "my_primitive"); + + Domain predicateDomain = Iterables.getOnlyElement(tupleDomain.getDomains().get().values()); + assertEquals(predicateDomain, singleValueDomain); + } + + @Test + public void testParquetTupleDomainStruct() + { + HiveColumnHandle columnHandle = new HiveColumnHandle("my_struct", HiveType.valueOf("struct"), parseTypeSignature(StandardTypes.ROW), 0, REGULAR, Optional.empty()); + RowType.Field rowField = new RowType.Field(Optional.of("my_struct"), INTEGER); + RowType rowType = RowType.from(ImmutableList.of(rowField)); + TupleDomain domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(rowType))); + + MessageType fileSchema = new MessageType("hive_schema", + new GroupType(OPTIONAL, "my_struct", + new PrimitiveType(OPTIONAL, INT32, "a"), + new PrimitiveType(OPTIONAL, INT32, "b"))); + Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain tupleDomain = ParquetPageSourceFactory.getParquetTupleDomain(descriptorsByPath, domain); + assertTrue(tupleDomain.getDomains().get().isEmpty()); + } + + @Test + public void testParquetTupleDomainMap() + { + HiveColumnHandle columnHandle = new HiveColumnHandle("my_map", HiveType.valueOf("map"), parseTypeSignature(StandardTypes.MAP), 0, REGULAR, Optional.empty()); + + MapType mapType = new MapType( + INTEGER, + INTEGER, + methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"), + methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"), + methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"), + methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException")); + + TupleDomain domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(mapType))); + + MessageType fileSchema = new MessageType("hive_schema", + new GroupType(OPTIONAL, "my_map", + new GroupType(REPEATED, "map", + new PrimitiveType(REQUIRED, INT32, "key"), + new PrimitiveType(OPTIONAL, INT32, "value")))); + + Map, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain tupleDomain = ParquetPageSourceFactory.getParquetTupleDomain(descriptorsByPath, domain); + assertTrue(tupleDomain.getDomains().get().isEmpty()); + } + + public static void throwUnsupportedOperationException() + { + throw new UnsupportedOperationException(); + } +} diff --git 
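Read together, the tests above show that getParquetTupleDomain keeps domains only for primitive leaf columns and prunes anything bound to an array, struct, or map column. The sketch below is a hypothetical combined case (not part of this patch; column names and values are invented), assembled from the same APIs the tests use:

// Hypothetical combined example, not part of this patch.
package io.prestosql.plugin.hive.parquet.predicate;

import com.google.common.collect.ImmutableMap;
import io.prestosql.parquet.RichColumnDescriptor;
import io.prestosql.plugin.hive.HiveColumnHandle;
import io.prestosql.plugin.hive.HiveType;
import io.prestosql.plugin.hive.parquet.ParquetPageSourceFactory;
import io.prestosql.spi.predicate.Domain;
import io.prestosql.spi.predicate.TupleDomain;
import io.prestosql.spi.type.ArrayType;
import io.prestosql.spi.type.StandardTypes;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.testng.annotations.Test;

import java.util.List;
import java.util.Map;
import java.util.Optional;

import static io.prestosql.parquet.ParquetTypeUtils.getDescriptors;
import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR;
import static io.prestosql.spi.predicate.TupleDomain.withColumnDomains;
import static io.prestosql.spi.type.BigintType.BIGINT;
import static io.prestosql.spi.type.IntegerType.INTEGER;
import static io.prestosql.spi.type.TypeSignature.parseTypeSignature;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
import static org.testng.Assert.assertEquals;

public class TestParquetTupleDomainMixedColumns
{
    @Test
    public void testComplexDomainIsPrunedPrimitiveIsKept()
    {
        // one primitive column ("id") and one legacy-layout array column ("tags")
        MessageType fileSchema = new MessageType("hive_schema",
                new PrimitiveType(OPTIONAL, INT64, "id"),
                new GroupType(OPTIONAL, "tags",
                        new GroupType(REPEATED, "bag", new PrimitiveType(OPTIONAL, INT32, "array_element"))));

        HiveColumnHandle idHandle = new HiveColumnHandle("id", HiveType.valueOf("bigint"), parseTypeSignature(StandardTypes.BIGINT), 0, REGULAR, Optional.empty());
        HiveColumnHandle tagsHandle = new HiveColumnHandle("tags", HiveType.valueOf("array<int>"), parseTypeSignature(StandardTypes.ARRAY), 1, REGULAR, Optional.empty());

        TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(
                idHandle, Domain.singleValue(BIGINT, 42L),
                tagsHandle, Domain.notNull(new ArrayType(INTEGER))));

        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
        TupleDomain<ColumnDescriptor> tupleDomain = ParquetPageSourceFactory.getParquetTupleDomain(descriptorsByPath, domain);

        // only the predicate on "id" survives; the domain on the array column is dropped
        assertEquals(tupleDomain.getDomains().get().size(), 1);
        assertEquals(tupleDomain.getDomains().get().keySet().iterator().next().getPath()[0], "id");
    }
}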
a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/MapKeyValuesSchemaConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/MapKeyValuesSchemaConverter.java new file mode 100644 index 00000000..5b465307 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/MapKeyValuesSchemaConverter.java @@ -0,0 +1,218 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.parquet.write; + +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.schema.Types; + +import java.util.List; +import java.util.Locale; + +import static org.apache.parquet.schema.OriginalType.MAP_KEY_VALUE; + +/** + * This class is copied from org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter + * and modified to test maps where MAP_KEY_VALUE is incorrectly used in place of MAP + * Backward-compatibility rules described in spec https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + */ +public class MapKeyValuesSchemaConverter +{ + private MapKeyValuesSchemaConverter() + { + } + + public static MessageType convert(final List columnNames, final List columnTypes) + { + return new MessageType("hive_schema", convertTypes(columnNames, columnTypes)); + } + + private static Type[] convertTypes(final List columnNames, final List columnTypes) + { + if (columnNames.size() != columnTypes.size()) { + throw new IllegalStateException("Mismatched Hive columns and types. Hive columns names" + + " found : " + columnNames + " . 
And Hive types found : " + columnTypes); + } + final Type[] types = new Type[columnNames.size()]; + for (int i = 0; i < columnNames.size(); ++i) { + types[i] = convertType(columnNames.get(i), columnTypes.get(i)); + } + return types; + } + + private static Type convertType(final String name, final TypeInfo typeInfo) + { + return convertType(name, typeInfo, Repetition.OPTIONAL); + } + + private static Type convertType(final String name, final TypeInfo typeInfo, + final Repetition repetition) + { + if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { + if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) + .named(name); + } + else if (typeInfo.equals(TypeInfoFactory.intTypeInfo) || + typeInfo.equals(TypeInfoFactory.shortTypeInfo) || + typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { + return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { + return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { + throw new UnsupportedOperationException("Void type not implemented"); + } + else if (typeInfo.getTypeName().toLowerCase(Locale.ENGLISH).startsWith( + serdeConstants.CHAR_TYPE_NAME)) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + else if (typeInfo.getTypeName().toLowerCase(Locale.ENGLISH).startsWith( + serdeConstants.VARCHAR_TYPE_NAME)) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + else if (typeInfo instanceof DecimalTypeInfo) { + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; + int prec = decimalTypeInfo.precision(); + int scale = decimalTypeInfo.scale(); + int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; + return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) { + throw new UnsupportedOperationException("Unknown type not implemented"); + } + else { + throw new IllegalArgumentException("Unknown type: " + typeInfo); + } + } + else if (typeInfo.getCategory().equals(Category.LIST)) { + return convertArrayType(name, (ListTypeInfo) typeInfo); + } + else if (typeInfo.getCategory().equals(Category.STRUCT)) { + return convertStructType(name, (StructTypeInfo) typeInfo); + } + else if (typeInfo.getCategory().equals(Category.MAP)) { + return convertMapType(name, (MapTypeInfo) typeInfo); + } + else if 
(typeInfo.getCategory().equals(Category.UNION)) { + throw new UnsupportedOperationException("Union type not implemented"); + } + else { + throw new IllegalArgumentException("Unknown type: " + typeInfo); + } + } + + // An optional group containing a repeated anonymous group "bag", containing + // 1 anonymous element "array_element" + private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) + { + final TypeInfo subType = typeInfo.getListElementTypeInfo(); + return listWrapper(name, OriginalType.LIST, new GroupType(Repetition.REPEATED, + ParquetHiveSerDe.ARRAY.toString(), convertType("array_element", subType))); + } + + // An optional group containing multiple elements + private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo) + { + final List columnNames = typeInfo.getAllStructFieldNames(); + final List columnTypes = typeInfo.getAllStructFieldTypeInfos(); + return new GroupType(Repetition.OPTIONAL, name, convertTypes(columnNames, columnTypes)); + } + + // An optional group containing a repeated anonymous group "map", containing + // 2 elements: "key", "value" + private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo) + { + final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), + typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED); + final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), + typeInfo.getMapValueTypeInfo()); + return mapType(Repetition.OPTIONAL, name, "map", keyType, valueType); + } + + public static GroupType mapType(Repetition repetition, String alias, String mapAlias, Type keyType, Type valueType) + { + //support projection only on key of a map + if (valueType == null) { + return listWrapper( + repetition, + alias, + MAP_KEY_VALUE, + new GroupType( + Repetition.REPEATED, + mapAlias, + keyType)); + } + else { + if (!valueType.getName().equals("value")) { + throw new RuntimeException(valueType.getName() + " should be value"); + } + return listWrapper( + repetition, + alias, + MAP_KEY_VALUE, + new GroupType( + Repetition.REPEATED, + mapAlias, + keyType, + valueType)); + } + } + + private static GroupType listWrapper(Repetition repetition, String alias, OriginalType originalType, Type nested) + { + if (!nested.isRepetition(Repetition.REPEATED)) { + throw new IllegalArgumentException("Nested type should be repeated: " + nested); + } + return new GroupType(repetition, alias, originalType, nested); + } + + private static GroupType listWrapper(final String name, final OriginalType originalType, + final GroupType groupType) + { + return new GroupType(Repetition.OPTIONAL, name, originalType, groupType); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArrayMapKeyValuesSchemaConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArrayMapKeyValuesSchemaConverter.java new file mode 100644 index 00000000..9f2daf8f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArrayMapKeyValuesSchemaConverter.java @@ -0,0 +1,232 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
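MapKeyValuesSchemaConverter exists so the reader can be tested against the legacy map layout in which the outer group is annotated MAP_KEY_VALUE instead of MAP. A small, hypothetical driver (not part of this patch; the column name is invented) makes the emitted shape concrete:

// Hypothetical driver, not part of this patch: prints the legacy map layout produced above.
package io.prestosql.plugin.hive.parquet.write;

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.parquet.schema.MessageType;

import java.util.List;

import static java.util.Collections.singletonList;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfosFromTypeString;

public final class MapKeyValuesSchemaExample
{
    private MapKeyValuesSchemaExample() {}

    public static void main(String[] args)
    {
        List<TypeInfo> typeInfos = getTypeInfosFromTypeString("map<string,int>");
        MessageType schema = MapKeyValuesSchemaConverter.convert(singletonList("my_map"), typeInfos);

        // Expected shape (legacy writers annotate the group MAP_KEY_VALUE, not MAP):
        // message hive_schema {
        //   optional group my_map (MAP_KEY_VALUE) {
        //     repeated group map {
        //       required binary key (UTF8);
        //       optional int32 value;
        //     }
        //   }
        // }
        System.out.println(schema);
    }
}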
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.parquet.write; + +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.schema.Types; + +import java.util.List; +import java.util.Locale; + +import static org.apache.parquet.schema.OriginalType.MAP_KEY_VALUE; + +/** + * This class is copied from org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter + * and modified to test Array Schema without wrapping anonymous group "bag". + * Additionally, there is a schema modification in maps where MAP_KEY_VALUE is incorrectly used in place of MAP + * Backward-compatibility rules described in spec https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + */ +public class SingleLevelArrayMapKeyValuesSchemaConverter +{ + private SingleLevelArrayMapKeyValuesSchemaConverter() + { + } + + public static MessageType convert(final List columnNames, final List columnTypes) + { + return new MessageType("hive_schema", convertTypes(columnNames, columnTypes)); + } + + private static Type[] convertTypes(final List columnNames, final List columnTypes) + { + if (columnNames.size() != columnTypes.size()) { + throw new IllegalStateException("Mismatched Hive columns and types. Hive columns names" + + " found : " + columnNames + " . 
And Hive types found : " + columnTypes); + } + final Type[] types = new Type[columnNames.size()]; + for (int i = 0; i < columnNames.size(); ++i) { + types[i] = convertType(columnNames.get(i), columnTypes.get(i)); + } + return types; + } + + private static Type convertType(final String name, final TypeInfo typeInfo) + { + return convertType(name, typeInfo, Repetition.OPTIONAL); + } + + private static Type convertType(final String name, final TypeInfo typeInfo, + final Repetition repetition) + { + if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { + if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) + .named(name); + } + else if (typeInfo.equals(TypeInfoFactory.intTypeInfo) || + typeInfo.equals(TypeInfoFactory.shortTypeInfo) || + typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { + return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { + return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { + throw new UnsupportedOperationException("Void type not implemented"); + } + else if (typeInfo.getTypeName().toLowerCase(Locale.ENGLISH).startsWith( + serdeConstants.CHAR_TYPE_NAME)) { + if (repetition == Repetition.OPTIONAL) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + else { + return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + } + else if (typeInfo.getTypeName().toLowerCase(Locale.ENGLISH).startsWith( + serdeConstants.VARCHAR_TYPE_NAME)) { + if (repetition == Repetition.OPTIONAL) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + else { + return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + } + else if (typeInfo instanceof DecimalTypeInfo) { + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; + int prec = decimalTypeInfo.precision(); + int scale = decimalTypeInfo.scale(); + int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; + if (repetition == Repetition.OPTIONAL) { + return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); + } + else { + return Types.repeated(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); + } + } + else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) { + throw new UnsupportedOperationException("Unknown type not implemented"); + 
} + else { + throw new IllegalArgumentException("Unknown type: " + typeInfo); + } + } + else if (typeInfo.getCategory().equals(Category.LIST)) { + return convertArrayType(name, (ListTypeInfo) typeInfo, repetition); + } + else if (typeInfo.getCategory().equals(Category.STRUCT)) { + return convertStructType(name, (StructTypeInfo) typeInfo, repetition); + } + else if (typeInfo.getCategory().equals(Category.MAP)) { + return convertMapType(name, (MapTypeInfo) typeInfo, repetition); + } + else if (typeInfo.getCategory().equals(Category.UNION)) { + throw new UnsupportedOperationException("Union type not implemented"); + } + else { + throw new IllegalArgumentException("Unknown type: " + typeInfo); + } + } + + // 1 anonymous element "array_element" + private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo, final Repetition repetition) + { + final TypeInfo subType = typeInfo.getListElementTypeInfo(); + return listWrapper(name, OriginalType.LIST, convertType("array_element", subType, Repetition.REPEATED), repetition); + } + + // An optional group containing multiple elements + private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo, final Repetition repetition) + { + final List columnNames = typeInfo.getAllStructFieldNames(); + final List columnTypes = typeInfo.getAllStructFieldTypeInfos(); + return new GroupType(repetition, name, convertTypes(columnNames, columnTypes)); + } + + // An optional group containing a repeated anonymous group "map", containing + // 2 elements: "key", "value" + private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo, final Repetition repetition) + { + final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), + typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED); + final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), + typeInfo.getMapValueTypeInfo()); + return mapType(repetition, name, "map", keyType, valueType); + } + + public static GroupType mapType(Repetition repetition, String alias, String mapAlias, Type keyType, Type valueType) + { + //support projection only on key of a map + if (valueType == null) { + return listWrapper( + repetition, + alias, + MAP_KEY_VALUE, + new GroupType( + Repetition.REPEATED, + mapAlias, + keyType)); + } + else { + if (!valueType.getName().equals("value")) { + throw new RuntimeException(valueType.getName() + " should be value"); + } + return listWrapper( + repetition, + alias, + MAP_KEY_VALUE, + new GroupType( + Repetition.REPEATED, + mapAlias, + keyType, + valueType)); + } + } + + private static GroupType listWrapper(Repetition repetition, String alias, OriginalType originalType, Type nested) + { + if (!nested.isRepetition(Repetition.REPEATED)) { + throw new IllegalArgumentException("Nested type should be repeated: " + nested); + } + return new GroupType(repetition, alias, originalType, nested); + } + + private static GroupType listWrapper(final String name, final OriginalType originalType, + final Type elementType, final Repetition repetition) + { + return new GroupType(repetition, name, originalType, elementType); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArraySchemaConverter.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArraySchemaConverter.java new file mode 100644 index 00000000..2df25ec5 --- /dev/null +++ 
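SingleLevelArrayMapKeyValuesSchemaConverter combines that legacy map annotation with the single-level list layout, in which the element itself is repeated instead of being wrapped in the usual repeated "bag" group. A similar hypothetical driver (not part of this patch; the column name is invented) shows the resulting array shape:

// Hypothetical driver, not part of this patch: prints the single-level array layout produced above.
package io.prestosql.plugin.hive.parquet.write;

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.parquet.schema.MessageType;

import java.util.List;

import static java.util.Collections.singletonList;
import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfosFromTypeString;

public final class SingleLevelArraySchemaExample
{
    private SingleLevelArraySchemaExample() {}

    public static void main(String[] args)
    {
        List<TypeInfo> typeInfos = getTypeInfosFromTypeString("array<int>");
        MessageType schema = SingleLevelArrayMapKeyValuesSchemaConverter.convert(singletonList("my_array"), typeInfos);

        // Expected shape (no intermediate "bag" group; the element itself is repeated):
        // message hive_schema {
        //   optional group my_array (LIST) {
        //     repeated int32 array_element;
        //   }
        // }
        System.out.println(schema);
    }
}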
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/SingleLevelArraySchemaConverter.java @@ -0,0 +1,193 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.parquet.write; + +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.parquet.schema.ConversionPatterns; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.schema.Types; + +import java.util.List; +import java.util.Locale; + +/** + * This class is copied from org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter + * and modified to test Array Schema without wrapping anonymous group "bag". + * Backward-compatibility rules described in spec https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + */ +public class SingleLevelArraySchemaConverter +{ + private SingleLevelArraySchemaConverter() + { + } + + public static MessageType convert(final List columnNames, final List columnTypes) + { + return new MessageType("hive_schema", convertTypes(columnNames, columnTypes)); + } + + private static Type[] convertTypes(final List columnNames, final List columnTypes) + { + if (columnNames.size() != columnTypes.size()) { + throw new IllegalStateException("Mismatched Hive columns and types. Hive columns names" + + " found : " + columnNames + " . 
And Hive types found : " + columnTypes); + } + final Type[] types = new Type[columnNames.size()]; + for (int i = 0; i < columnNames.size(); ++i) { + types[i] = convertType(columnNames.get(i), columnTypes.get(i)); + } + return types; + } + + private static Type convertType(final String name, final TypeInfo typeInfo) + { + return convertType(name, typeInfo, Repetition.OPTIONAL); + } + + private static Type convertType(final String name, final TypeInfo typeInfo, + final Repetition repetition) + { + if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { + if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) + .named(name); + } + else if (typeInfo.equals(TypeInfoFactory.intTypeInfo) || + typeInfo.equals(TypeInfoFactory.shortTypeInfo) || + typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { + return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { + return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { + return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { + throw new UnsupportedOperationException("Void type not implemented"); + } + else if (typeInfo.getTypeName().toLowerCase(Locale.ENGLISH).startsWith( + serdeConstants.CHAR_TYPE_NAME)) { + if (repetition == Repetition.OPTIONAL) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + else { + return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + } + else if (typeInfo.getTypeName().toLowerCase(Locale.ENGLISH).startsWith( + serdeConstants.VARCHAR_TYPE_NAME)) { + if (repetition == Repetition.OPTIONAL) { + return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + else { + return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); + } + } + else if (typeInfo instanceof DecimalTypeInfo) { + DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo; + int prec = decimalTypeInfo.precision(); + int scale = decimalTypeInfo.scale(); + int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; + if (repetition == Repetition.OPTIONAL) { + return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); + } + else { + return Types.repeated(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); + } + } + else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) { + return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name); + } + else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) { + throw new UnsupportedOperationException("Unknown type not implemented"); + 
} + else { + throw new IllegalArgumentException("Unknown type: " + typeInfo); + } + } + else if (typeInfo.getCategory().equals(Category.LIST)) { + return convertArrayType(name, (ListTypeInfo) typeInfo, repetition); + } + else if (typeInfo.getCategory().equals(Category.STRUCT)) { + return convertStructType(name, (StructTypeInfo) typeInfo, repetition); + } + else if (typeInfo.getCategory().equals(Category.MAP)) { + return convertMapType(name, (MapTypeInfo) typeInfo, repetition); + } + else if (typeInfo.getCategory().equals(Category.UNION)) { + throw new UnsupportedOperationException("Union type not implemented"); + } + else { + throw new IllegalArgumentException("Unknown type: " + typeInfo); + } + } + + // 1 anonymous element "array_element" + private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo, final Repetition repetition) + { + final TypeInfo subType = typeInfo.getListElementTypeInfo(); + return listWrapper(name, OriginalType.LIST, convertType("array", subType, Repetition.REPEATED), repetition); + } + + // An optional group containing multiple elements + private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo, final Repetition repetition) + { + final List columnNames = typeInfo.getAllStructFieldNames(); + final List columnTypes = typeInfo.getAllStructFieldTypeInfos(); + return new GroupType(repetition, name, convertTypes(columnNames, columnTypes)); + } + + // An optional group containing a repeated anonymous group "map", containing + // 2 elements: "key", "value" + private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo, final Repetition repetition) + { + final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(), + typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED); + final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(), + typeInfo.getMapValueTypeInfo()); + return ConversionPatterns.mapType(repetition, name, keyType, valueType); + } + + private static GroupType listWrapper(final String name, final OriginalType originalType, + final Type elementType, final Repetition repetition) + { + return new GroupType(repetition, name, originalType, elementType); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriteSupport.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriteSupport.java new file mode 100644 index 00000000..87a87639 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriteSupport.java @@ -0,0 +1,59 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet.write; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport; +import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord; +import org.apache.parquet.hadoop.api.WriteSupport; +import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.MessageType; + +import java.util.HashMap; + +/** + * This class is copied from org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport + * and extended to support empty arrays and maps (HIVE-13632). + */ +class TestDataWritableWriteSupport + extends WriteSupport +{ + private TestDataWritableWriter writer; + private MessageType schema; + private boolean singleLevelArray; + + public TestDataWritableWriteSupport(boolean singleLevelArray) + { + this.singleLevelArray = singleLevelArray; + } + + @Override + public WriteContext init(final Configuration configuration) + { + schema = DataWritableWriteSupport.getSchema(configuration); + return new WriteContext(schema, new HashMap<>()); + } + + @Override + public void prepareForWrite(final RecordConsumer recordConsumer) + { + writer = new TestDataWritableWriter(recordConsumer, schema, singleLevelArray); + } + + @Override + public void write(final ParquetHiveRecord record) + { + writer.write(record); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriter.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriter.java new file mode 100644 index 00000000..1dfab863 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestDataWritableWriter.java @@ -0,0 +1,410 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.parquet.write; + +import io.airlift.log.Logger; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; +import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter; +import org.apache.hadoop.hive.serde2.io.ParquetHiveRecord; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.Type; + +import java.util.List; +import java.util.Map; + +/** + * This class is copied from org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter + * and extended to support empty arrays and maps (HIVE-13632). + * Additionally, there is a support for arrays without an inner element layer and + * support for maps where MAP_KEY_VALUE is incorrectly used in place of MAP + * for backward-compatibility rules testing (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) + */ +public class TestDataWritableWriter +{ + private static final Logger log = Logger.get(DataWritableWriter.class); + private final RecordConsumer recordConsumer; + private final GroupType schema; + private final boolean singleLevelArray; + + public TestDataWritableWriter(final RecordConsumer recordConsumer, final GroupType schema, boolean singleLevelArray) + { + this.recordConsumer = recordConsumer; + this.schema = schema; + this.singleLevelArray = singleLevelArray; + } + + /** + * It writes all record values to the Parquet RecordConsumer. + * + * @param record Contains the record that are going to be written. 
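+     * A null record is skipped: nothing is written and no message is started.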
+ */ + public void write(final ParquetHiveRecord record) + { + if (record != null) { + recordConsumer.startMessage(); + try { + writeGroupFields(record.getObject(), record.getObjectInspector(), schema); + } + catch (RuntimeException e) { + String errorMessage = "Parquet record is malformed: " + e.getMessage(); + log.error(errorMessage, e); + throw new RuntimeException(errorMessage, e); + } + recordConsumer.endMessage(); + } + } + + /** + * It writes all the fields contained inside a group to the RecordConsumer. + * + * @param value The list of values contained in the group. + * @param inspector The object inspector used to get the correct value type. + * @param type Type that contains information about the group schema. + */ + private void writeGroupFields(final Object value, final StructObjectInspector inspector, final GroupType type) + { + if (value != null) { + List fields = inspector.getAllStructFieldRefs(); + List fieldValuesList = inspector.getStructFieldsDataAsList(value); + + for (int i = 0; i < type.getFieldCount(); i++) { + Type fieldType = type.getType(i); + String fieldName = fieldType.getName(); + Object fieldValue = fieldValuesList.get(i); + + if (fieldValue != null) { + ObjectInspector fieldInspector = fields.get(i).getFieldObjectInspector(); + recordConsumer.startField(fieldName, i); + writeValue(fieldValue, fieldInspector, fieldType); + recordConsumer.endField(fieldName, i); + } + } + } + } + + /** + * It writes the field value to the Parquet RecordConsumer. It detects the field type, and calls + * the correct write function. + * + * @param value The writable object that contains the value. + * @param inspector The object inspector used to get the correct value type. + * @param type Type that contains information about the type schema. + */ + private void writeValue(final Object value, final ObjectInspector inspector, final Type type) + { + if (type.isPrimitive()) { + checkInspectorCategory(inspector, ObjectInspector.Category.PRIMITIVE); + writePrimitive(value, (PrimitiveObjectInspector) inspector); + } + else { + GroupType groupType = type.asGroupType(); + OriginalType originalType = type.getOriginalType(); + + if (originalType != null && originalType.equals(OriginalType.LIST)) { + checkInspectorCategory(inspector, ObjectInspector.Category.LIST); + if (singleLevelArray) { + writeSingleLevelArray(value, (ListObjectInspector) inspector, groupType); + } + else { + writeArray(value, (ListObjectInspector) inspector, groupType); + } + } + else if (originalType != null && (originalType.equals(OriginalType.MAP) || originalType.equals(OriginalType.MAP_KEY_VALUE))) { + checkInspectorCategory(inspector, ObjectInspector.Category.MAP); + writeMap(value, (MapObjectInspector) inspector, groupType); + } + else { + checkInspectorCategory(inspector, ObjectInspector.Category.STRUCT); + writeGroup(value, (StructObjectInspector) inspector, groupType); + } + } + } + + /** + * Checks that an inspector matches the category indicated as a parameter. 
+ * + * @param inspector The object inspector to check + * @param category The category to match + * @throws IllegalArgumentException if inspector does not match the category + */ + private void checkInspectorCategory(ObjectInspector inspector, ObjectInspector.Category category) + { + if (!inspector.getCategory().equals(category)) { + throw new IllegalArgumentException("Invalid data type: expected " + category + + " type, but found: " + inspector.getCategory()); + } + } + + /** + * It writes a group type and all its values to the Parquet RecordConsumer. + * This is used only for optional and required groups. + * + * @param value Object that contains the group values. + * @param inspector The object inspector used to get the correct value type. + * @param type Type that contains information about the group schema. + */ + private void writeGroup(final Object value, final StructObjectInspector inspector, final GroupType type) + { + recordConsumer.startGroup(); + writeGroupFields(value, inspector, type); + recordConsumer.endGroup(); + } + + /** + * It writes a list type and its array elements to the Parquet RecordConsumer. + * This is called when the original type (LIST) is detected by writeValue()/ + * This function assumes the following schema: + * optional group arrayCol (LIST) { + * repeated group array { + * optional TYPE array_element; + * } + * } + * + * @param value The object that contains the array values. + * @param inspector The object inspector used to get the correct value type. + * @param type Type that contains information about the group (LIST) schema. + */ + private void writeArray(final Object value, final ListObjectInspector inspector, final GroupType type) + { + // Get the internal array structure + GroupType repeatedType = type.getType(0).asGroupType(); + recordConsumer.startGroup(); + + List arrayValues = inspector.getList(value); + if (!arrayValues.isEmpty()) { + recordConsumer.startField(repeatedType.getName(), 0); + ObjectInspector elementInspector = inspector.getListElementObjectInspector(); + + Type elementType = repeatedType.getType(0); + String elementName = elementType.getName(); + + for (Object element : arrayValues) { + recordConsumer.startGroup(); + if (element != null) { + recordConsumer.startField(elementName, 0); + writeValue(element, elementInspector, elementType); + recordConsumer.endField(elementName, 0); + } + recordConsumer.endGroup(); + } + + recordConsumer.endField(repeatedType.getName(), 0); + } + recordConsumer.endGroup(); + } + + private void writeSingleLevelArray(final Object value, final ListObjectInspector inspector, final GroupType type) + { + // Get the internal array structure + Type elementType = type.getType(0); + + recordConsumer.startGroup(); + + List arrayValues = inspector.getList(value); + if (!arrayValues.isEmpty()) { + recordConsumer.startField(elementType.getName(), 0); + ObjectInspector elementInspector = inspector.getListElementObjectInspector(); + + for (Object element : arrayValues) { + if (element == null) { + throw new IllegalArgumentException("Array elements are requires in given schema definition"); + } + writeValue(element, elementInspector, elementType); + } + + recordConsumer.endField(elementType.getName(), 0); + } + recordConsumer.endGroup(); + } + + /** + * It writes a map type and its key-pair values to the Parquet RecordConsumer. + * This is called when the original type (MAP) is detected by writeValue(). 
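+     * Note that an empty map emits only the outer group, exercising the empty-map support noted in the class comment (HIVE-13632).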
+ * This function assumes the following schema: + * optional group mapCol (MAP) { + * repeated group map (MAP_KEY_VALUE) { + * required TYPE key; + * optional TYPE value; + * } + * } + * + * @param value The object that contains the map key-values. + * @param inspector The object inspector used to get the correct value type. + * @param type Type that contains information about the group (MAP) schema. + */ + private void writeMap(final Object value, final MapObjectInspector inspector, final GroupType type) + { + // Get the internal map structure (MAP_KEY_VALUE) + GroupType repeatedType = type.getType(0).asGroupType(); + + recordConsumer.startGroup(); + Map mapValues = inspector.getMap(value); + if (mapValues != null && mapValues.size() > 0) { + recordConsumer.startField(repeatedType.getName(), 0); + + Type keyType = repeatedType.getType(0); + String keyName = keyType.getName(); + ObjectInspector keyInspector = inspector.getMapKeyObjectInspector(); + + Type valuetype = repeatedType.getType(1); + String valueName = valuetype.getName(); + ObjectInspector valueInspector = inspector.getMapValueObjectInspector(); + + for (Map.Entry keyValue : mapValues.entrySet()) { + recordConsumer.startGroup(); + if (keyValue != null) { + // write key element + Object keyElement = keyValue.getKey(); + recordConsumer.startField(keyName, 0); + writeValue(keyElement, keyInspector, keyType); + recordConsumer.endField(keyName, 0); + + // write value element + Object valueElement = keyValue.getValue(); + if (valueElement != null) { + recordConsumer.startField(valueName, 1); + writeValue(valueElement, valueInspector, valuetype); + recordConsumer.endField(valueName, 1); + } + } + recordConsumer.endGroup(); + } + + recordConsumer.endField(repeatedType.getName(), 0); + } + recordConsumer.endGroup(); + } + + /** + * It writes the primitive value to the Parquet RecordConsumer. + * + * @param value The object that contains the primitive value. + * @param inspector The object inspector used to get the correct value type. 
+ */ + private void writePrimitive(final Object value, final PrimitiveObjectInspector inspector) + { + if (value == null) { + return; + } + + switch (inspector.getPrimitiveCategory()) { + case VOID: + return; + case DOUBLE: + recordConsumer.addDouble(((DoubleObjectInspector) inspector).get(value)); + break; + case BOOLEAN: + recordConsumer.addBoolean(((BooleanObjectInspector) inspector).get(value)); + break; + case FLOAT: + recordConsumer.addFloat(((FloatObjectInspector) inspector).get(value)); + break; + case BYTE: + recordConsumer.addInteger(((ByteObjectInspector) inspector).get(value)); + break; + case INT: + recordConsumer.addInteger(((IntObjectInspector) inspector).get(value)); + break; + case LONG: + recordConsumer.addLong(((LongObjectInspector) inspector).get(value)); + break; + case SHORT: + recordConsumer.addInteger(((ShortObjectInspector) inspector).get(value)); + break; + case STRING: + String v = ((StringObjectInspector) inspector).getPrimitiveJavaObject(value); + recordConsumer.addBinary(Binary.fromString(v)); + break; + case CHAR: + String vChar = ((HiveCharObjectInspector) inspector).getPrimitiveJavaObject(value).getStrippedValue(); + recordConsumer.addBinary(Binary.fromString(vChar)); + break; + case VARCHAR: + String vVarchar = ((HiveVarcharObjectInspector) inspector).getPrimitiveJavaObject(value).getValue(); + recordConsumer.addBinary(Binary.fromString(vVarchar)); + break; + case BINARY: + byte[] vBinary = ((BinaryObjectInspector) inspector).getPrimitiveJavaObject(value); + recordConsumer.addBinary(Binary.fromByteArray(vBinary)); + break; + case TIMESTAMP: + Timestamp ts = ((TimestampObjectInspector) inspector).getPrimitiveJavaObject(value); + recordConsumer.addBinary(NanoTimeUtils.getNanoTime(ts, false).toBinary()); + break; + case DECIMAL: + HiveDecimal vDecimal = ((HiveDecimal) inspector.getPrimitiveJavaObject(value)); + DecimalTypeInfo decTypeInfo = (DecimalTypeInfo) inspector.getTypeInfo(); + recordConsumer.addBinary(decimalToBinary(vDecimal, decTypeInfo)); + break; + case DATE: + Date vDate = ((DateObjectInspector) inspector).getPrimitiveJavaObject(value); + recordConsumer.addInteger(vDate.toEpochDay()); + break; + default: + throw new IllegalArgumentException("Unsupported primitive data type: " + inspector.getPrimitiveCategory()); + } + } + + private Binary decimalToBinary(final HiveDecimal hiveDecimal, final DecimalTypeInfo decimalTypeInfo) + { + int prec = decimalTypeInfo.precision(); + int scale = decimalTypeInfo.scale(); + byte[] decimalBytes = hiveDecimal.setScale(scale).unscaledValue().toByteArray(); + + // Estimated number of bytes needed. + int precToBytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; + if (precToBytes == decimalBytes.length) { + // No padding needed. + return Binary.fromByteArray(decimalBytes); + } + + byte[] tgt = new byte[precToBytes]; + if (hiveDecimal.signum() == -1) { + // For negative number, initializing bits to 1 + for (int i = 0; i < precToBytes; i++) { + tgt[i] |= 0xFF; + } + } + + System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length); // Padding leading zeroes/ones. 
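+        // tgt now holds the unscaled value right-aligned and sign-extended to the fixed byte width expected for this precision.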
+        return Binary.fromByteArray(tgt);
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestMapredParquetOutputFormat.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestMapredParquetOutputFormat.java
new file mode 100644
index 00000000..c9d515a6
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/parquet/write/TestMapredParquetOutputFormat.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.prestosql.plugin.hive.parquet.write;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.Progressable;
+import org.apache.parquet.hadoop.ParquetOutputFormat;
+import org.apache.parquet.schema.MessageType;
+
+import java.io.IOException;
+import java.util.Optional;
+import java.util.Properties;
+
+import static java.util.Objects.requireNonNull;
+
+/*
+ MapredParquetOutputFormat creates the Parquet schema from the column types,
+ which is not always what we want: in that case the schema always specifies
+ FIXED_LEN_BYTE_ARRAY as the backing type for decimals. But we also want to test
+ the cases where the backing type is INT32/INT64, which requires a custom
+ Parquet schema.
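+ When a schema is supplied, it is injected via DataWritableWriteSupport.setSchema(...) before the record writer is created (see getHiveRecordWriter below).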
+*/ +public class TestMapredParquetOutputFormat + extends MapredParquetOutputFormat +{ + private final Optional schema; + + public TestMapredParquetOutputFormat(Optional schema, boolean singleLevelArray) + { + super(new ParquetOutputFormat<>(new TestDataWritableWriteSupport(singleLevelArray))); + this.schema = requireNonNull(schema, "schema is null"); + } + + @Override + public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jobConf, + Path finalOutPath, + Class valueClass, + boolean isCompressed, + Properties tableProperties, + Progressable progress) + throws IOException + { + if (schema.isPresent()) { + DataWritableWriteSupport.setSchema(schema.get(), jobConf); + return getParquerRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress, tableProperties); + } + return super.getHiveRecordWriter(jobConf, finalOutPath, valueClass, isCompressed, tableProperties, progress); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveFilterPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveFilterPushdown.java new file mode 100644 index 00000000..9629eb13 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveFilterPushdown.java @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.rule; + +import io.prestosql.cost.ConnectorFilterStatsCalculatorService; +import io.prestosql.cost.FilterStatsCalculator; +import io.prestosql.cost.ScalarStatsCalculator; +import io.prestosql.cost.StatsNormalizer; +import io.prestosql.plugin.hive.HivePartitionManager; +import io.prestosql.plugin.hive.HiveTransactionManager; +import io.prestosql.spi.function.StandardFunctionResolution; +import io.prestosql.spi.plan.FilterNode; +import io.prestosql.spi.plan.FilterStatsCalculatorService; +import io.prestosql.spi.plan.PlanNode; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.RowExpressionService; +import io.prestosql.sql.TestingRowExpressionTranslator; +import io.prestosql.sql.relational.ConnectorRowExpressionService; +import io.prestosql.sql.relational.FunctionResolution; +import io.prestosql.sql.relational.RowExpressionDeterminismEvaluator; +import io.prestosql.sql.relational.RowExpressionDomainTranslator; +import org.testng.annotations.Test; + +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.COLUMN_INT; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.COLUMN_TYPE_MAP; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.ID_ALLOCATOR; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.OFFLOAD_METADATA; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.OFFLOAD_SESSION; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.SYMBOL_ALLOCATOR; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildFilterNode; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildTableScanNode; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.matchFilterOffload; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.simulationHiveTransactionManager; + +public class TestHiveFilterPushdown + extends TestHivePushdown +{ + private static final HiveFilterPushdown FILTER_OPTIMIZER = createOptimizer(); + private static final TestingRowExpressionTranslator TRANSLATOR = new TestingRowExpressionTranslator(OFFLOAD_METADATA); + + private static HiveFilterPushdown createOptimizer() + { + RowExpressionService expressionService = new ConnectorRowExpressionService(new RowExpressionDomainTranslator(OFFLOAD_METADATA), new RowExpressionDeterminismEvaluator(OFFLOAD_METADATA)); + HiveTransactionManager transactionManager = simulationHiveTransactionManager(); + StandardFunctionResolution resolution = new FunctionResolution(OFFLOAD_METADATA.getFunctionAndTypeManager()); + HivePartitionManager partitionManager = + new HivePartitionManager(OFFLOAD_METADATA.getFunctionAndTypeManager(), 1, false, 1); + ScalarStatsCalculator scalarStatsCalculator = new ScalarStatsCalculator(OFFLOAD_METADATA); + StatsNormalizer normalizer = new StatsNormalizer(); + FilterStatsCalculator statsCalculator = new FilterStatsCalculator(OFFLOAD_METADATA, scalarStatsCalculator, normalizer); + FilterStatsCalculatorService calculatorService = new ConnectorFilterStatsCalculatorService(statsCalculator); + HiveFilterPushdown optimizer = new HiveFilterPushdown(transactionManager, expressionService, + resolution, partitionManager, calculatorService, OFFLOAD_METADATA.getFunctionAndTypeManager()); + return optimizer; + } + + @Test + public void testFilterPushdown() + { + TableScanNode tableScanNode = buildTableScanNode(COLUMN_INT); + String predicate = String.format("%s < 1", COLUMN_INT.getColumnName()); + 
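// Translate the textual predicate into a RowExpression; the optimizer is expected to fold this filter into the table handle's offload expression. +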
RowExpression expression = TRANSLATOR.translate(predicate, SYMBOL_ALLOCATOR.getSymbols()); + FilterNode filterNode = buildFilterNode(tableScanNode, expression); + + PlanNode node = FILTER_OPTIMIZER.optimize(filterNode, OFFLOAD_SESSION, COLUMN_TYPE_MAP, SYMBOL_ALLOCATOR, ID_ALLOCATOR); + matchFilterOffload(node, expression); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveLimitPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveLimitPushdown.java new file mode 100644 index 00000000..c22d5210 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHiveLimitPushdown.java @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive.rule; + +import io.prestosql.spi.plan.LimitNode; +import io.prestosql.spi.plan.PlanNode; +import io.prestosql.spi.plan.TableScanNode; +import org.testng.annotations.Test; + +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.COLUMN_TYPE_MAP; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.ID_ALLOCATOR; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.OFFLOAD_SESSION; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.SYMBOL_ALLOCATOR; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildPartialLimitNode; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildTableScanNode; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.matchLimitOffload; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.simulationHiveTransactionManager; + +public class TestHiveLimitPushdown + extends TestHivePushdown +{ + @Test + public void testLimitPushdown() + { + int count = 5; + HiveLimitPushdown optimizer = new HiveLimitPushdown(simulationHiveTransactionManager()); + TableScanNode tableScanNode = buildTableScanNode(); + LimitNode limitNode = buildPartialLimitNode(tableScanNode, count); + PlanNode node = optimizer.optimize(limitNode, OFFLOAD_SESSION, COLUMN_TYPE_MAP, SYMBOL_ALLOCATOR, ID_ALLOCATOR); + matchLimitOffload(node, count); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePartialAggregationPushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePartialAggregationPushdown.java new file mode 100644 index 00000000..613c7a68 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePartialAggregationPushdown.java @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.rule; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.huawei.boostkit.omnidata.model.AggregationInfo; +import io.prestosql.plugin.hive.HiveMetadata; +import io.prestosql.plugin.hive.HiveMetadataFactory; +import io.prestosql.spi.function.OperatorType; +import io.prestosql.spi.function.StandardFunctionResolution; +import io.prestosql.spi.plan.AggregationNode; +import io.prestosql.spi.plan.Assignments; +import io.prestosql.spi.plan.PlanNode; +import io.prestosql.spi.plan.ProjectNode; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.ConstantExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.VariableReferenceExpression; +import io.prestosql.sql.relational.FunctionResolution; +import org.mockito.Mockito; +import org.testng.annotations.Test; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.COLUMN_INT; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.COLUMN_TYPE_MAP; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.ID_ALLOCATOR; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.OFFLOAD_METADATA; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.OFFLOAD_SESSION; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.SYMBOL_ALLOCATOR; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildAggregationNode; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildAssignments; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildProjectNode; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildTableScanNode; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.createAggregation; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.createOperationExpression; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.matchAggregatorOffload; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.matchProjection; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.simulationHiveMetadata; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.simulationHiveTransactionManager; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.IntegerType.INTEGER; + +public class TestHivePartialAggregationPushdown + extends TestHivePushdown +{ + private static final HivePartialAggregationPushdown AGGREGATION_OPTIMIZER = createOptimizer(); + + private static HivePartialAggregationPushdown createOptimizer() + { + HiveMetadataFactory hiveMetadataFactory = Mockito.mock(HiveMetadataFactory.class); + HiveMetadata hiveMetadata = simulationHiveMetadata(); + 
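// Stub the factory so the optimizer under test receives the simulated HiveMetadata rather than a metastore-backed instance. +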
Mockito.when(hiveMetadataFactory.get()).thenReturn(hiveMetadata); + + StandardFunctionResolution resolution = new FunctionResolution(OFFLOAD_METADATA.getFunctionAndTypeManager()); + HivePartialAggregationPushdown optimizer = new HivePartialAggregationPushdown(simulationHiveTransactionManager(), + OFFLOAD_METADATA.getFunctionAndTypeManager(), resolution, hiveMetadataFactory); + return optimizer; + } + + AggregationNode buildCountAggregationNode(PlanNode source) + { + // select count(x) from table group by x + VariableReferenceExpression expression = new VariableReferenceExpression(COLUMN_INT.getName(), COLUMN_TYPE_MAP.get(COLUMN_INT.getName())); + AggregationNode.Aggregation aggregation = createAggregation("count", BIGINT, ImmutableList.of(expression)); + Map aggregations = ImmutableMap.of(new Symbol(COLUMN_INT.getName()), aggregation); + AggregationNode.GroupingSetDescriptor groupingSets = + new AggregationNode.GroupingSetDescriptor(ImmutableList.of(new Symbol(COLUMN_INT.getName())), 1, Collections.emptySet()); + return buildAggregationNode(source, aggregations, groupingSets); + } + + @Test + public void testPartialAggregationPushdown() + { + // select count(x) from table group by x + TableScanNode tableScanNode = buildTableScanNode(COLUMN_INT); + AggregationNode aggregationNode = buildCountAggregationNode(tableScanNode); + + ImmutableMap.Builder aggregationsExpected = new ImmutableMap.Builder<>(); + for (Map.Entry entry : aggregationNode.getAggregations().entrySet()) { + AggregationInfo.AggregateFunction aggregateFunction = + new AggregationInfo.AggregateFunction(entry.getValue().getFunctionCall(), entry.getValue().isDistinct()); + aggregationsExpected.put(entry.getKey().getName(), aggregateFunction); + } + List groupingKeysExpected = + ImmutableList.of(new VariableReferenceExpression(COLUMN_INT.getName(), INTEGER)); + AggregationInfo aggregationInfoExpected = new AggregationInfo(aggregationsExpected.build(), groupingKeysExpected); + + PlanNode outputNode = AGGREGATION_OPTIMIZER.optimize(aggregationNode, OFFLOAD_SESSION, COLUMN_TYPE_MAP, SYMBOL_ALLOCATOR, ID_ALLOCATOR); + matchAggregatorOffload(outputNode, aggregationInfoExpected); + } + + @Test + public void testPartialAggregationAndProjectPushdown() + { + // select count(x + 5) from table group by x + TableScanNode tableScanNode = buildTableScanNode(COLUMN_INT); + + CallExpression callExpression = createOperationExpression(OperatorType.ADD, + new VariableReferenceExpression(COLUMN_INT.getName(), INTEGER), new ConstantExpression(5, INTEGER)); + List symbols = ImmutableList.of(new Symbol(COLUMN_INT.getName())); + List rowExpressions = ImmutableList.of(callExpression); + ProjectNode projectNode = buildProjectNode(tableScanNode, symbols, rowExpressions); + + AggregationNode aggregationNode = buildCountAggregationNode(projectNode); + + PlanNode output = AGGREGATION_OPTIMIZER.optimize(aggregationNode, OFFLOAD_SESSION, COLUMN_TYPE_MAP, SYMBOL_ALLOCATOR, ID_ALLOCATOR); + Assignments assignmentsExpected = buildAssignments(symbols, rowExpressions); + matchProjection(output, assignmentsExpected.getMap()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePlanOptimizerProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePlanOptimizerProvider.java new file mode 100644 index 00000000..5c86b050 --- /dev/null +++ 
b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePlanOptimizerProvider.java @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive.rule; + +import io.prestosql.cost.ConnectorFilterStatsCalculatorService; +import io.prestosql.cost.FilterStatsCalculator; +import io.prestosql.cost.ScalarStatsCalculator; +import io.prestosql.cost.StatsNormalizer; +import io.prestosql.plugin.hive.HiveMetadata; +import io.prestosql.plugin.hive.HiveMetadataFactory; +import io.prestosql.plugin.hive.HivePartitionManager; +import io.prestosql.plugin.hive.HiveTransactionManager; +import io.prestosql.spi.function.StandardFunctionResolution; +import io.prestosql.spi.plan.FilterStatsCalculatorService; +import io.prestosql.spi.relation.RowExpressionService; +import io.prestosql.sql.relational.ConnectorRowExpressionService; +import io.prestosql.sql.relational.FunctionResolution; +import io.prestosql.sql.relational.RowExpressionDeterminismEvaluator; +import io.prestosql.sql.relational.RowExpressionDomainTranslator; +import org.mockito.Mockito; +import org.testng.annotations.Test; + +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.OFFLOAD_METADATA; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.simulationHiveMetadata; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.simulationHiveTransactionManager; +import static org.testng.Assert.assertEquals; + +public class TestHivePlanOptimizerProvider + extends TestHivePushdown +{ + @Test + public void testProvider() + { + RowExpressionService expressionService = new ConnectorRowExpressionService(new RowExpressionDomainTranslator(OFFLOAD_METADATA), new RowExpressionDeterminismEvaluator(OFFLOAD_METADATA)); + HiveTransactionManager transactionManager = simulationHiveTransactionManager(); + StandardFunctionResolution resolution = new FunctionResolution(OFFLOAD_METADATA.getFunctionAndTypeManager()); + HivePartitionManager partitionManager = + new HivePartitionManager(OFFLOAD_METADATA.getFunctionAndTypeManager(), 1, false, 1); + ScalarStatsCalculator scalarStatsCalculator = new ScalarStatsCalculator(OFFLOAD_METADATA); + StatsNormalizer normalizer = new StatsNormalizer(); + FilterStatsCalculator statsCalculator = new FilterStatsCalculator(OFFLOAD_METADATA, scalarStatsCalculator, normalizer); + FilterStatsCalculatorService calculatorService = new ConnectorFilterStatsCalculatorService(statsCalculator); + + HiveMetadataFactory hiveMetadataFactory = Mockito.mock(HiveMetadataFactory.class); + HiveMetadata hiveMetadata = simulationHiveMetadata(); + Mockito.when(hiveMetadataFactory.get()).thenReturn(hiveMetadata); + + HivePlanOptimizerProvider hivePlanOptimizerProvider = new HivePlanOptimizerProvider(transactionManager, + expressionService, resolution, partitionManager, OFFLOAD_METADATA.getFunctionAndTypeManager(), + calculatorService, 
hiveMetadataFactory); + assertEquals(hivePlanOptimizerProvider.getLogicalPlanOptimizers().size(), 3); + assertEquals(hivePlanOptimizerProvider.getPhysicalPlanOptimizers().size(), 3); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdown.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdown.java new file mode 100644 index 00000000..11369864 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdown.java @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.prestosql.plugin.hive.rule; + +import io.prestosql.plugin.hive.omnidata.OmniDataNodeManager; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; + +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.simulationOmniDataConfig; +import static io.prestosql.plugin.hive.rule.TestHivePushdownUtil.unsimulationOmniDataConfig; + +public class TestHivePushdown +{ + @BeforeClass + public void load() + { + simulationOmniDataConfig(); + OmniDataNodeManager manager = new OmniDataNodeManager(); + HivePushdownUtil.setOmniDataNodeManager(manager); + } + + @AfterClass + public void unload() + { + unsimulationOmniDataConfig(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdownUtil.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdownUtil.java new file mode 100644 index 00000000..d5d472b3 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/rule/TestHivePushdownUtil.java @@ -0,0 +1,380 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.rule; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.huawei.boostkit.omnidata.model.AggregationInfo; +import io.prestosql.metadata.Metadata; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveMetadata; +import io.prestosql.plugin.hive.HiveOffloadExpression; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.HiveStorageFormat; +import io.prestosql.plugin.hive.HiveTableHandle; +import io.prestosql.plugin.hive.HiveTransactionManager; +import io.prestosql.plugin.hive.OrcFileWriterConfig; +import io.prestosql.plugin.hive.ParquetFileWriterConfig; +import io.prestosql.plugin.hive.omnidata.OmniDataNodeManager; +import io.prestosql.spi.connector.CatalogName; +import io.prestosql.spi.connector.ColumnHandle; +import io.prestosql.spi.connector.ColumnMetadata; +import io.prestosql.spi.connector.ConnectorSession; +import io.prestosql.spi.connector.ConnectorTableHandle; +import io.prestosql.spi.connector.ConnectorTableMetadata; +import io.prestosql.spi.connector.Constraint; +import io.prestosql.spi.function.FunctionHandle; +import io.prestosql.spi.function.OperatorType; +import io.prestosql.spi.metadata.TableHandle; +import io.prestosql.spi.plan.AggregationNode; +import io.prestosql.spi.plan.Assignments; +import io.prestosql.spi.plan.FilterNode; +import io.prestosql.spi.plan.LimitNode; +import io.prestosql.spi.plan.PlanNode; +import io.prestosql.spi.plan.PlanNodeIdAllocator; +import io.prestosql.spi.plan.ProjectNode; +import io.prestosql.spi.plan.Symbol; +import io.prestosql.spi.plan.TableScanNode; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.statistics.ColumnStatistics; +import io.prestosql.spi.statistics.DoubleRange; +import io.prestosql.spi.statistics.Estimate; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.type.Type; +import io.prestosql.sql.analyzer.TypeSignatureProvider; +import io.prestosql.sql.planner.PlanSymbolAllocator; +import io.prestosql.sql.planner.iterative.rule.test.PlanBuilder; +import io.prestosql.sql.tree.QualifiedName; +import io.prestosql.testing.TestingConnectorSession; +import io.prestosql.testing.TestingTransactionHandle; +import org.mockito.Matchers; +import org.mockito.Mockito; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import static io.prestosql.metadata.FunctionAndTypeManager.qualifyObjectName; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.plugin.hive.HiveTableProperties.STORAGE_FORMAT_PROPERTY; +import static io.prestosql.plugin.hive.HiveType.HIVE_BOOLEAN; +import static io.prestosql.plugin.hive.HiveType.HIVE_BYTE; +import static io.prestosql.plugin.hive.HiveType.HIVE_DATE; +import static io.prestosql.plugin.hive.HiveType.HIVE_DOUBLE; +import static io.prestosql.plugin.hive.HiveType.HIVE_INT; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.HiveType.HIVE_TIMESTAMP; +import static io.prestosql.plugin.hive.omnidata.OmniDataNodeManager.CONFIG_PROPERTY; +import static 
io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.TimestampType.TIMESTAMP; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static io.prestosql.sql.analyzer.TypeSignatureProvider.fromTypes; +import static io.prestosql.sql.relational.Expressions.call; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class TestHivePushdownUtil +{ + private static final int OFFLOAD_COLUMN_NUM = 1000; + private static final int DISTINICT_COLUMN_NUM = OFFLOAD_COLUMN_NUM / 10; + private static final String SIMULATION_FLIE_NAME = "config.simluation"; + protected static final ConnectorSession OFFLOAD_SESSION = getHiveOffloadSession(); + protected static final Metadata OFFLOAD_METADATA = createTestMetadataManager(); + protected static final PlanNodeIdAllocator ID_ALLOCATOR = new PlanNodeIdAllocator(); + protected static final CatalogName CATALOG_NAME = new CatalogName("catalog"); + protected static final PlanBuilder PLAN_BUILDER = new PlanBuilder(ID_ALLOCATOR, OFFLOAD_METADATA); + protected static final HiveTableHandle OFFLOAD_HIVE_TABLE_HANDLE = + new HiveTableHandle("db", "test", ImmutableMap.of(), ImmutableList.of(), Optional.empty()); + protected static final TableHandle OFFLOAD_TABLE_HANDLE = + new TableHandle(CATALOG_NAME, OFFLOAD_HIVE_TABLE_HANDLE, TestingTransactionHandle.create(), Optional.empty()); + + protected static final HiveColumnHandle COLUMN_BOOLEAN = + new HiveColumnHandle("_boolean", HIVE_BOOLEAN, HIVE_BOOLEAN.getTypeSignature(), 0, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + protected static final HiveColumnHandle COLUMN_INT = + new HiveColumnHandle("_int", HIVE_INT, HIVE_INT.getTypeSignature(), 1, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + protected static final HiveColumnHandle COLUMN_LONG = + new HiveColumnHandle("_long", HIVE_LONG, HIVE_LONG.getTypeSignature(), 2, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + protected static final HiveColumnHandle COLUMN_DOUBLE = + new HiveColumnHandle("_double", HIVE_DOUBLE, HIVE_DOUBLE.getTypeSignature(), 3, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + protected static final HiveColumnHandle COLUMN_CHAR = + new HiveColumnHandle("_btye", HIVE_BYTE, HIVE_BYTE.getTypeSignature(), 4, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + protected static final HiveColumnHandle COLUMN_STRING = + new HiveColumnHandle("_string", HIVE_STRING, HIVE_STRING.getTypeSignature(), 5, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + protected static final HiveColumnHandle COLUMN_TIMESTAMP = + new HiveColumnHandle("_timestamp", HIVE_TIMESTAMP, HIVE_TIMESTAMP.getTypeSignature(), 6, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + protected static final HiveColumnHandle COLUMN_DATE = + new HiveColumnHandle("_date", HIVE_DATE, HIVE_DATE.getTypeSignature(), 7, HiveColumnHandle.ColumnType.REGULAR, Optional.empty()); + + protected static final Map COLUMN_TYPE_MAP = getColumnTypeMap(); + protected static final PlanSymbolAllocator SYMBOL_ALLOCATOR = new PlanSymbolAllocator(toSymbolMap(COLUMN_TYPE_MAP)); + protected static final Map COLUMN_HANDLE_MAP = getColumnHandlesMap(); + + private TestHivePushdownUtil() + { + OmniDataNodeManager nodeManager = new OmniDataNodeManager(); + 
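// Register the node manager with HivePushdownUtil so the pushdown utilities have an OmniData node manager available during these tests. +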
HivePushdownUtil.setOmniDataNodeManager(nodeManager); + } + + private static Map getColumnHandlesMap() + { + ImmutableMap.Builder builder = new ImmutableMap.Builder<>(); + builder.put(COLUMN_BOOLEAN.getColumnName(), COLUMN_BOOLEAN) + .put(COLUMN_INT.getColumnName(), COLUMN_INT) + .put(COLUMN_LONG.getColumnName(), COLUMN_LONG) + .put(COLUMN_DOUBLE.getColumnName(), COLUMN_DOUBLE) + .put(COLUMN_STRING.getColumnName(), COLUMN_STRING) + .put(COLUMN_TIMESTAMP.getColumnName(), COLUMN_TIMESTAMP) + .put(COLUMN_DATE.getColumnName(), COLUMN_DATE); + return builder.build(); + } + + protected static Map getColumnTypeMap() + { + ImmutableMap.Builder builder = new ImmutableMap.Builder(); + builder.put(COLUMN_BOOLEAN.getColumnName(), BOOLEAN) + .put(COLUMN_INT.getColumnName(), INTEGER) + .put(COLUMN_LONG.getColumnName(), BIGINT) + .put(COLUMN_DOUBLE.getColumnName(), DOUBLE) + .put(COLUMN_STRING.getColumnName(), VARCHAR) + .put(COLUMN_TIMESTAMP.getColumnName(), TIMESTAMP) + .put(COLUMN_DATE.getColumnName(), DATE); + return builder.build(); + } + + protected static Map toSymbolMap(Map map) + { + return map.entrySet().stream().collect(Collectors.toMap(entry -> new Symbol(entry.getKey()), entry -> entry.getValue())); + } + + protected static TableScanNode generateTableScanNode(HiveTableHandle hiveTableHandle, HiveColumnHandle... columnHandles) + { + ImmutableMap.Builder assignmentsBuilder = ImmutableMap.builder(); + ImmutableList.Builder symbolsBuilder = ImmutableList.builder(); + for (HiveColumnHandle columnHandle : columnHandles) { + String name = columnHandle.getName().toLowerCase(Locale.ENGLISH); + Symbol symbol = new Symbol(name); + symbolsBuilder.add(symbol); + assignmentsBuilder.put(symbol, columnHandle); + } + return PLAN_BUILDER.tableScan(OFFLOAD_TABLE_HANDLE, symbolsBuilder.build(), assignmentsBuilder.build()); + } + + protected static TableScanNode buildTableScanNode(HiveColumnHandle... 
columnHandles) + { + return generateTableScanNode(OFFLOAD_HIVE_TABLE_HANDLE, columnHandles); + } + + protected static AggregationNode buildAggregationNode( + PlanNode source, + Map aggregations, + AggregationNode.GroupingSetDescriptor groupingSets) + { + AggregationNode aggregationNode = new AggregationNode(ID_ALLOCATOR.getNextId(), source, aggregations, + groupingSets, Collections.emptyList(), AggregationNode.Step.PARTIAL, Optional.empty(), + Optional.empty(), AggregationNode.AggregationType.HASH, Optional.empty()); + return aggregationNode; + } + + protected static TableScanNode buildTableScanNode() + { + return generateTableScanNode(OFFLOAD_HIVE_TABLE_HANDLE, COLUMN_INT, COLUMN_LONG); + } + + protected static LimitNode buildPartialLimitNode(PlanNode source, long count) + { + return new LimitNode(ID_ALLOCATOR.getNextId(), source, count, true); + } + + protected static FilterNode buildFilterNode(PlanNode source, RowExpression predicate) + { + return new FilterNode(ID_ALLOCATOR.getNextId(), source, predicate); + } + + protected static LimitNode buildLimitNode(PlanNode source, long count) + { + return new LimitNode(ID_ALLOCATOR.getNextId(), source, count, false); + } + + protected static Assignments buildAssignments(List symbols, List rowExpressions) + { + assertEquals(symbols.size(), rowExpressions.size()); + ImmutableMap.Builder assignments = ImmutableMap.builder(); + for (int i = 0; i < symbols.size(); i++) { + assignments.put(symbols.get(i), rowExpressions.get(i)); + } + return Assignments.copyOf(assignments.build()); + } + + protected static ProjectNode buildProjectNode(PlanNode source, List symbols, List rowExpressions) + { + Assignments assignments = buildAssignments(symbols, rowExpressions); + return new ProjectNode(ID_ALLOCATOR.getNextId(), source, assignments); + } + + private static HiveOffloadExpression getCheckedOffloadExpression(PlanNode node) + { + assertTrue(node instanceof TableScanNode); + ConnectorTableHandle tableHandle = ((TableScanNode) node).getTable().getConnectorHandle(); + assertTrue(tableHandle instanceof HiveTableHandle); + HiveOffloadExpression hiveOffloadExpression = ((HiveTableHandle) tableHandle).getOffloadExpression(); + return hiveOffloadExpression; + } + + protected static void matchLimitOffload(PlanNode node, long count) + { + HiveOffloadExpression expression = getCheckedOffloadExpression(node); + assertTrue(expression.isPresent()); + assertEquals(count, expression.getLimit().getAsLong()); + } + + protected static void matchFilterOffload(PlanNode node, RowExpression predicate) + { + HiveOffloadExpression expression = getCheckedOffloadExpression(node); + assertTrue(expression.isPresent()); + assertEquals(predicate, expression.getFilterExpression()); + } + + protected static void matchAggregatorOffload(PlanNode node, AggregationInfo aggregationInfoExpected) + { + HiveOffloadExpression expression = getCheckedOffloadExpression(node); + assertTrue(expression.isPresent()); + assertTrue(expression.getAggregations().isPresent()); + AggregationInfo aggregationInfo = expression.getAggregations().get(); + assertEquals(aggregationInfoExpected, aggregationInfo); + } + + protected static void matchProjection(PlanNode node, Map projections) + { + HiveOffloadExpression expression = getCheckedOffloadExpression(node); + assertTrue(expression.isPresent()); + assertTrue(!expression.getProjections().isEmpty()); + assertEquals(projections, expression.getProjections()); + } + + protected static ConnectorSession getHiveOffloadSession() + { + HiveConfig hiveConfig = new 
HiveConfig().setOmniDataEnabled(true) + .setFilterOffloadEnabled(true) + .setAggregatorOffloadEnabled(true) + .setMinFilterOffloadFactor(1) + .setMinAggregatorOffloadFactor(1) + .setMinOffloadRowNumber(1) + .setOmniDataSslEnabled(false); + return new TestingConnectorSession( + new HiveSessionProperties(hiveConfig, new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + } + + public static void simulationOmniDataConfig() + { + String configFile = System.getProperty(CONFIG_PROPERTY); + if (configFile != null) { + return; + } + + File file = new File(SIMULATION_FLIE_NAME); + try { + file.createNewFile(); + } + catch (IOException e) { + e.printStackTrace(); + } + + System.setProperty(CONFIG_PROPERTY, SIMULATION_FLIE_NAME); + } + + public static void unsimulationOmniDataConfig() + { + String configFile = System.getProperty(CONFIG_PROPERTY); + if (configFile == null) { + return; + } + if (configFile.equals(SIMULATION_FLIE_NAME)) { + System.clearProperty(CONFIG_PROPERTY); + File file = new File(SIMULATION_FLIE_NAME); + file.delete(); + } + } + + protected static CallExpression createCallExpression(String functionName, Type returnType, List arguments) + { + List inputTypes = arguments.stream().map(expression -> expression.getType()).collect(Collectors.toList()); + FunctionHandle functionHandle = OFFLOAD_METADATA.getFunctionAndTypeManager() + .resolveFunction(Optional.empty(), qualifyObjectName(QualifiedName.of(functionName)), TypeSignatureProvider.fromTypes(inputTypes)); + return new CallExpression(functionName, functionHandle, returnType, arguments); + } + + protected static CallExpression createOperationExpression(OperatorType operatorType, RowExpression left, RowExpression right) + { + FunctionHandle functionHandle = OFFLOAD_METADATA.getFunctionAndTypeManager().resolveOperatorFunctionHandle(operatorType, fromTypes(left.getType(), right.getType())); + return call(operatorType.name(), functionHandle, left.getType(), left, right); + } + + protected static AggregationNode.Aggregation createAggregation(String functionName, Type returnType, List arguments) + { + CallExpression callExpression = createCallExpression(functionName, returnType, arguments); + return new AggregationNode.Aggregation(callExpression, arguments, false, Optional.empty(), Optional.empty(), Optional.empty()); + } + + protected static HiveMetadata simulationHiveMetadata() + { + // simulation chain: HiveTransactionManager -> HiveMetadata -> ColumnMetadata + TableStatistics + ColumnStatistics + ColumnMetadata columnMetadataInt = Mockito.mock(ColumnMetadata.class); + Mockito.when(columnMetadataInt.getName()).thenReturn(COLUMN_INT.getName()); + Mockito.when(columnMetadataInt.getType()).thenReturn(INTEGER); + + HashMap propertyMap = new HashMap<>(); + propertyMap.put(STORAGE_FORMAT_PROPERTY, HiveStorageFormat.ORC); + ConnectorTableMetadata connectorTableMetadata = Mockito.mock(ConnectorTableMetadata.class); + Mockito.when(connectorTableMetadata.getProperties()).thenReturn(propertyMap); + + Map columnStatistics = new HashMap<>(); + ColumnStatistics columnStatisInt = + new ColumnStatistics(Estimate.zero(), Estimate.of(DISTINICT_COLUMN_NUM), Estimate.unknown(), Optional.of(new DoubleRange(1, 10))); + columnStatistics.put(COLUMN_INT, columnStatisInt); + TableStatistics statistics = new TableStatistics(Estimate.of(OFFLOAD_COLUMN_NUM), 5, 1024, columnStatistics); + HiveMetadata metadata = Mockito.mock(HiveMetadata.class); + Mockito.when(metadata.getTableMetadata(OFFLOAD_SESSION, 
OFFLOAD_HIVE_TABLE_HANDLE)).thenReturn(connectorTableMetadata); + Mockito.when(metadata.getColumnMetadata(Matchers.eq(OFFLOAD_SESSION), Matchers.eq(OFFLOAD_HIVE_TABLE_HANDLE), Matchers.any(ColumnHandle.class))).thenReturn(columnMetadataInt); + Map columnHandleMap = ImmutableMap.of(COLUMN_INT.getName(), COLUMN_INT); + Mockito.when(metadata.getColumnHandles(OFFLOAD_SESSION, OFFLOAD_HIVE_TABLE_HANDLE)).thenReturn(columnHandleMap); + + Mockito.when(metadata.getTableStatistics(Matchers.eq(OFFLOAD_SESSION), Matchers.eq(OFFLOAD_HIVE_TABLE_HANDLE), Matchers.any(Constraint.class), Matchers.eq(true))) + .thenReturn(statistics); + + return metadata; + } + + protected static HiveTransactionManager simulationHiveTransactionManager() + { + HiveMetadata metadata = simulationHiveMetadata(); + HiveTransactionManager transactionManager = Mockito.mock(HiveTransactionManager.class); + Mockito.when(transactionManager.get(OFFLOAD_TABLE_HANDLE.getTransaction())).thenReturn(metadata); + return transactionManager; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/MockAmazonS3.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/MockAmazonS3.java new file mode 100644 index 00000000..524bc471 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/MockAmazonS3.java @@ -0,0 +1,130 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.services.s3.AbstractAmazonS3; +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.CannedAccessControlList; +import com.amazonaws.services.s3.model.GetObjectMetadataRequest; +import com.amazonaws.services.s3.model.GetObjectRequest; +import com.amazonaws.services.s3.model.ListObjectsRequest; +import com.amazonaws.services.s3.model.ObjectListing; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.PutObjectRequest; +import com.amazonaws.services.s3.model.PutObjectResult; +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.amazonaws.services.s3.model.StorageClass; + +import java.util.Date; + +import static java.net.HttpURLConnection.HTTP_OK; + +public class MockAmazonS3 + extends AbstractAmazonS3 +{ + private int getObjectHttpCode = HTTP_OK; + private int getObjectMetadataHttpCode = HTTP_OK; + private GetObjectMetadataRequest getObjectMetadataRequest; + private CannedAccessControlList acl; + private boolean hasGlacierObjects; + + public void setGetObjectHttpErrorCode(int getObjectHttpErrorCode) + { + this.getObjectHttpCode = getObjectHttpErrorCode; + } + + public void setGetObjectMetadataHttpCode(int getObjectMetadataHttpCode) + { + this.getObjectMetadataHttpCode = getObjectMetadataHttpCode; + } + + public CannedAccessControlList getAcl() + { + return this.acl; + } + + public void setHasGlacierObjects(boolean hasGlacierObjects) + { + this.hasGlacierObjects = hasGlacierObjects; + } + + public GetObjectMetadataRequest getGetObjectMetadataRequest() + { + return getObjectMetadataRequest; + } + + @Override + public ObjectMetadata getObjectMetadata(GetObjectMetadataRequest getObjectMetadataRequest) + { + this.getObjectMetadataRequest = getObjectMetadataRequest; + if (getObjectMetadataHttpCode != HTTP_OK) { + AmazonS3Exception exception = new AmazonS3Exception("Failing getObjectMetadata call with " + getObjectMetadataHttpCode); + exception.setStatusCode(getObjectMetadataHttpCode); + throw exception; + } + return null; + } + + @Override + public S3Object getObject(GetObjectRequest getObjectRequest) + { + if (getObjectHttpCode != HTTP_OK) { + AmazonS3Exception exception = new AmazonS3Exception("Failing getObject call with " + getObjectHttpCode); + exception.setStatusCode(getObjectHttpCode); + throw exception; + } + return null; + } + + @Override + public PutObjectResult putObject(PutObjectRequest putObjectRequest) + { + this.acl = putObjectRequest.getCannedAcl(); + return new PutObjectResult(); + } + + @Override + public ObjectListing listObjects(ListObjectsRequest listObjectsRequest) + { + ObjectListing listing = new ObjectListing(); + + S3ObjectSummary standard = new S3ObjectSummary(); + standard.setStorageClass(StorageClass.Standard.toString()); + standard.setKey("test/standard"); + standard.setLastModified(new Date()); + listing.getObjectSummaries().add(standard); + + if (hasGlacierObjects) { + S3ObjectSummary glacier = new S3ObjectSummary(); + glacier.setStorageClass(StorageClass.Glacier.toString()); + glacier.setKey("test/glacier"); + glacier.setLastModified(new Date()); + listing.getObjectSummaries().add(glacier); + } + + return listing; + } + + @Override + public PutObjectResult putObject(String bucketName, String key, String content) + { + return new PutObjectResult(); + } + + @Override + public void shutdown() + { + } +} diff --git 
a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestHiveS3Config.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestHiveS3Config.java new file mode 100644 index 00000000..a94d16ad --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestHiveS3Config.java @@ -0,0 +1,130 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.s3; + +import com.google.common.base.StandardSystemProperty; +import com.google.common.collect.ImmutableMap; +import io.airlift.units.DataSize; +import io.airlift.units.DataSize.Unit; +import io.airlift.units.Duration; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; +import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; +import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; + +public class TestHiveS3Config +{ + @Test + public void testDefaults() + { + assertRecordedDefaults(recordDefaults(HiveS3Config.class) + .setS3AwsAccessKey(null) + .setS3AwsSecretKey(null) + .setS3Endpoint(null) + .setS3SignerType(null) + .setS3PathStyleAccess(false) + .setS3UseInstanceCredentials(true) + .setS3IamRole(null) + .setS3SslEnabled(true) + .setS3SseEnabled(false) + .setS3SseType(PrestoS3SseType.S3) + .setS3SseKmsKeyId(null) + .setS3KmsKeyId(null) + .setS3EncryptionMaterialsProvider(null) + .setS3MaxClientRetries(5) + .setS3MaxErrorRetries(10) + .setS3MaxBackoffTime(new Duration(10, TimeUnit.MINUTES)) + .setS3MaxRetryTime(new Duration(10, TimeUnit.MINUTES)) + .setS3ConnectTimeout(new Duration(5, TimeUnit.SECONDS)) + .setS3SocketTimeout(new Duration(5, TimeUnit.SECONDS)) + .setS3MultipartMinFileSize(new DataSize(16, Unit.MEGABYTE)) + .setS3MultipartMinPartSize(new DataSize(5, Unit.MEGABYTE)) + .setS3MaxConnections(500) + .setS3StagingDirectory(new File(StandardSystemProperty.JAVA_IO_TMPDIR.value())) + .setPinS3ClientToCurrentRegion(false) + .setS3UserAgentPrefix("") + .setS3AclType(PrestoS3AclType.PRIVATE) + .setSkipGlacierObjects(false)); + } + + @Test + public void testExplicitPropertyMappings() + { + Map properties = new ImmutableMap.Builder() + .put("hive.s3.aws-access-key", "abc123") + .put("hive.s3.aws-secret-key", "secret") + .put("hive.s3.endpoint", "endpoint.example.com") + .put("hive.s3.signer-type", "S3SignerType") + .put("hive.s3.path-style-access", "true") + .put("hive.s3.use-instance-credentials", "false") + .put("hive.s3.iam-role", "roleArn") + .put("hive.s3.ssl.enabled", "false") + .put("hive.s3.sse.enabled", "true") + .put("hive.s3.sse.type", "KMS") + .put("hive.s3.sse.kms-key-id", "KMS_KEY_ID") + .put("hive.s3.encryption-materials-provider", "EMP_CLASS") + .put("hive.s3.kms-key-id", "KEY_ID") + .put("hive.s3.max-client-retries", "9") + 
.put("hive.s3.max-error-retries", "8") + .put("hive.s3.max-backoff-time", "4m") + .put("hive.s3.max-retry-time", "20m") + .put("hive.s3.connect-timeout", "8s") + .put("hive.s3.socket-timeout", "4m") + .put("hive.s3.multipart.min-file-size", "32MB") + .put("hive.s3.multipart.min-part-size", "15MB") + .put("hive.s3.max-connections", "77") + .put("hive.s3.staging-directory", "/s3-staging") + .put("hive.s3.pin-client-to-current-region", "true") + .put("hive.s3.user-agent-prefix", "user-agent-prefix") + .put("hive.s3.upload-acl-type", "PUBLIC_READ") + .put("hive.s3.skip-glacier-objects", "true") + .build(); + + HiveS3Config expected = new HiveS3Config() + .setS3AwsAccessKey("abc123") + .setS3AwsSecretKey("secret") + .setS3Endpoint("endpoint.example.com") + .setS3SignerType(PrestoS3SignerType.S3SignerType) + .setS3PathStyleAccess(true) + .setS3UseInstanceCredentials(false) + .setS3IamRole("roleArn") + .setS3SslEnabled(false) + .setS3SseEnabled(true) + .setS3SseType(PrestoS3SseType.KMS) + .setS3SseKmsKeyId("KMS_KEY_ID") + .setS3EncryptionMaterialsProvider("EMP_CLASS") + .setS3KmsKeyId("KEY_ID") + .setS3MaxClientRetries(9) + .setS3MaxErrorRetries(8) + .setS3MaxBackoffTime(new Duration(4, TimeUnit.MINUTES)) + .setS3MaxRetryTime(new Duration(20, TimeUnit.MINUTES)) + .setS3ConnectTimeout(new Duration(8, TimeUnit.SECONDS)) + .setS3SocketTimeout(new Duration(4, TimeUnit.MINUTES)) + .setS3MultipartMinFileSize(new DataSize(32, Unit.MEGABYTE)) + .setS3MultipartMinPartSize(new DataSize(15, Unit.MEGABYTE)) + .setS3MaxConnections(77) + .setS3StagingDirectory(new File("/s3-staging")) + .setPinS3ClientToCurrentRegion(true) + .setS3UserAgentPrefix("user-agent-prefix") + .setS3AclType(PrestoS3AclType.PUBLIC_READ) + .setSkipGlacierObjects(true); + + assertFullMapping(properties, expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestPrestoS3FileSystem.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestPrestoS3FileSystem.java new file mode 100644 index 00000000..2dbd9511 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/s3/TestPrestoS3FileSystem.java @@ -0,0 +1,651 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.s3; + +import com.amazonaws.AmazonWebServiceClient; +import com.amazonaws.ClientConfiguration; +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSStaticCredentialsProvider; +import com.amazonaws.auth.DefaultAWSCredentialsProviderChain; +import com.amazonaws.auth.InstanceProfileCredentialsProvider; +import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; +import com.amazonaws.services.s3.AmazonS3Client; +import com.amazonaws.services.s3.AmazonS3EncryptionClient; +import com.amazonaws.services.s3.S3ClientOptions; +import com.amazonaws.services.s3.model.AmazonS3Exception; +import com.amazonaws.services.s3.model.CannedAccessControlList; +import com.amazonaws.services.s3.model.EncryptionMaterials; +import com.amazonaws.services.s3.model.EncryptionMaterialsProvider; +import com.amazonaws.services.s3.model.GetObjectMetadataRequest; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.google.common.base.VerifyException; +import io.prestosql.plugin.hive.s3.PrestoS3FileSystem.UnrecoverableS3OperationException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.testng.SkipException; +import org.testng.annotations.Test; + +import javax.crypto.spec.SecretKeySpec; + +import java.io.IOException; +import java.lang.reflect.Field; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Map; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.airlift.testing.Assertions.assertInstanceOf; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ACCESS_KEY; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ACL_TYPE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_CREDENTIALS_PROVIDER; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_DIRECTORY_OBJECT_CONTENT_TYPE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ENCRYPTION_MATERIALS_PROVIDER; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_ENDPOINT; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_IAM_ROLE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_KMS_KEY_ID; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_BACKOFF_TIME; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_CLIENT_RETRIES; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_RETRY_TIME; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_PATH_STYLE_ACCESS; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_PIN_CLIENT_TO_CURRENT_REGION; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SECRET_KEY; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SIGNER_TYPE; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_SKIP_GLACIER_OBJECTS; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_STAGING_DIRECTORY; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USER_AGENT_PREFIX; +import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USER_AGENT_SUFFIX; +import static 
io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_USE_INSTANCE_CREDENTIALS; +import static java.net.HttpURLConnection.HTTP_FORBIDDEN; +import static java.net.HttpURLConnection.HTTP_INTERNAL_ERROR; +import static java.net.HttpURLConnection.HTTP_NOT_FOUND; +import static java.nio.file.Files.createTempDirectory; +import static java.nio.file.Files.createTempFile; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class TestPrestoS3FileSystem +{ + private static final int HTTP_RANGE_NOT_SATISFIABLE = 416; + + @Test + public void testStaticCredentials() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_ACCESS_KEY, "test_secret_access_key"); + config.set(S3_SECRET_KEY, "test_access_key_id"); + // the static credentials should be preferred + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + assertInstanceOf(getAwsCredentialsProvider(fs), AWSStaticCredentialsProvider.class); + } + } + + @Test + public void testCompatibleStaticCredentials() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_ACCESS_KEY, "test_secret_access_key"); + config.set(S3_SECRET_KEY, "test_access_key_id"); + config.set(S3_ENDPOINT, "test.example.endpoint.com"); + config.set(S3_SIGNER_TYPE, "S3SignerType"); + // the static credentials should be preferred + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3a://test-bucket/"), config); + assertInstanceOf(getAwsCredentialsProvider(fs), AWSStaticCredentialsProvider.class); + } + } + + @Test(expectedExceptions = VerifyException.class, expectedExceptionsMessageRegExp = "Invalid configuration: either endpoint can be set or S3 client can be pinned to the current region") + public void testEndpointWithPinToCurrentRegionConfiguration() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_ENDPOINT, "test.example.endpoint.com"); + config.set(S3_PIN_CLIENT_TO_CURRENT_REGION, "true"); + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3a://test-bucket/"), config); + } + } + + @Test + public void testInstanceCredentialsEnabled() + throws Exception + { + Configuration config = new Configuration(); + // instance credentials are enabled by default + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + assertInstanceOf(getAwsCredentialsProvider(fs), InstanceProfileCredentialsProvider.class); + } + } + + @Test + public void testAssumeRoleCredentials() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_IAM_ROLE, "role"); + config.setBoolean(S3_USE_INSTANCE_CREDENTIALS, false); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + assertInstanceOf(getAwsCredentialsProvider(fs), STSAssumeRoleSessionCredentialsProvider.class); + } + } + + @Test + public void testDefaultCredentials() + throws Exception + { + Configuration config = new Configuration(); + config.setBoolean(S3_USE_INSTANCE_CREDENTIALS, false); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + assertInstanceOf(getAwsCredentialsProvider(fs), DefaultAWSCredentialsProviderChain.class); + } + } + + @Test + public void testPathStyleAccess() + throws Exception + { + Configuration config = new Configuration(); + 
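A note on the assertion technique used by the credential and client-option tests in this file: none of them contact AWS. Each test initializes PrestoS3FileSystem against a placeholder bucket URI and then reaches into the constructed client with reflection (see getAwsCredentialsProvider and getFieldValue near the end of this class) to check which credentials provider or client option was wired in. The following is a minimal standalone sketch of that reflection pattern; the names ReflectionProbe, Holder, provider, and readField are illustrative only and are not part of this patch, and the real helper also lets callers name the declaring class instead of relying on the runtime class.

    import java.lang.reflect.Field;

    public final class ReflectionProbe
    {
        // Illustrative stand-in for the S3 client whose private state the tests inspect.
        static final class Holder
        {
            private final String provider = "static-credentials";
        }

        // Reads a private field by name and casts it to the expected type,
        // a simplified version of the getFieldValue helper in this test class.
        static <T> T readField(Object instance, String name, Class<T> type)
                throws ReflectiveOperationException
        {
            Field field = instance.getClass().getDeclaredField(name);
            field.setAccessible(true);
            return type.cast(field.get(instance));
        }

        public static void main(String[] args)
                throws ReflectiveOperationException
        {
            // Prints "static-credentials"; the tests similarly compare the recovered
            // object's type against the expected provider or option class.
            System.out.println(readField(new Holder(), "provider", String.class));
        }
    }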
config.setBoolean(S3_PATH_STYLE_ACCESS, true); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + S3ClientOptions clientOptions = getFieldValue(fs.getS3Client(), AmazonS3Client.class, "clientOptions", S3ClientOptions.class); + assertTrue(clientOptions.isPathStyleAccess()); + } + } + + @Test + public void testUnderscoreBucket() + throws Exception + { + Configuration config = new Configuration(); + config.setBoolean(S3_PATH_STYLE_ACCESS, true); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + String expectedBucketName = "test-bucket_underscore"; + fs.initialize(new URI("s3n://" + expectedBucketName + "/"), config); + fs.setS3Client(s3); + fs.getS3ObjectMetadata(new Path("/test/path")); + assertEquals(expectedBucketName, s3.getGetObjectMetadataRequest().getBucketName()); + } + } + + @SuppressWarnings({"ResultOfMethodCallIgnored", "OverlyStrongTypeCast", "ConstantConditions"}) + @Test + public void testReadRetryCounters() + throws Exception + { + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + int maxRetries = 2; + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setGetObjectHttpErrorCode(HTTP_INTERNAL_ERROR); + Configuration configuration = new Configuration(); + configuration.set(S3_MAX_BACKOFF_TIME, "1ms"); + configuration.set(S3_MAX_RETRY_TIME, "5s"); + configuration.setInt(S3_MAX_CLIENT_RETRIES, maxRetries); + fs.initialize(new URI("s3n://test-bucket/"), configuration); + fs.setS3Client(s3); + try (FSDataInputStream inputStream = fs.open(new Path("s3n://test-bucket/test"))) { + inputStream.read(); + } + catch (Throwable expected) { + assertInstanceOf(expected, AmazonS3Exception.class); + assertEquals(((AmazonS3Exception) expected).getStatusCode(), HTTP_INTERNAL_ERROR); + assertEquals(PrestoS3FileSystem.getFileSystemStats().getReadRetries().getTotalCount(), maxRetries); + assertEquals(PrestoS3FileSystem.getFileSystemStats().getGetObjectRetries().getTotalCount(), (maxRetries + 1L) * maxRetries); + } + } + } + + @SuppressWarnings({"OverlyStrongTypeCast", "ConstantConditions"}) + @Test + public void testGetMetadataRetryCounter() + { + int maxRetries = 2; + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setGetObjectMetadataHttpCode(HTTP_INTERNAL_ERROR); + Configuration configuration = new Configuration(); + configuration.set(S3_MAX_BACKOFF_TIME, "1ms"); + configuration.set(S3_MAX_RETRY_TIME, "5s"); + configuration.setInt(S3_MAX_CLIENT_RETRIES, maxRetries); + fs.initialize(new URI("s3n://test-bucket/"), configuration); + fs.setS3Client(s3); + fs.getS3ObjectMetadata(new Path("s3n://test-bucket/test")); + } + catch (Throwable expected) { + assertInstanceOf(expected, AmazonS3Exception.class); + assertEquals(((AmazonS3Exception) expected).getStatusCode(), HTTP_INTERNAL_ERROR); + assertEquals(PrestoS3FileSystem.getFileSystemStats().getGetMetadataRetries().getTotalCount(), maxRetries); + } + } + + @SuppressWarnings("ResultOfMethodCallIgnored") + @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = ".*Failing getObject call with " + HTTP_NOT_FOUND + ".*") + public void testReadNotFound() + throws Exception + { + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setGetObjectHttpErrorCode(HTTP_NOT_FOUND); + fs.initialize(new URI("s3n://test-bucket/"), new Configuration()); + fs.setS3Client(s3); + try (FSDataInputStream inputStream = fs.open(new 
Path("s3n://test-bucket/test"))) { + inputStream.read(); + } + } + } + + @SuppressWarnings("ResultOfMethodCallIgnored") + @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = ".*Failing getObject call with " + HTTP_FORBIDDEN + ".*") + public void testReadForbidden() + throws Exception + { + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setGetObjectHttpErrorCode(HTTP_FORBIDDEN); + fs.initialize(new URI("s3n://test-bucket/"), new Configuration()); + fs.setS3Client(s3); + try (FSDataInputStream inputStream = fs.open(new Path("s3n://test-bucket/test"))) { + inputStream.read(); + } + } + } + + @Test + public void testCreateWithNonexistentStagingDirectory() + throws Exception + { + java.nio.file.Path stagingParent = createTempDirectory("test"); + java.nio.file.Path staging = Paths.get(stagingParent.toString(), "staging"); + // stagingParent = /tmp/testXXX + // staging = /tmp/testXXX/staging + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + Configuration conf = new Configuration(); + conf.set(S3_STAGING_DIRECTORY, staging.toString()); + fs.initialize(new URI("s3n://test-bucket/"), conf); + fs.setS3Client(s3); + FSDataOutputStream stream = fs.create(new Path("s3n://test-bucket/test")); + stream.close(); + assertTrue(Files.exists(staging)); + } + finally { + deleteRecursively(stagingParent, ALLOW_INSECURE); + } + } + + @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = "Configured staging path is not a directory: .*") + public void testCreateWithStagingDirectoryFile() + throws Exception + { + java.nio.file.Path staging = createTempFile("staging", null); + // staging = /tmp/stagingXXX.tmp + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + Configuration conf = new Configuration(); + conf.set(S3_STAGING_DIRECTORY, staging.toString()); + fs.initialize(new URI("s3n://test-bucket/"), conf); + fs.setS3Client(s3); + fs.create(new Path("s3n://test-bucket/test")); + } + finally { + Files.deleteIfExists(staging); + } + } + + @Test + public void testCreateWithStagingDirectorySymlink() + throws Exception + { + java.nio.file.Path staging = createTempDirectory("staging"); + java.nio.file.Path link = Paths.get(staging + ".symlink"); + // staging = /tmp/stagingXXX + // link = /tmp/stagingXXX.symlink -> /tmp/stagingXXX + + try { + try { + Files.createSymbolicLink(link, staging); + } + catch (UnsupportedOperationException e) { + throw new SkipException("Filesystem does not support symlinks", e); + } + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + Configuration conf = new Configuration(); + conf.set(S3_STAGING_DIRECTORY, link.toString()); + fs.initialize(new URI("s3n://test-bucket/"), conf); + fs.setS3Client(s3); + FSDataOutputStream stream = fs.create(new Path("s3n://test-bucket/test")); + stream.close(); + assertTrue(Files.exists(link)); + } + } + finally { + deleteRecursively(link, ALLOW_INSECURE); + deleteRecursively(staging, ALLOW_INSECURE); + } + } + + @Test + public void testReadRequestRangeNotSatisfiable() + throws Exception + { + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setGetObjectHttpErrorCode(HTTP_RANGE_NOT_SATISFIABLE); + fs.initialize(new URI("s3n://test-bucket/"), new Configuration()); + fs.setS3Client(s3); + try (FSDataInputStream inputStream = fs.open(new 
Path("s3n://test-bucket/test"))) { + assertEquals(inputStream.read(), -1); + } + } + } + + @Test(expectedExceptions = IOException.class, expectedExceptionsMessageRegExp = ".*Failing getObjectMetadata call with " + HTTP_FORBIDDEN + ".*") + public void testGetMetadataForbidden() + throws Exception + { + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setGetObjectMetadataHttpCode(HTTP_FORBIDDEN); + fs.initialize(new URI("s3n://test-bucket/"), new Configuration()); + fs.setS3Client(s3); + fs.getS3ObjectMetadata(new Path("s3n://test-bucket/test")); + } + } + + @Test + public void testGetMetadataNotFound() + throws Exception + { + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setGetObjectMetadataHttpCode(HTTP_NOT_FOUND); + fs.initialize(new URI("s3n://test-bucket/"), new Configuration()); + fs.setS3Client(s3); + assertEquals(fs.getS3ObjectMetadata(new Path("s3n://test-bucket/test")), null); + } + } + + @Test + public void testEncryptionMaterialsProvider() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_ENCRYPTION_MATERIALS_PROVIDER, TestEncryptionMaterialsProvider.class.getName()); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + assertInstanceOf(fs.getS3Client(), AmazonS3EncryptionClient.class); + } + } + + @Test + public void testKMSEncryptionMaterialsProvider() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_KMS_KEY_ID, "test-key-id"); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + assertInstanceOf(fs.getS3Client(), AmazonS3EncryptionClient.class); + } + } + + @Test(expectedExceptions = UnrecoverableS3OperationException.class, expectedExceptionsMessageRegExp = ".*\\Q (Path: /tmp/test/path)\\E") + public void testUnrecoverableS3ExceptionMessage() + throws Exception + { + throw new UnrecoverableS3OperationException(new Path("/tmp/test/path"), new IOException("test io exception")); + } + + @Test + public void testCustomCredentialsProvider() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_USE_INSTANCE_CREDENTIALS, "false"); + config.set(S3_CREDENTIALS_PROVIDER, TestCredentialsProvider.class.getName()); + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + assertInstanceOf(getAwsCredentialsProvider(fs), TestCredentialsProvider.class); + } + } + + @Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Error creating an instance of .*") + public void testCustomCredentialsClassCannotBeFound() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_USE_INSTANCE_CREDENTIALS, "false"); + config.set(S3_CREDENTIALS_PROVIDER, "com.example.DoesNotExist"); + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + } + } + + @Test + public void testUserAgentPrefix() + throws Exception + { + String userAgentPrefix = "agent_prefix"; + Configuration config = new Configuration(); + config.set(S3_USER_AGENT_PREFIX, userAgentPrefix); + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), config); + ClientConfiguration clientConfig = getFieldValue(fs.getS3Client(), AmazonWebServiceClient.class, "clientConfiguration", 
ClientConfiguration.class); + assertEquals(clientConfig.getUserAgentSuffix(), S3_USER_AGENT_SUFFIX); + assertEquals(clientConfig.getUserAgentPrefix(), userAgentPrefix); + } + } + + @Test + public void testDefaultS3ClientConfiguration() + throws Exception + { + HiveS3Config defaults = new HiveS3Config(); + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + fs.initialize(new URI("s3n://test-bucket/"), new Configuration()); + ClientConfiguration config = getFieldValue(fs.getS3Client(), AmazonWebServiceClient.class, "clientConfiguration", ClientConfiguration.class); + assertEquals(config.getMaxErrorRetry(), defaults.getS3MaxErrorRetries()); + assertEquals(config.getConnectionTimeout(), defaults.getS3ConnectTimeout().toMillis()); + assertEquals(config.getSocketTimeout(), defaults.getS3SocketTimeout().toMillis()); + assertEquals(config.getMaxConnections(), defaults.getS3MaxConnections()); + assertEquals(config.getUserAgentSuffix(), S3_USER_AGENT_SUFFIX); + assertEquals(config.getUserAgentPrefix(), ""); + } + } + + @Test + public void testSkipGlacierObjectsEnabled() + throws Exception + { + assertSkipGlacierObjects(true); + assertSkipGlacierObjects(false); + } + + private static void assertSkipGlacierObjects(boolean skipGlacierObjects) + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_SKIP_GLACIER_OBJECTS, String.valueOf(skipGlacierObjects)); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + s3.setHasGlacierObjects(true); + fs.initialize(new URI("s3n://test-bucket/"), config); + fs.setS3Client(s3); + FileStatus[] statuses = fs.listStatus(new Path("s3n://test-bucket/test")); + assertEquals(statuses.length, skipGlacierObjects ? 1 : 2); + } + } + + private static AWSCredentialsProvider getAwsCredentialsProvider(PrestoS3FileSystem fs) + { + return getFieldValue(fs.getS3Client(), "awsCredentialsProvider", AWSCredentialsProvider.class); + } + + private static T getFieldValue(Object instance, String name, Class type) + { + return getFieldValue(instance, instance.getClass(), name, type); + } + + @SuppressWarnings("unchecked") + private static T getFieldValue(Object instance, Class clazz, String name, Class type) + { + try { + Field field = clazz.getDeclaredField(name); + checkArgument(field.getType() == type, "expected %s but found %s", type, field.getType()); + field.setAccessible(true); + return (T) field.get(instance); + } + catch (ReflectiveOperationException e) { + throw new RuntimeException(e); + } + } + + private static class TestEncryptionMaterialsProvider + implements EncryptionMaterialsProvider + { + private final EncryptionMaterials encryptionMaterials; + + public TestEncryptionMaterialsProvider() + { + encryptionMaterials = new EncryptionMaterials(new SecretKeySpec(new byte[] {1, 2, 3}, "AES")); + } + + @Override + public void refresh() + { + } + + @Override + public EncryptionMaterials getEncryptionMaterials(Map materialsDescription) + { + return encryptionMaterials; + } + + @Override + public EncryptionMaterials getEncryptionMaterials() + { + return encryptionMaterials; + } + } + + private static class TestCredentialsProvider + implements AWSCredentialsProvider + { + @SuppressWarnings("UnusedParameters") + public TestCredentialsProvider(URI uri, Configuration conf) {} + + @Override + public AWSCredentials getCredentials() + { + return null; + } + + @Override + public void refresh() {} + } + + @Test + public void testDefaultAcl() + throws Exception + { + Configuration config = new Configuration(); 
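The two ACL tests around this point rely on the hand-rolled MockAmazonS3 stub added earlier in this patch: its putObject override uploads nothing and only records the canned ACL from the request, so the test can assert on it after closing the output stream. Reduced to a plain-Java sketch with illustrative names (RecordingStub, Uploader, RecordingUploader) that are not part of this patch, the record-then-assert pattern looks like this:

    public final class RecordingStub
    {
        // Stand-in for the client interface; only the call under test is modelled.
        interface Uploader
        {
            void put(String key, String acl);
        }

        // Records the last ACL it saw instead of talking to a real service,
        // the same way MockAmazonS3.putObject captures the canned ACL.
        static final class RecordingUploader
                implements Uploader
        {
            private String lastAcl;

            @Override
            public void put(String key, String acl)
            {
                this.lastAcl = acl;
            }

            String lastAcl()
            {
                return lastAcl;
            }
        }

        public static void main(String[] args)
        {
            RecordingUploader uploader = new RecordingUploader();
            uploader.put("test/object", "PRIVATE");
            // The assertion step: the recorded value is whatever the code under test passed in.
            if (!"PRIVATE".equals(uploader.lastAcl())) {
                throw new AssertionError("unexpected ACL: " + uploader.lastAcl());
            }
        }
    }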
+ + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + String expectedBucketName = "test-bucket"; + fs.initialize(new URI("s3n://" + expectedBucketName + "/"), config); + fs.setS3Client(s3); + try (FSDataOutputStream stream = fs.create(new Path("s3n://test-bucket/test"))) { + // initiate an upload by creating a stream & closing it immediately + } + assertEquals(CannedAccessControlList.Private, s3.getAcl()); + } + } + + @Test + public void testFullBucketOwnerControlAcl() + throws Exception + { + Configuration config = new Configuration(); + config.set(S3_ACL_TYPE, "BUCKET_OWNER_FULL_CONTROL"); + + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3(); + String expectedBucketName = "test-bucket"; + fs.initialize(new URI("s3n://" + expectedBucketName + "/"), config); + fs.setS3Client(s3); + try (FSDataOutputStream stream = fs.create(new Path("s3n://test-bucket/test"))) { + // initiate an upload by creating a stream & closing it immediately + } + assertEquals(CannedAccessControlList.BucketOwnerFullControl, s3.getAcl()); + } + } + + @Test + public void testEmptyDirectory() + throws Exception + { + try (PrestoS3FileSystem fs = new PrestoS3FileSystem()) { + MockAmazonS3 s3 = new MockAmazonS3() + { + @Override + public ObjectMetadata getObjectMetadata(GetObjectMetadataRequest getObjectMetadataRequest) + { + if (getObjectMetadataRequest.getKey().equals("empty-dir/")) { + ObjectMetadata objectMetadata = new ObjectMetadata(); + objectMetadata.setContentType(S3_DIRECTORY_OBJECT_CONTENT_TYPE); + return objectMetadata; + } + return super.getObjectMetadata(getObjectMetadataRequest); + } + }; + fs.initialize(new URI("s3n://test-bucket/"), new Configuration()); + fs.setS3Client(s3); + + FileStatus fileStatus = fs.getFileStatus(new Path("s3n://test-bucket/empty-dir/")); + assertTrue(fileStatus.isDirectory()); + + fileStatus = fs.getFileStatus(new Path("s3n://test-bucket/empty-dir")); + assertTrue(fileStatus.isDirectory()); + } + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacyAccessControl.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacyAccessControl.java new file mode 100644 index 00000000..af331cc0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacyAccessControl.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.security; + +import io.prestosql.spi.connector.ConnectorAccessControl; +import org.testng.annotations.Test; + +import static io.prestosql.spi.testing.InterfaceTestUtils.assertAllMethodsOverridden; + +public class TestLegacyAccessControl +{ + @Test + public void testEverythingImplemented() + { + assertAllMethodsOverridden(ConnectorAccessControl.class, LegacyAccessControl.class); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacySecurityConfig.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacySecurityConfig.java new file mode 100644 index 00000000..f380771c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestLegacySecurityConfig.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import com.google.common.collect.ImmutableMap; +import org.testng.annotations.Test; + +import java.util.Map; + +import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; +import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; +import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; + +public class TestLegacySecurityConfig +{ + @Test + public void testDefaults() + { + assertRecordedDefaults(recordDefaults(LegacySecurityConfig.class) + .setAllowAddColumn(false) + .setAllowDropColumn(false) + .setAllowDropTable(false) + .setAllowRenameTable(false) + .setAllowCommentTable(false) + .setAllowRenameColumn(false)); + } + + @Test + public void testExplicitPropertyMappings() + { + Map properties = new ImmutableMap.Builder() + .put("hive.allow-add-column", "true") + .put("hive.allow-drop-column", "true") + .put("hive.allow-drop-table", "true") + .put("hive.allow-rename-table", "true") + .put("hive.allow-comment-table", "true") + .put("hive.allow-rename-column", "true") + .build(); + + LegacySecurityConfig expected = new LegacySecurityConfig() + .setAllowAddColumn(true) + .setAllowDropColumn(true) + .setAllowDropTable(true) + .setAllowRenameTable(true) + .setAllowCommentTable(true) + .setAllowRenameColumn(true); + + assertFullMapping(properties, expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestSqlStandardAccessControl.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestSqlStandardAccessControl.java new file mode 100644 index 00000000..682421be --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/security/TestSqlStandardAccessControl.java @@ -0,0 +1,28 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.security; + +import io.prestosql.spi.connector.ConnectorAccessControl; +import org.testng.annotations.Test; + +import static io.prestosql.spi.testing.InterfaceTestUtils.assertAllMethodsOverridden; + +public class TestSqlStandardAccessControl +{ + @Test + public void testEverythingImplemented() + { + assertAllMethodsOverridden(ConnectorAccessControl.class, SqlStandardAccessControl.class); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/BenchmarkGetPartitionsSample.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/BenchmarkGetPartitionsSample.java new file mode 100644 index 00000000..15e4f2ce --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/BenchmarkGetPartitionsSample.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.statistics; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.spi.connector.SchemaTableName; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; +import org.openjdk.jmh.runner.options.VerboseMode; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.getPartitionsSample; + +@State(Scope.Thread) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Fork(2) +@Warmup(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS) +@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS) +@BenchmarkMode(Mode.AverageTime) +public class BenchmarkGetPartitionsSample +{ + private static final int TOTAL_SIZE = 10_000; + private static final int SAMPLE_SIZE = 100; + + @Benchmark + public List selectSample(BenchmarkData data) + { + return getPartitionsSample(data.partitions, SAMPLE_SIZE); + } + + @State(Scope.Thread) + public static class BenchmarkData + { + public List partitions; + + @Setup + public void setup() + { + ImmutableList.Builder partitions = ImmutableList.builder(); + SchemaTableName table = new SchemaTableName("schema", "table"); + for (int i = 0; i < TOTAL_SIZE; i++) { + partitions.add(new HivePartition(table, "partition_" + i, ImmutableMap.of())); + } + this.partitions = partitions.build(); + } + } + + public static void main(String[] args) + throws Throwable + { + Options options = new OptionsBuilder() + .verbosity(VerboseMode.NORMAL) + .include(".*" + BenchmarkGetPartitionsSample.class.getSimpleName() + ".*") + .build(); + new Runner(options).run(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/TestMetastoreHiveStatisticsProvider.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/TestMetastoreHiveStatisticsProvider.java new file mode 100644 index 00000000..e102b2cf --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/statistics/TestMetastoreHiveStatisticsProvider.java @@ -0,0 +1,870 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.statistics; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveConfig; +import io.prestosql.plugin.hive.HiveErrorCode; +import io.prestosql.plugin.hive.HivePartition; +import io.prestosql.plugin.hive.HiveSessionProperties; +import io.prestosql.plugin.hive.OrcFileWriterConfig; +import io.prestosql.plugin.hive.ParquetFileWriterConfig; +import io.prestosql.plugin.hive.PartitionStatistics; +import io.prestosql.plugin.hive.metastore.Column; +import io.prestosql.plugin.hive.metastore.DateStatistics; +import io.prestosql.plugin.hive.metastore.DecimalStatistics; +import io.prestosql.plugin.hive.metastore.DoubleStatistics; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.IntegerStatistics; +import io.prestosql.plugin.hive.metastore.Storage; +import io.prestosql.plugin.hive.metastore.StorageFormat; +import io.prestosql.plugin.hive.metastore.Table; +import io.prestosql.spi.PrestoException; +import io.prestosql.spi.connector.SchemaTableName; +import io.prestosql.spi.statistics.ColumnStatistics; +import io.prestosql.spi.statistics.DoubleRange; +import io.prestosql.spi.statistics.Estimate; +import io.prestosql.spi.statistics.TableStatistics; +import io.prestosql.spi.type.DecimalType; +import io.prestosql.spi.type.Type; +import io.prestosql.testing.TestingConnectorSession; +import org.testng.annotations.Test; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; + +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.PARTITION_KEY; +import static io.prestosql.plugin.hive.HiveColumnHandle.ColumnType.REGULAR; +import static io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID; +import static io.prestosql.plugin.hive.HivePartitionManager.parsePartition; +import static io.prestosql.plugin.hive.HiveStorageFormat.ORC; +import static io.prestosql.plugin.hive.HiveType.HIVE_LONG; +import static io.prestosql.plugin.hive.HiveType.HIVE_STRING; +import static io.prestosql.plugin.hive.HiveUtil.parsePartitionValue; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateAverageRowsPerPartition; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateDataSize; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateDataSizeForPartitioningKey; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateDistinctPartitionKeys; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateDistinctValuesCount; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateNullsFraction; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateNullsFractionForPartitioningKey; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateRange; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.calculateRangeForPartitioningKey; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.convertPartitionValueToDouble; +import static 
io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.createDataColumnStatistics; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.getPartitionsSample; +import static io.prestosql.plugin.hive.statistics.MetastoreHiveStatisticsProvider.validatePartitionStatistics; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DecimalType.createDecimalType; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static java.lang.Double.NaN; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.testng.Assert.assertEquals; + +public class TestMetastoreHiveStatisticsProvider +{ + private static final Storage STORAGE_1 = new Storage(StorageFormat.fromHiveStorageFormat(ORC), "", Optional.empty(), false, ImmutableMap.of()); + private static final SchemaTableName TABLE = new SchemaTableName("schema", "table"); + private static final String PARTITION = "partition"; + private static final String COLUMN = "column"; + private static final DecimalType DECIMAL = createDecimalType(5, 3); + + private static final HiveColumnHandle PARTITION_COLUMN_1 = new HiveColumnHandle("p1", HIVE_STRING, VARCHAR.getTypeSignature(), 0, PARTITION_KEY, Optional.empty()); + private static final HiveColumnHandle PARTITION_COLUMN_2 = new HiveColumnHandle("p2", HIVE_LONG, BIGINT.getTypeSignature(), 1, PARTITION_KEY, Optional.empty()); + private static final Table table = new Table(TABLE.getSchemaName(), TABLE.getTableName(), "user", "MANAGED_TABLE", STORAGE_1, ImmutableList.of(), ImmutableList.of(new Column("p1", HIVE_STRING, Optional.empty()), new Column("p2", HIVE_LONG, Optional.empty())), ImmutableMap.of(), Optional.of("original"), Optional.of("expanded")); + + @Test + public void testGetPartitionsSample() + { + HivePartition p1 = partition("p1=string1/p2=1234"); + HivePartition p2 = partition("p1=string2/p2=2345"); + HivePartition p3 = partition("p1=string3/p2=3456"); + HivePartition p4 = partition("p1=string4/p2=4567"); + HivePartition p5 = partition("p1=string5/p2=5678"); + + assertEquals(getPartitionsSample(ImmutableList.of(p1), 1), ImmutableList.of(p1)); + assertEquals(getPartitionsSample(ImmutableList.of(p1), 2), ImmutableList.of(p1)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2), 2), ImmutableList.of(p1, p2)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3), 2), ImmutableList.of(p1, p3)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 1)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3), getPartitionsSample(ImmutableList.of(p1, p2, p3, p4), 3)); + assertEquals(getPartitionsSample(ImmutableList.of(p1, p2, p3, p4, p5), 3), ImmutableList.of(p1, p5, p4)); + } + + @Test + public void testValidatePartitionStatistics() + { + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(-1, 0, 0, 0)) + .build(), + invalidPartitionStatistics("fileCount must be greater than or equal to zero: -1")); + 
assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, -1, 0, 0)) + .build(), + invalidPartitionStatistics("rowCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, -1, 0)) + .build(), + invalidPartitionStatistics("inMemoryDataSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, -1)) + .build(), + invalidPartitionStatistics("onDiskDataSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setMaxValueSizeInBytes(-1).build())) + .build(), + invalidColumnStatistics("maxValueSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setTotalSizeInBytes(-1).build())) + .build(), + invalidColumnStatistics("totalSizeInBytes must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(-1).build())) + .build(), + invalidColumnStatistics("nullsCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(1).build())) + .build(), + invalidColumnStatistics("nullsCount must be less than or equal to rowCount. nullsCount: 1. rowCount: 0.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setDistinctValuesCount(-1).build())) + .build(), + invalidColumnStatistics("distinctValuesCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setDistinctValuesCount(1).build())) + .build(), + invalidColumnStatistics("distinctValuesCount must be less than or equal to rowCount. distinctValuesCount: 1. rowCount: 0.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 1, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setDistinctValuesCount(1).setNullsCount(1).build())) + .build(), + invalidColumnStatistics("distinctValuesCount must be less than or equal to nonNullsCount. distinctValuesCount: 1. nonNullsCount: 0.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1), OptionalLong.of(-1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("integerStatistics.min must be less than or equal to integerStatistics.max. 
integerStatistics.min: 1. integerStatistics.max: -1.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(1), OptionalDouble.of(-1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("doubleStatistics.min must be less than or equal to doubleStatistics.max. doubleStatistics.min: 1.0. doubleStatistics.max: -1.0.")); + validatePartitionStatistics( + TABLE, + ImmutableMap.of( + PARTITION, + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(NaN), OptionalDouble.of(NaN), OptionalLong.empty(), OptionalLong.empty()))) + .build())); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createDecimalColumnStatistics(Optional.of(BigDecimal.valueOf(1)), Optional.of(BigDecimal.valueOf(-1)), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("decimalStatistics.min must be less than or equal to decimalStatistics.max. decimalStatistics.min: 1. decimalStatistics.max: -1.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createDateColumnStatistics(Optional.of(LocalDate.ofEpochDay(1)), Optional.of(LocalDate.ofEpochDay(-1)), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("dateStatistics.min must be less than or equal to dateStatistics.max. dateStatistics.min: 1970-01-02. dateStatistics.max: 1969-12-31.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(-1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("trueCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.of(-1), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("falseCount must be greater than or equal to zero: -1")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.empty(), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("booleanStatistics.trueCount must be less than or equal to rowCount. booleanStatistics.trueCount: 1. rowCount: 0.")); + assertInvalidStatistics( + PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(0, 0, 0, 0)) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.of(1), OptionalLong.empty()))) + .build(), + invalidColumnStatistics("booleanStatistics.falseCount must be less than or equal to rowCount. booleanStatistics.falseCount: 1. 
rowCount: 0.")); + } + + @Test + public void testCalculateAverageRowsPerPartition() + { + assertThat(calculateAverageRowsPerPartition(ImmutableList.of())).isEmpty(); + assertThat(calculateAverageRowsPerPartition(ImmutableList.of(PartitionStatistics.empty()))).isEmpty(); + assertThat(calculateAverageRowsPerPartition(ImmutableList.of(PartitionStatistics.empty(), PartitionStatistics.empty()))).isEmpty(); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10))), OptionalDouble.of(10)); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10), PartitionStatistics.empty())), OptionalDouble.of(10)); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10), rowsCount(20))), OptionalDouble.of(15)); + assertEquals(calculateAverageRowsPerPartition(ImmutableList.of(rowsCount(10), rowsCount(20), PartitionStatistics.empty())), OptionalDouble.of(15)); + } + + @Test + public void testCalculateDistinctPartitionKeys() + { + assertEquals(calculateDistinctPartitionKeys(PARTITION_COLUMN_1, ImmutableList.of()), 0); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234"))), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string2/p2=1234"))), + 2); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string2/p2=1234"))), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string1/p2=1235"))), + 2); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=string1/p2=1235"))), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=123/p2=__HIVE_DEFAULT_PARTITION__"), partition("p1=string1/p2=1235"))), + 1); + assertEquals( + calculateDistinctPartitionKeys( + PARTITION_COLUMN_2, + ImmutableList.of(partition("p1=123/p2=__HIVE_DEFAULT_PARTITION__"), partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"))), + 0); + } + + @Test + public void testCalculateNullsFractionForPartitioningKey() + { + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000, + 0), + 0.0); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000, + 4000), + 0.0); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000)), + 2000, + 4000), + 0.25); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", PartitionStatistics.empty()), + 2000, + 4000), + 0.5); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234")), + ImmutableMap.of(), + 2000, + 4000), + 0.5); + assertEquals( + 
calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=__HIVE_DEFAULT_PARTITION__/p2=4321")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000), "p1=__HIVE_DEFAULT_PARTITION__/p2=4321", rowsCount(2000)), + 3000, + 4000), + 0.75); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=__HIVE_DEFAULT_PARTITION__/p2=4321")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000), "p1=__HIVE_DEFAULT_PARTITION__/p2=4321", PartitionStatistics.empty()), + 3000, + 4000), + 1.0); + assertEquals( + calculateNullsFractionForPartitioningKey( + PARTITION_COLUMN_1, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=__HIVE_DEFAULT_PARTITION__/p2=4321")), + ImmutableMap.of("p1=__HIVE_DEFAULT_PARTITION__/p2=1234", rowsCount(1000), "p1=__HIVE_DEFAULT_PARTITION__/p2=4321", PartitionStatistics.empty()), + 4000, + 4000), + 1.0); + } + + @Test + public void testCalculateDataSizeForPartitioningKey() + { + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000), + Estimate.unknown()); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000)), + 2000), + Estimate.of(7000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", PartitionStatistics.empty()), + 2000), + Estimate.of(14000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=str2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=str2/p2=1234", rowsCount(2000)), + 3000), + Estimate.of(15000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=str2/p2=1234")), + ImmutableMap.of("p1=string1/p2=1234", rowsCount(1000), "p1=str2/p2=1234", PartitionStatistics.empty()), + 3000), + Estimate.of(19000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=str2/p2=1234")), + ImmutableMap.of(), + 3000), + Estimate.of(33000)); + assertEquals( + calculateDataSizeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=__HIVE_DEFAULT_PARTITION__/p2=1234"), partition("p1=str2/p2=1234")), + ImmutableMap.of(), + 3000), + Estimate.of(12000)); + } + + @Test + public void testCalculateRangeForPartitioningKey() + { + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_1, + VARCHAR, + ImmutableList.of(partition("p1=string1/p2=1234"))), + Optional.empty()); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"))), + Optional.empty()); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"), 
partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"))), + Optional.empty()); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=__HIVE_DEFAULT_PARTITION__"), partition("p1=string1/p2=1"))), + Optional.of(new DoubleRange(1, 1))); + assertEquals( + calculateRangeForPartitioningKey( + PARTITION_COLUMN_2, + BIGINT, + ImmutableList.of(partition("p1=string1/p2=2"), partition("p1=string1/p2=1"))), + Optional.of(new DoubleRange(1, 2))); + } + + @Test + public void testConvertPartitionValueToDouble() + { + assertConvertPartitionValueToDouble(BIGINT, "123456", 123456); + assertConvertPartitionValueToDouble(INTEGER, "12345", 12345); + assertConvertPartitionValueToDouble(SMALLINT, "1234", 1234); + assertConvertPartitionValueToDouble(TINYINT, "123", 123); + assertConvertPartitionValueToDouble(DOUBLE, "0.1", 0.1); + assertConvertPartitionValueToDouble(REAL, "0.2", (double) (float) 0.2); + assertConvertPartitionValueToDouble(createDecimalType(5, 2), "123.45", 123.45); + assertConvertPartitionValueToDouble(createDecimalType(25, 5), "12345678901234567890.12345", 12345678901234567890.12345); + assertConvertPartitionValueToDouble(DATE, "1970-01-02", 1); + } + + private static void assertConvertPartitionValueToDouble(Type type, String value, double expected) + { + Object prestoValue = parsePartitionValue(format("p=%s", value), value, type).getValue(); + assertEquals(convertPartitionValueToDouble(type, prestoValue), expected); + } + + @Test + public void testCreateDataColumnStatistics() + { + assertEquals(createDataColumnStatistics(COLUMN, BIGINT, 1000, ImmutableList.of()), ColumnStatistics.empty()); + assertEquals( + createDataColumnStatistics(COLUMN, BIGINT, 1000, ImmutableList.of(PartitionStatistics.empty(), PartitionStatistics.empty())), + ColumnStatistics.empty()); + assertEquals( + createDataColumnStatistics( + COLUMN, + BIGINT, + 1000, + ImmutableList.of(new PartitionStatistics(HiveBasicStatistics.createZeroStatistics(), ImmutableMap.of("column2", HiveColumnStatistics.empty())))), + ColumnStatistics.empty()); + } + + @Test + public void testCalculateDistinctValuesCount() + { + assertEquals(calculateDistinctValuesCount(ImmutableList.of()), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.empty())), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.empty(), HiveColumnStatistics.empty())), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1))), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1), distinctValuesCount(2))), Estimate.of(2)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(distinctValuesCount(1), HiveColumnStatistics.empty())), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.empty(), OptionalLong.empty()))), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(0), OptionalLong.empty()))), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(10), OptionalLong.empty(), OptionalLong.empty()))), Estimate.unknown()); + 
assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(10), OptionalLong.of(10), OptionalLong.empty()))), Estimate.of(2)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.empty(), OptionalLong.of(10), OptionalLong.empty()))), Estimate.unknown()); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(10), OptionalLong.empty()))), Estimate.of(1)); + assertEquals(calculateDistinctValuesCount(ImmutableList.of(HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(0), OptionalLong.empty()))), Estimate.of(0)); + assertEquals( + calculateDistinctValuesCount(ImmutableList.of( + HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(0), OptionalLong.of(10), OptionalLong.empty()), + HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(10), OptionalLong.empty()))), + Estimate.of(2)); + } + + @Test + public void testCalculateNullsFraction() + { + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of()), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(PartitionStatistics.empty())), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000))), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500))), Estimate.unknown()); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCount(1000), nullsCount(500), rowsCountAndNullsCount(1000, 500))), Estimate.of(0.5)); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(2000, 200), rowsCountAndNullsCount(1000, 100))), Estimate.of(0.1)); + assertEquals(calculateNullsFraction(COLUMN, ImmutableList.of(rowsCountAndNullsCount(0, 0), rowsCountAndNullsCount(0, 0))), Estimate.of(0)); + } + + @Test + public void testCalculateDataSize() + { + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(), 0), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(PartitionStatistics.empty()), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCount(1000)), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000)), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(dataSize(1000), rowsCount(1000)), 1000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(500, 1000)), 2000), Estimate.of(4000)); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 2000), Estimate.unknown()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(0, 0)), 0), Estimate.zero()); + assertEquals(calculateDataSize(COLUMN, ImmutableList.of(rowsCountAndDataSize(1000, 0)), 2000), Estimate.of(0)); + assertEquals( + calculateDataSize( + COLUMN, + ImmutableList.of( + rowsCountAndDataSize(500, 1000), + rowsCountAndDataSize(1000, 5000)), + 5000), + Estimate.of(20000)); + assertEquals( + calculateDataSize( + COLUMN, + ImmutableList.of( + dataSize(1000), + rowsCountAndDataSize(500, 1000), + rowsCount(3000), + rowsCountAndDataSize(1000, 5000)), + 5000), + Estimate.of(20000)); + } 
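+
+    // testCalculateRange verifies that per-partition integer, double, date, and decimal min/max statistics
+    // are merged into a single DoubleRange, that integer ranges are clamped to the bounds of the declared type,
+    // and that ranges with missing or NaN endpoints are skipped.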
+ + @Test + public void testCalculateRange() + { + assertEquals(calculateRange(VARCHAR, ImmutableList.of()), Optional.empty()); + assertEquals(calculateRange(VARCHAR, ImmutableList.of(integerRange(OptionalLong.empty(), OptionalLong.empty()))), Optional.empty()); + assertEquals(calculateRange(VARCHAR, ImmutableList.of(integerRange(1, 2))), Optional.empty()); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(1, 2))), Optional.of(new DoubleRange(1, 2))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Long.MIN_VALUE, Long.MAX_VALUE))); + assertEquals(calculateRange(INTEGER, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Integer.MIN_VALUE, Integer.MAX_VALUE))); + assertEquals(calculateRange(SMALLINT, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Short.MIN_VALUE, Short.MAX_VALUE))); + assertEquals(calculateRange(TINYINT, ImmutableList.of(integerRange(Long.MIN_VALUE, Long.MAX_VALUE))), Optional.of(new DoubleRange(Byte.MIN_VALUE, Byte.MAX_VALUE))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(1, 5), integerRange(3, 7))), Optional.of(new DoubleRange(1, 7))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(OptionalLong.empty(), OptionalLong.empty()), integerRange(3, 7))), Optional.of(new DoubleRange(3, 7))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(integerRange(OptionalLong.empty(), OptionalLong.of(8)), integerRange(3, 7))), Optional.of(new DoubleRange(3, 7))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(integerRange(1, 2))), Optional.empty()); + assertEquals(calculateRange(REAL, ImmutableList.of(integerRange(1, 2))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(OptionalDouble.empty(), OptionalDouble.empty()))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(0.1, 0.2))), Optional.of(new DoubleRange(0.1, 0.2))); + assertEquals(calculateRange(BIGINT, ImmutableList.of(doubleRange(0.1, 0.2))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(0.1, 0.2), doubleRange(0.15, 0.25))), Optional.of(new DoubleRange(0.1, 0.25))); + assertEquals(calculateRange(REAL, ImmutableList.of(doubleRange(0.1, 0.2), doubleRange(0.15, 0.25))), Optional.of(new DoubleRange(0.1, 0.25))); + assertEquals(calculateRange(REAL, ImmutableList.of(doubleRange(OptionalDouble.empty(), OptionalDouble.of(0.2)), doubleRange(0.15, 0.25))), Optional.of(new DoubleRange(0.15, 0.25))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(NaN, 0.2))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(0.1, NaN))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(NaN, NaN))), Optional.empty()); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(REAL, ImmutableList.of(doubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, 
Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(DOUBLE, ImmutableList.of(doubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY), doubleRange(0.1, 0.2))), Optional.of(new DoubleRange(Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY))); + assertEquals(calculateRange(DATE, ImmutableList.of(doubleRange(0.1, 0.2))), Optional.empty()); + assertEquals(calculateRange(DATE, ImmutableList.of(dateRange("1970-01-01", "1970-01-02"))), Optional.of(new DoubleRange(0, 1))); + assertEquals(calculateRange(DATE, ImmutableList.of(dateRange(Optional.empty(), Optional.empty()))), Optional.empty()); + assertEquals(calculateRange(DATE, ImmutableList.of(dateRange(Optional.of("1970-01-01"), Optional.empty()))), Optional.empty()); + assertEquals(calculateRange(DATE, ImmutableList.of(dateRange("1970-01-01", "1970-01-05"), dateRange("1970-01-03", "1970-01-07"))), Optional.of(new DoubleRange(0, 6))); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(doubleRange(0.1, 0.2))), Optional.empty()); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(BigDecimal.valueOf(1), BigDecimal.valueOf(5)))), Optional.of(new DoubleRange(1, 5))); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(Optional.empty(), Optional.empty()))), Optional.empty()); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(Optional.of(BigDecimal.valueOf(1)), Optional.empty()))), Optional.empty()); + assertEquals(calculateRange(DECIMAL, ImmutableList.of(decimalRange(BigDecimal.valueOf(1), BigDecimal.valueOf(5)), decimalRange(BigDecimal.valueOf(3), BigDecimal.valueOf(7)))), Optional.of(new DoubleRange(1, 7))); + } + + @Test + public void testGetTableStatistics() + { + String partitionName = "p1=string1/p2=1234"; + PartitionStatistics statistics = PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty())) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300)))) + .build(); + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, schemaTableName, hivePartitions, table) -> ImmutableMap.of(partitionName, statistics)); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty()); + TableStatistics expected = TableStatistics.builder() + .setRowCount(Estimate.of(1000)) + .setColumnStatistics( + PARTITION_COLUMN_1, + ColumnStatistics.builder() + .setDataSize(Estimate.of(7000)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(1)) + .build()) + .setColumnStatistics( + PARTITION_COLUMN_2, + ColumnStatistics.builder() + .setRange(new DoubleRange(1234, 1234)) + .setNullsFraction(Estimate.of(0)) + .setDistinctValuesCount(Estimate.of(1)) + .build()) + .setColumnStatistics( + columnHandle, + ColumnStatistics.builder() + .setRange(new DoubleRange(-100, 100)) + .setNullsFraction(Estimate.of(0.5)) + .setDistinctValuesCount(Estimate.of(300)) + .build()) + .build(); + assertEquals( + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of( + "p1", PARTITION_COLUMN_1, + "p2", PARTITION_COLUMN_2, + COLUMN, columnHandle), + 
ImmutableMap.of( + "p1", VARCHAR, + "p2", BIGINT, + COLUMN, BIGINT), + ImmutableList.of(partition(partitionName)), true, table), + expected); + } + + @Test + public void testGetTableStatisticsUnpartitioned() + { + PartitionStatistics statistics = PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty())) + .setColumnStatistics(ImmutableMap.of(COLUMN, HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300)))) + .build(); + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, schemaTableName, hivePartitions, table) -> ImmutableMap.of(UNPARTITIONED_ID, statistics)); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + HiveColumnHandle columnHandle = new HiveColumnHandle(COLUMN, HIVE_LONG, BIGINT.getTypeSignature(), 2, REGULAR, Optional.empty()); + TableStatistics expected = TableStatistics.builder() + .setRowCount(Estimate.of(1000)) + .setColumnStatistics( + columnHandle, + ColumnStatistics.builder() + .setRange(new DoubleRange(-100, 100)) + .setNullsFraction(Estimate.of(0.5)) + .setDistinctValuesCount(Estimate.of(300)) + .build()) + .build(); + assertEquals( + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(COLUMN, columnHandle), + ImmutableMap.of(COLUMN, BIGINT), + ImmutableList.of(new HivePartition(TABLE)), true, table), + expected); + } + + @Test + public void testGetTableStatisticsEmpty() + { + String partitionName = "p1=string1/p2=1234"; + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, schemaTableName, hivePartitions, table) -> ImmutableMap.of(partitionName, PartitionStatistics.empty())); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties(new HiveConfig(), new OrcFileWriterConfig(), new ParquetFileWriterConfig()).getSessionProperties()); + assertEquals( + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition(partitionName)), true, table), + TableStatistics.empty()); + } + + @Test + public void testGetTableStatisticsSampling() + { + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, schemaTableName, hivePartitions, table) -> { + assertEquals(schemaTableName, TABLE); + assertEquals(hivePartitions.size(), 1); + return ImmutableMap.of(); + }); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties( + new HiveConfig().setPartitionStatisticsSampleSize(1), + new OrcFileWriterConfig(), + new ParquetFileWriterConfig()) + .getSessionProperties()); + statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition("p1=string1/p2=1234"), partition("p1=string1/p2=1235")), true, table); + } + + @Test + public void testGetTableStatisticsValidationFailure() + { + PartitionStatistics corruptedStatistics = PartitionStatistics.builder() + .setBasicStatistics(new HiveBasicStatistics(-1, 0, 0, 0)) + .build(); + String partitionName = "p1=string1/p2=1234"; + MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, schemaTableName, hivePartitions, table) -> 
ImmutableMap.of(partitionName, corruptedStatistics)); + TestingConnectorSession session = new TestingConnectorSession(new HiveSessionProperties( + new HiveConfig().setIgnoreCorruptedStatistics(false), + new OrcFileWriterConfig(), + new ParquetFileWriterConfig()) + .getSessionProperties()); + assertThatThrownBy(() -> statisticsProvider.getTableStatistics( + session, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition(partitionName)), true, table)) + .isInstanceOf(PrestoException.class) + .hasFieldOrPropertyWithValue("errorCode", HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode()); + TestingConnectorSession ignoreSession = new TestingConnectorSession(new HiveSessionProperties( + new HiveConfig().setIgnoreCorruptedStatistics(true), + new OrcFileWriterConfig(), + new ParquetFileWriterConfig()) + .getSessionProperties()); + assertEquals( + statisticsProvider.getTableStatistics( + ignoreSession, + TABLE, + ImmutableMap.of(), + ImmutableMap.of(), + ImmutableList.of(partition(partitionName)), true, table), + TableStatistics.empty()); + } + + private static void assertInvalidStatistics(PartitionStatistics partitionStatistics, String expectedMessage) + { + assertThatThrownBy(() -> validatePartitionStatistics(TABLE, ImmutableMap.of(PARTITION, partitionStatistics))) + .isInstanceOf(PrestoException.class) + .hasFieldOrPropertyWithValue("errorCode", HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode()) + .hasMessage(expectedMessage); + } + + private static String invalidPartitionStatistics(String message) + { + return format("Corrupted partition statistics (Table: %s Partition: [%s]): %s", TABLE, PARTITION, message); + } + + private static String invalidColumnStatistics(String message) + { + return format("Corrupted partition statistics (Table: %s Partition: [%s] Column: %s): %s", TABLE, PARTITION, COLUMN, message); + } + + private static HivePartition partition(String name) + { + return parsePartition(TABLE, name, ImmutableList.of(PARTITION_COLUMN_1, PARTITION_COLUMN_2), ImmutableList.of(VARCHAR, BIGINT)); + } + + private static PartitionStatistics rowsCount(long rowsCount) + { + return new PartitionStatistics(new HiveBasicStatistics(0, rowsCount, 0, 0), ImmutableMap.of()); + } + + private static PartitionStatistics nullsCount(long nullsCount) + { + return new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(nullsCount).build())); + } + + private static PartitionStatistics dataSize(long dataSize) + { + return new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setTotalSizeInBytes(dataSize).build())); + } + + private static PartitionStatistics rowsCountAndNullsCount(long rowsCount, long nullsCount) + { + return new PartitionStatistics( + new HiveBasicStatistics(0, rowsCount, 0, 0), + ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setNullsCount(nullsCount).build())); + } + + private static PartitionStatistics rowsCountAndDataSize(long rowsCount, long dataSize) + { + return new PartitionStatistics( + new HiveBasicStatistics(0, rowsCount, 0, 0), + ImmutableMap.of(COLUMN, HiveColumnStatistics.builder().setTotalSizeInBytes(dataSize).build())); + } + + private static HiveColumnStatistics distinctValuesCount(long count) + { + return HiveColumnStatistics.builder() + .setDistinctValuesCount(count) + .build(); + } + + private static HiveColumnStatistics integerRange(long min, long max) + { + return 
integerRange(OptionalLong.of(min), OptionalLong.of(max)); + } + + private static HiveColumnStatistics integerRange(OptionalLong min, OptionalLong max) + { + return HiveColumnStatistics.builder() + .setIntegerStatistics(new IntegerStatistics(min, max)) + .build(); + } + + private static HiveColumnStatistics doubleRange(double min, double max) + { + return doubleRange(OptionalDouble.of(min), OptionalDouble.of(max)); + } + + private static HiveColumnStatistics doubleRange(OptionalDouble min, OptionalDouble max) + { + return HiveColumnStatistics.builder() + .setDoubleStatistics(new DoubleStatistics(min, max)) + .build(); + } + + private static HiveColumnStatistics dateRange(String min, String max) + { + return dateRange(Optional.of(min), Optional.of(max)); + } + + private static HiveColumnStatistics dateRange(Optional min, Optional max) + { + return HiveColumnStatistics.builder() + .setDateStatistics(new DateStatistics(min.map(TestMetastoreHiveStatisticsProvider::parseDate), max.map(TestMetastoreHiveStatisticsProvider::parseDate))) + .build(); + } + + private static LocalDate parseDate(String date) + { + return LocalDate.parse(date); + } + + private static HiveColumnStatistics decimalRange(BigDecimal min, BigDecimal max) + { + return decimalRange(Optional.of(min), Optional.of(max)); + } + + private static HiveColumnStatistics decimalRange(Optional min, Optional max) + { + return HiveColumnStatistics.builder() + .setDecimalStatistics(new DecimalStatistics(min, max)) + .build(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestAsyncQueue.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestAsyncQueue.java new file mode 100644 index 00000000..d8639b40 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestAsyncQueue.java @@ -0,0 +1,256 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.ListenableFuture; +import io.airlift.concurrent.Threads; +import io.prestosql.plugin.hive.util.AsyncQueue.BorrowResult; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicBoolean; + +import static io.airlift.concurrent.MoreFutures.getFutureValue; +import static io.airlift.testing.Assertions.assertContains; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; + +public class TestAsyncQueue +{ + private ExecutorService executor; + + @BeforeClass + public void setUpClass() + { + executor = Executors.newFixedThreadPool(8, Threads.daemonThreadsNamed("test-async-queue-%s")); + } + + @AfterClass(alwaysRun = true) + public void tearDownClass() + { + executor.shutdownNow(); + } + + @Test(timeOut = 10_000) + public void testGetPartial() + throws Exception + { + AsyncQueue queue = new AsyncQueue<>(4, executor); + + queue.offer("1"); + queue.offer("2"); + queue.offer("3"); + assertEquals(queue.getBatchAsync(100).get(), ImmutableList.of("1", "2", "3")); + + queue.finish(); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testFullQueue() + throws Exception + { + AsyncQueue queue = new AsyncQueue<>(4, executor); + + assertTrue(queue.offer("1").isDone()); + assertTrue(queue.offer("2").isDone()); + assertTrue(queue.offer("3").isDone()); + + assertFalse(queue.offer("4").isDone()); + assertFalse(queue.offer("5").isDone()); + ListenableFuture offerFuture = queue.offer("6"); + assertFalse(offerFuture.isDone()); + + assertEquals(queue.getBatchAsync(2).get(), ImmutableList.of("1", "2")); + assertFalse(offerFuture.isDone()); + + assertEquals(queue.getBatchAsync(1).get(), ImmutableList.of("3")); + offerFuture.get(); + + offerFuture = queue.offer("7"); + assertFalse(offerFuture.isDone()); + + queue.finish(); + offerFuture.get(); + assertFalse(queue.isFinished()); + assertEquals(queue.getBatchAsync(4).get(), ImmutableList.of("4", "5", "6", "7")); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testEmptyQueue() + throws Exception + { + AsyncQueue queue = new AsyncQueue<>(4, executor); + + assertTrue(queue.offer("1").isDone()); + assertTrue(queue.offer("2").isDone()); + assertTrue(queue.offer("3").isDone()); + assertEquals(queue.getBatchAsync(2).get(), ImmutableList.of("1", "2")); + assertEquals(queue.getBatchAsync(2).get(), ImmutableList.of("3")); + ListenableFuture batchFuture = queue.getBatchAsync(2); + assertFalse(batchFuture.isDone()); + + assertTrue(queue.offer("4").isDone()); + assertEquals(batchFuture.get(), ImmutableList.of("4")); + + batchFuture = queue.getBatchAsync(2); + assertFalse(batchFuture.isDone()); + queue.finish(); + batchFuture.get(); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testOfferAfterFinish() + throws Exception + { + AsyncQueue queue = new AsyncQueue<>(4, executor); + + assertTrue(queue.offer("1").isDone()); + assertTrue(queue.offer("2").isDone()); + assertTrue(queue.offer("3").isDone()); + 
assertFalse(queue.offer("4").isDone()); + + queue.finish(); + assertTrue(queue.offer("5").isDone()); + assertTrue(queue.offer("6").isDone()); + assertTrue(queue.offer("7").isDone()); + assertFalse(queue.isFinished()); + + assertEquals(queue.getBatchAsync(100).get(), ImmutableList.of("1", "2", "3", "4")); + assertTrue(queue.isFinished()); + } + + @Test + public void testBorrow() + throws Exception + { + // The numbers are chosen so that depletion of elements can happen. + // Size is 5. Two threads each borrowing 3 can deplete the queue. + // The third thread may try to borrow when the queue is already empty. + // We also want to confirm that isFinished won't return true even if queue is depleted. + + AsyncQueue queue = new AsyncQueue<>(4, executor); + queue.offer(1); + queue.offer(2); + queue.offer(3); + queue.offer(4); + queue.offer(5); + + // Repeatedly remove up to 3 elements and re-insert them. + Runnable runnable = () -> { + for (int i = 0; i < 700; i++) { + getFutureValue(queue.borrowBatchAsync(3, elements -> new BorrowResult<>(elements, null))); + } + }; + + Future future1 = executor.submit(runnable); + Future future2 = executor.submit(runnable); + Future future3 = executor.submit(runnable); + future1.get(); + future2.get(); + future3.get(); + + queue.finish(); + assertFalse(queue.isFinished()); + + AtomicBoolean done = new AtomicBoolean(); + executor.submit(() -> { + while (!done.get()) { + assertFalse(queue.isFinished() || done.get()); + } + }); + + future1 = executor.submit(runnable); + future2 = executor.submit(runnable); + future3 = executor.submit(runnable); + future1.get(); + future2.get(); + future3.get(); + done.set(true); + + assertFalse(queue.isFinished()); + ArrayList list = new ArrayList<>(queue.getBatchAsync(100).get()); + list.sort(Integer::compare); + assertEquals(list, ImmutableList.of(1, 2, 3, 4, 5)); + assertTrue(queue.isFinished()); + } + + @Test + public void testBorrowThrows() + throws Exception + { + // It doesn't matter the exact behavior when the caller-supplied function to borrow fails. + // However, it must not block pending futures. + + AsyncQueue queue = new AsyncQueue<>(4, executor); + queue.offer(1); + queue.offer(2); + queue.offer(3); + queue.offer(4); + queue.offer(5); + + ListenableFuture future1 = queue.offer(6); + assertFalse(future1.isDone()); + + Runnable runnable = () -> { + getFutureValue(queue.borrowBatchAsync(1, elements -> { + throw new RuntimeException("test fail"); + })); + }; + + try { + executor.submit(runnable).get(); + fail("expected failure"); + } + catch (ExecutionException e) { + assertContains(e.getMessage(), "test fail"); + } + + ListenableFuture future2 = queue.offer(7); + assertFalse(future1.isDone()); + assertFalse(future2.isDone()); + queue.finish(); + future1.get(); + future2.get(); + assertTrue(queue.offer(8).isDone()); + + try { + executor.submit(runnable).get(); + fail("expected failure"); + } + catch (ExecutionException e) { + assertContains(e.getMessage(), "test fail"); + } + + assertTrue(queue.offer(9).isDone()); + + assertFalse(queue.isFinished()); + ArrayList list = new ArrayList<>(queue.getBatchAsync(100).get()); + // 1 and 2 were removed by borrow call; 8 and 9 were never inserted because insertion happened after finish. 
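+        // 3, 4, and 5 were never borrowed, and 6 and 7 were offered before finish(), so all five remain in the queue.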
+ assertEquals(list, ImmutableList.of(3, 4, 5, 6, 7)); + assertTrue(queue.isFinished()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestCustomSplitConversionUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestCustomSplitConversionUtils.java new file mode 100644 index 00000000..b86609fb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestCustomSplitConversionUtils.java @@ -0,0 +1,58 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import static org.testng.Assert.assertEquals; + +public class TestCustomSplitConversionUtils +{ + @Test + public void testHudiRealtimeSplitConverterRoundTrip() throws IOException + { + Path expectedPath = new Path("s3://test/path"); + long expectedStart = 1L; + long expectedLength = 2L; + String[] expectedLocations = new String[] {"one", "two"}; + String expectedBasepath = "basepath"; + List expectedDeltaLogPaths = Arrays.asList("test1", "test2", "test3"); + String expectedMaxCommitTime = "max_commit_time"; + + FileSplit baseSplit = new FileSplit(expectedPath, expectedStart, expectedLength, expectedLocations); + FileSplit hudiSplit = new HoodieRealtimeFileSplit(baseSplit, expectedBasepath, expectedDeltaLogPaths, expectedMaxCommitTime); + + // Test conversion of HudiSplit -> customSplitInfo + Map customSplitInfo = CustomSplitConversionUtils.extractCustomSplitInfo(hudiSplit); + + // Test conversion of (customSplitInfo + baseSplit) -> HudiSplit + FileSplit recreatedSplit = CustomSplitConversionUtils.recreateSplitWithCustomInfo(baseSplit, customSplitInfo); + + assertEquals(recreatedSplit.getPath(), expectedPath); + assertEquals(recreatedSplit.getStart(), expectedStart); + assertEquals(recreatedSplit.getLength(), expectedLength); + assertEquals(recreatedSplit.getLocations(), expectedLocations); + assertEquals(((HoodieRealtimeFileSplit) recreatedSplit).getBasePath(), expectedBasepath); + assertEquals(((HoodieRealtimeFileSplit) recreatedSplit).getDeltaLogPaths(), expectedDeltaLogPaths); + assertEquals(((HoodieRealtimeFileSplit) recreatedSplit).getMaxCommitTime(), expectedMaxCommitTime); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCache.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCache.java new file mode 100644 index 00000000..df7e1911 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCache.java @@ -0,0 +1,270 @@ +/* + * 
Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.units.DataSize; +import io.airlift.units.Duration; +import io.prestosql.plugin.hive.HiveColumnHandle; +import io.prestosql.plugin.hive.HiveSplit; +import io.prestosql.spi.HetuConstant; +import io.prestosql.spi.heuristicindex.Index; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.predicate.TupleDomain; +import io.prestosql.spi.predicate.ValueSet; +import io.prestosql.spi.service.PropertyService; +import io.prestosql.testing.NoOpIndexClient; +import org.mockito.internal.stubbing.answers.Returns; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; + +import static io.airlift.units.DataSize.Unit.KILOBYTE; +import static io.prestosql.spi.predicate.Range.range; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; + +public class TestIndexCache +{ + private final String catalog = "test_catalog"; + private final String table = "schema_name.table_name"; + private final long testLastModifiedTime = 1; + private final String testPath = "/user/hive/schema.db/table/001.orc"; + private final String testPath2 = "/user/hive/schema.db/table/002.orc"; + private final long loadDelay = 1000; + private final long numberOfIndexTypes = IndexCache.INDEX_TYPES.size(); + private final Domain domain = Domain.create(ValueSet.ofRanges(range(INTEGER, 0L, true, 100L, true)), false); + private final String column = "column_name"; + private TupleDomain effectivePredicate; + private List testPartitions = Collections.emptyList(); + private HiveColumnHandle testColumnHandle; + +// @BeforeClass + public void setupBeforeClass() + { + PropertyService.setProperty(HetuConstant.FILTER_ENABLED, true); + PropertyService.setProperty(HetuConstant.INDEXSTORE_FILESYSTEM_PROFILE, "local-config-default"); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_MAX_MEMORY, (long) (new DataSize(numberOfIndexTypes * 2, KILOBYTE).getValue(KILOBYTE))); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_TTL, new Duration(10, TimeUnit.MINUTES)); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_LOADING_DELAY, new Duration(loadDelay, TimeUnit.MILLISECONDS)); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_LOADING_THREADS, 2L); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_SOFT_REFERENCE, false); + + testColumnHandle = mock(HiveColumnHandle.class); + when(testColumnHandle.getName()).thenReturn(column); + effectivePredicate = 
TupleDomain.withColumnDomains(ImmutableMap.of(testColumnHandle, domain)); + } + + // TODO: test is unstable and disabled for now + // @Test + public void testIndexCacheGetIndices() throws Exception + { + HiveSplit testHiveSplit; + testHiveSplit = mock(HiveSplit.class); + when(testHiveSplit.getPath()).thenReturn(testPath); + when(testHiveSplit.getLastModifiedTime()).thenReturn(testLastModifiedTime); + + IndexMetadata indexMetadata = mock(IndexMetadata.class); + when(indexMetadata.getLastModifiedTime()).thenReturn(testLastModifiedTime); + Index index = mock(Index.class); + when(indexMetadata.getIndex()).then(new Returns(index)); + when(index.getMemoryUsage()).thenReturn(new DataSize(1, KILOBYTE).toBytes()); + + List expectedIndices = new LinkedList<>(); + expectedIndices.add(indexMetadata); + + IndexCacheLoader indexCacheLoader = mock(IndexCacheLoader.class); + when(indexCacheLoader.load(any())).then(new Returns(expectedIndices)); + + IndexCache indexCache = new IndexCache(indexCacheLoader, new NoOpIndexClient()); + List actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), 0); + Thread.sleep(loadDelay + 1000); + actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), numberOfIndexTypes); + assertEquals(actualSplitIndex.get(0), expectedIndices.get(0)); + } + + // TODO: test is unstable and disabled for now +// @Test + public void testIndexCacheThrowsExecutionException() + throws Exception + { + HiveSplit testHiveSplit; + testHiveSplit = mock(HiveSplit.class); + when(testHiveSplit.getPath()).thenReturn(testPath); + when(testHiveSplit.getLastModifiedTime()).thenReturn(testLastModifiedTime); + + IndexMetadata indexMetadata = mock(IndexMetadata.class); + when(indexMetadata.getLastModifiedTime()).thenReturn(testLastModifiedTime); + + List expectedIndices = new LinkedList<>(); + expectedIndices.add(indexMetadata); + + IndexCacheLoader indexCacheLoader = mock(IndexCacheLoader.class); + when(indexCacheLoader.load(any())).thenThrow(ExecutionException.class); + + IndexCache indexCache = new IndexCache(indexCacheLoader, loadDelay, new NoOpIndexClient()); + List actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), 0); + Thread.sleep(loadDelay + 500); + actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), 0); + } + + // TODO: test is unstable and disabled for now + // @Test + public void testExpiredCacheIndices() throws Exception + { + HiveSplit testHiveSplit; + testHiveSplit = mock(HiveSplit.class); + when(testHiveSplit.getPath()).thenReturn(testPath); + when(testHiveSplit.getLastModifiedTime()).thenReturn(testLastModifiedTime); + + IndexMetadata indexMetadata = mock(IndexMetadata.class); + when(indexMetadata.getLastModifiedTime()).thenReturn(testLastModifiedTime); + Index index = mock(Index.class); + when(indexMetadata.getIndex()).then(new Returns(index)); + when(index.getMemoryUsage()).thenReturn(new DataSize(1, KILOBYTE).toBytes()); + + List expectedIndices = new LinkedList<>(); + expectedIndices.add(indexMetadata); + + IndexCacheLoader indexCacheLoader = mock(IndexCacheLoader.class); + when(indexCacheLoader.load(any())).then(new Returns(expectedIndices)); + + IndexCache indexCache = new IndexCache(indexCacheLoader, loadDelay, 
new NoOpIndexClient());
+        List<IndexMetadata> actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions);
+        assertEquals(actualSplitIndex.size(), 0);
+        Thread.sleep(loadDelay + 500);
+        actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions);
+        assertEquals(actualSplitIndex.size(), numberOfIndexTypes);
+
+        // now the index is in the cache, but changing the last modified date of the split should invalidate it
+        when(testHiveSplit.getLastModifiedTime()).thenReturn(testLastModifiedTime + 1);
+        actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions);
+        assertEquals(actualSplitIndex.size(), 0);
+    }
+
+    // TODO: test is unstable and disabled for now
+    // @Test
+    public void testIndexCacheWithPartitions()
+            throws Exception
+    {
+        HiveSplit testHiveSplit;
+        testHiveSplit = mock(HiveSplit.class);
+        when(testHiveSplit.getPath()).thenReturn(testPath);
+        when(testHiveSplit.getLastModifiedTime()).thenReturn(testLastModifiedTime);
+
+        HiveColumnHandle partitionColumnHandle;
+        TupleDomain<HiveColumnHandle> effectivePredicateForPartition;
+        partitionColumnHandle = mock(HiveColumnHandle.class);
+        // the partition column should be filtered out, so this should never get called
+        when(partitionColumnHandle.getName()).thenThrow(Exception.class);
+        effectivePredicateForPartition = TupleDomain.withColumnDomains(ImmutableMap.of(testColumnHandle, domain,
+                partitionColumnHandle, domain));
+        List<HiveColumnHandle> partitionColumns = ImmutableList.of(partitionColumnHandle);
+
+        IndexMetadata indexMetadata = mock(IndexMetadata.class);
+        when(indexMetadata.getLastModifiedTime()).thenReturn(testLastModifiedTime);
+
+        Index index = mock(Index.class);
+        when(indexMetadata.getIndex()).then(new Returns(index));
+        when(index.getMemoryUsage()).thenReturn(new DataSize(1, KILOBYTE).toBytes());
+
+        List<IndexMetadata> expectedIndices = new LinkedList<>();
+        expectedIndices.add(indexMetadata);
+
+        IndexCacheLoader indexCacheLoader = mock(IndexCacheLoader.class);
+        when(indexCacheLoader.load(any())).then(new Returns(expectedIndices));
+
+        IndexCache indexCache = new IndexCache(indexCacheLoader, loadDelay, new NoOpIndexClient());
+        List<IndexMetadata> actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicateForPartition, partitionColumns);
+        assertEquals(actualSplitIndex.size(), 0);
+        Thread.sleep(loadDelay + 500);
+        actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicateForPartition, partitionColumns);
+
+        assertEquals(actualSplitIndex.size(), numberOfIndexTypes);
+    }
+
+    // TODO: test is unstable and disabled for now
+    // @Test
+    public void testIndexCacheEviction() throws Exception
+    {
+        HiveSplit testHiveSplit;
+        testHiveSplit = mock(HiveSplit.class);
+        when(testHiveSplit.getPath()).thenReturn(testPath);
+        when(testHiveSplit.getLastModifiedTime()).thenReturn(testLastModifiedTime);
+
+        IndexCacheLoader indexCacheLoader = mock(IndexCacheLoader.class);
+        IndexCache indexCache = new IndexCache(indexCacheLoader, loadDelay, new NoOpIndexClient());
+
+        // get index for split1
+        IndexMetadata indexMetadata1 = mock(IndexMetadata.class);
+        when(indexMetadata1.getLastModifiedTime()).thenReturn(testLastModifiedTime);
+        Index index1 = mock(Index.class);
+        when(indexMetadata1.getIndex()).thenReturn(index1);
+        when(index1.getMemoryUsage()).thenReturn(new DataSize(2, KILOBYTE).toBytes());
+
+        List<IndexMetadata> expectedIndices1 = new LinkedList<>();
+        expectedIndices1.add(indexMetadata1);
+
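+        // descriptive note: the loader stubbed below always resolves to this single 2 KB index, so one split's
+        // indices exactly fill the cache weight limit described in the comment that follows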
when(indexCacheLoader.load(any())).then(new Returns(expectedIndices1)); + + // each index is has memory usage of 2, and limit is 2*types of idx, so all should be loaded + List actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), 0); + Thread.sleep(loadDelay + 500); + actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), numberOfIndexTypes); + assertEquals(actualSplitIndex.get(0), indexMetadata1); + assertEquals(indexCache.getCacheSize(), numberOfIndexTypes); + + // get index for split2 + when(testHiveSplit.getPath()).thenReturn(testPath2); + IndexMetadata indexMetadata2 = mock(IndexMetadata.class); + when(indexMetadata2.getLastModifiedTime()).thenReturn(testLastModifiedTime); + Index index2 = mock(Index.class); + when(indexMetadata2.getIndex()).thenReturn(index2); + when(index2.getMemoryUsage()).thenReturn(new DataSize(2, KILOBYTE).toBytes()); + + // previous indexes should be evicted bc cache was at max weight limit and new ones should be added + List expectedIndices2 = new LinkedList<>(); + expectedIndices2.add(indexMetadata2); + when(indexCacheLoader.load(any())).then(new Returns(expectedIndices2)); + + actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), 0); + assertEquals(indexCache.getCacheSize(), numberOfIndexTypes); + Thread.sleep(loadDelay + 500); + actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), numberOfIndexTypes); + assertEquals(actualSplitIndex.get(0), indexMetadata2); + assertEquals(indexCache.getCacheSize(), numberOfIndexTypes); + + // get index for split1 + when(testHiveSplit.getPath()).thenReturn(testPath); + actualSplitIndex = indexCache.getIndices(catalog, table, testHiveSplit, effectivePredicate, testPartitions); + assertEquals(actualSplitIndex.size(), 0); + assertEquals(indexCache.getCacheSize(), numberOfIndexTypes); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCacheLoader.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCacheLoader.java new file mode 100644 index 00000000..8526eb4f --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestIndexCacheLoader.java @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import io.airlift.units.Duration; +import io.prestosql.spi.HetuConstant; +import io.prestosql.spi.heuristicindex.IndexCacheKey; +import io.prestosql.spi.heuristicindex.IndexClient; +import io.prestosql.spi.heuristicindex.IndexMetadata; +import io.prestosql.spi.service.PropertyService; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; + +public class TestIndexCacheLoader +{ + private static final String TABLE = "test_table"; + private static final String COLUMN = "test_column"; + + @BeforeTest + private void setProperties() + { + PropertyService.setProperty(HetuConstant.FILTER_ENABLED, true); + PropertyService.setProperty(HetuConstant.INDEXSTORE_FILESYSTEM_PROFILE, "local-config-default"); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_TTL, new Duration(10, TimeUnit.MINUTES)); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_LOADING_DELAY, new Duration(5000, TimeUnit.MILLISECONDS)); + PropertyService.setProperty(HetuConstant.FILTER_CACHE_LOADING_THREADS, 2L); + } + + @Test(expectedExceptions = Exception.class) + public void testNoLastModifiedTime() throws Exception + { + IndexClient indexclient = mock(IndexClient.class); + IndexCacheLoader indexCacheLoader = new IndexCacheLoader(indexclient); + + IndexCacheKey indexCacheKey = new IndexCacheKey("/path/to/split", 1); + + // throw exception to produce "no last modified time file found" behaviour + when(indexclient.getLastModifiedTime((indexCacheKey.getPath()))).thenThrow(Exception.class); + + indexCacheLoader.load(indexCacheKey); + } + + @Test(expectedExceptions = Exception.class) + public void testNoMatchingLastModifiedTime() throws Exception + { + IndexClient indexclient = mock(IndexClient.class); + IndexCacheLoader indexCacheLoader = new IndexCacheLoader(indexclient); + + IndexCacheKey indexCacheKey = new IndexCacheKey("/path/to/split", 1L); + + // return different last modified time to simulate expired index + when(indexclient.getLastModifiedTime((indexCacheKey.getPath()))).thenReturn(2L); + + indexCacheLoader.load(indexCacheKey); + } + + @Test(expectedExceptions = Exception.class) + public void testNoValidIndexFilesFoundException() throws Exception + { + IndexClient indexclient = mock(IndexClient.class); + IndexCacheLoader indexCacheLoader = new IndexCacheLoader(indexclient); + + long lastModifiedTime = 1L; + IndexCacheKey indexCacheKey = new IndexCacheKey("/path/to/split", lastModifiedTime); + when(indexclient.getLastModifiedTime((indexCacheKey.getPath()))).thenReturn(lastModifiedTime); + when(indexclient.readSplitIndex((indexCacheKey.getPath()))).thenThrow(Exception.class); + + indexCacheLoader.load(indexCacheKey); + } + + @Test(expectedExceptions = Exception.class) + public void testNoValidIndexFilesFound() throws Exception + { + IndexClient indexclient = mock(IndexClient.class); + IndexCacheLoader indexCacheLoader = new IndexCacheLoader(indexclient); + + long lastModifiedTime = 1L; + IndexCacheKey indexCacheKey = new IndexCacheKey("/path/to/split", lastModifiedTime); + when(indexclient.getLastModifiedTime((indexCacheKey.getPath()))).thenReturn(lastModifiedTime); + when(indexclient.readSplitIndex((indexCacheKey.getPath()))).thenReturn(Collections.emptyList()); + + 
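+        // an empty list of split indices must not be cached; load() is expected to fail here
+        // (hence expectedExceptions = Exception.class on this test)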
indexCacheLoader.load(indexCacheKey); + } + + @Test + public void testIndexFound() throws Exception + { + IndexClient indexclient = mock(IndexClient.class); + IndexCacheLoader indexCacheLoader = new IndexCacheLoader(indexclient); + + List expectedSplitIndexes = new LinkedList<>(); + expectedSplitIndexes.add(mock(IndexMetadata.class)); + + long lastModifiedTime = 1L; + IndexCacheKey indexCacheKey = new IndexCacheKey("/path/to/split", lastModifiedTime); + when(indexclient.getLastModifiedTime((indexCacheKey.getPath()))).thenReturn(lastModifiedTime); + when(indexclient.readSplitIndex((indexCacheKey.getPath()))).thenReturn(expectedSplitIndexes); + + List actualSplitIndexes = indexCacheLoader.load(indexCacheKey); + assertEquals(expectedSplitIndexes.size(), actualSplitIndexes.size()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLazyMap.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLazyMap.java new file mode 100644 index 00000000..bf9d98a8 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLazyMap.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef; +import org.apache.hadoop.hive.serde2.lazy.LazyMap; +import org.apache.hadoop.hive.serde2.lazy.LazyString; +import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector; +import org.apache.hadoop.io.Text; +import org.testng.annotations.Test; + +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import static org.apache.hadoop.hive.serde2.lazy.LazyFactory.createLazyObject; +import static org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyObjectInspectorFactory.getLazySimpleMapObjectInspector; +import static org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector; +import static org.testng.Assert.assertEquals; + +public class TestLazyMap +{ + private static final LazyStringObjectInspector LAZY_STRING_OBJECT_INSPECTOR = getLazyStringObjectInspector(false, (byte) 0); + + @Test + public void test() + { + assertMapDecode("\\N\u0003ignored", ImmutableMap.of()); + assertMapDecode("\\N\u0003ignored\u0002alice\u0003apple", ImmutableMap.of(lazyString("alice"), lazyString("apple"))); + assertMapDecode("alice\u0003apple\u0002\\N\u0003ignored", ImmutableMap.of(lazyString("alice"), lazyString("apple"))); + assertMapDecode("alice\u0003apple\u0002\\N\u0003ignored\u0002bob\u0003banana", + ImmutableMap.of(lazyString("alice"), lazyString("apple"), lazyString("bob"), lazyString("banana"))); + assertMapDecode("\\N\u0003ignored\u0002\u0003", ImmutableMap.of(lazyString(""), lazyString(""))); + + HashMap expectedMap = new HashMap<>(); + expectedMap.put("null", null); + assertMapDecode("\\N\u0003ignored\u0002null\u0003\\N", expectedMap); + } + + public static void assertMapDecode(String encodedMap, Map expectedMap) + { + LazyMap lazyMap = (LazyMap) createLazyObject(getLazySimpleMapObjectInspector( + LAZY_STRING_OBJECT_INSPECTOR, + getLazyStringObjectInspector(false, (byte) 0), + (byte) 2, + (byte) 3, + new Text("\\N"), + false, + (byte) 0)); + + lazyMap.init(newByteArrayRef(encodedMap), 0, encodedMap.length()); + + Map map = lazyMap.getMap(); + assertEquals(map, expectedMap); + } + + private static LazyString lazyString(String string) + { + LazyString lazyString = new LazyString(LAZY_STRING_OBJECT_INSPECTOR); + lazyString.init(newByteArrayRef(string), 0, string.length()); + return lazyString; + } + + public static ByteArrayRef newByteArrayRef(String encodedMap) + { + ByteArrayRef bytes = new ByteArrayRef(); + bytes.setData(encodedMap.getBytes(StandardCharsets.US_ASCII)); + return bytes; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLoggingInvocationHandler.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLoggingInvocationHandler.java new file mode 100644 index 00000000..f5f48eea --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestLoggingInvocationHandler.java @@ -0,0 +1,93 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore; +import org.testng.annotations.Test; + +import java.lang.reflect.InvocationHandler; +import java.util.ArrayList; +import java.util.List; + +import static com.google.common.reflect.Reflection.newProxy; +import static java.lang.reflect.Proxy.newProxyInstance; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +public class TestLoggingInvocationHandler +{ + private static final String DURATION_PATTERN = "\\d+(\\.\\d+)?\\w{1,2}"; + + @Test + public void testLoggingAndExceptions() + { + SomeInterface delegate = new SomeInterface() + { + @Override + public void run(boolean ok, String s) + { + if (!ok) { + throw new ArrayStoreException(s); + } + } + }; + List messages = new ArrayList<>(); + InvocationHandler handler = new LoggingInvocationHandler(delegate, new LoggingInvocationHandler.ReflectiveParameterNamesProvider(), messages::add); + SomeInterface proxy = newProxy(SomeInterface.class, handler); + + proxy.run(true, "xyz"); + + assertThatThrownBy(() -> proxy.run(false, "bad")) + .isInstanceOf(ArrayStoreException.class) + .hasMessage("bad"); + + assertThat(messages) + .hasSize(2) + .satisfies(list -> { + assertThat(list.get(0)).matches("\\QInvocation of run(ok=true, s='xyz') succeeded in\\E " + DURATION_PATTERN); + assertThat(list.get(1)).matches("\\QInvocation of run(ok=false, s='bad') took\\E " + DURATION_PATTERN + + " \\Qand failed with java.lang.ArrayStoreException: bad\\E"); + }); + } + + @Test + public void testWithThriftHiveMetastoreClient() + throws Exception + { + List messages = new ArrayList<>(); + // LoggingInvocationHandler is used e.g. with ThriftHiveMetastore.Iface. Since the logging is reflection-based, + // we test it with this interface as well. 
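+        // With parameter names resolved via AirliftParameterNamesProvider, the recorded message is expected to
+        // look roughly like the following (the duration suffix varies per run, hence the regex match below):
+        //   Invocation of get_table(dbname='some_database', tbl_name='some_table_name') succeeded in 1.23ms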
+ ThriftHiveMetastore.Iface proxy = newProxy(ThriftHiveMetastore.Iface.class, new LoggingInvocationHandler( + dummyThriftHiveMetastoreClient(), + new LoggingInvocationHandler.AirliftParameterNamesProvider(ThriftHiveMetastore.Iface.class, ThriftHiveMetastore.Client.class), + messages::add)); + proxy.get_table("some_database", "some_table_name"); + assertThat(messages) + .hasSize(1) + .element(0).matches(message -> message.matches("\\QInvocation of get_table(dbname='some_database', tbl_name='some_table_name') succeeded in\\E " + DURATION_PATTERN)); + } + + private static ThriftHiveMetastore.Iface dummyThriftHiveMetastoreClient() + { + return (ThriftHiveMetastore.Iface) newProxyInstance( + TestLoggingInvocationHandler.class.getClassLoader(), + new Class[] {ThriftHiveMetastore.Iface.class}, + (proxy, method, args) -> null); + } + + private interface SomeInterface + { + default void run(boolean ok, String s) {} + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestMergingPageIterator.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestMergingPageIterator.java new file mode 100644 index 00000000..ee0f197a --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestMergingPageIterator.java @@ -0,0 +1,101 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import io.prestosql.spi.Page; +import io.prestosql.spi.PageBuilder; +import io.prestosql.spi.block.SortOrder; +import io.prestosql.spi.type.Type; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; +import java.util.stream.IntStream; + +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static java.util.Comparator.naturalOrder; +import static java.util.Comparator.nullsFirst; +import static java.util.stream.Collectors.toList; +import static org.assertj.core.api.Assertions.assertThat; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestMergingPageIterator +{ + @Test + public void testMerging() + { + List types = ImmutableList.of(INTEGER, INTEGER); + List sortIndexes = ImmutableList.of(1); + List sortOrders = ImmutableList.of(SortOrder.ASC_NULLS_FIRST); + + List> pageLists = new ArrayList<>(); + PageBuilder pageBuilder = new PageBuilder(types); + + for (int i = 0; i < 10; i++) { + Iterator values = IntStream.range(0, 1000) + .map(ignored -> ThreadLocalRandom.current().nextInt(100_000)) + .mapToObj(n -> ((n % 100) == 0) ? 
null : n) + .sorted(nullsFirst(naturalOrder())) + .iterator(); + List pages = new ArrayList<>(); + for (int j = 0; j < 10; j++) { + for (int k = 0; k < 100; k++) { + Integer n = values.next(); + pageBuilder.declarePosition(); + if (n == null) { + pageBuilder.getBlockBuilder(0).appendNull(); + pageBuilder.getBlockBuilder(1).appendNull(); + } + else { + INTEGER.writeLong(pageBuilder.getBlockBuilder(0), n); + INTEGER.writeLong(pageBuilder.getBlockBuilder(1), n * 22L); + } + } + pages.add(pageBuilder.build()); + pageBuilder.reset(); + } + pageLists.add(pages); + assertFalse(values.hasNext()); + } + + List> pages = pageLists.stream() + .map(List::iterator) + .collect(toList()); + Iterator iterator = new MergingPageIterator(pages, types, sortIndexes, sortOrders); + + List values = new ArrayList<>(); + while (iterator.hasNext()) { + Page page = iterator.next(); + for (int i = 0; i < page.getPositionCount(); i++) { + if (page.getBlock(0).isNull(i)) { + assertTrue(page.getBlock(1).isNull(i)); + values.add(null); + } + else { + long x = INTEGER.getLong(page.getBlock(0), i); + long y = INTEGER.getLong(page.getBlock(1), i); + assertEquals(y, x * 22); + values.add(x); + } + } + } + + assertThat(values).isSortedAccordingTo(nullsFirst(naturalOrder())); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestSerDeUtils.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestSerDeUtils.java new file mode 100644 index 00000000..a2edf81b --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestSerDeUtils.java @@ -0,0 +1,334 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.reflect.TypeToken; +import io.airlift.slice.DynamicSliceOutput; +import io.airlift.slice.Slice; +import io.airlift.slice.SliceOutput; +import io.airlift.slice.Slices; +import io.hetu.core.transport.block.BlockSerdeUtil; +import io.prestosql.plugin.hive.HiveTestUtils; +import io.prestosql.spi.block.Block; +import io.prestosql.spi.block.BlockBuilder; +import io.prestosql.spi.block.BlockEncodingSerde; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.RowType; +import io.prestosql.tests.StructuralTestUtil; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.io.BytesWritable; +import org.joda.time.DateTime; +import org.testng.annotations.Test; + +import java.lang.reflect.Type; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import static io.airlift.slice.Slices.utf8Slice; +import static io.prestosql.metadata.MetadataManager.createTestMetadataManager; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarbinaryType.VARBINARY; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static io.prestosql.spi.type.VarcharType.createUnboundedVarcharType; +import static io.prestosql.tests.StructuralTestUtil.arrayBlockOf; +import static io.prestosql.tests.StructuralTestUtil.mapBlockOf; +import static io.prestosql.tests.StructuralTestUtil.rowBlockOf; +import static java.lang.Double.doubleToLongBits; +import static java.lang.Float.floatToRawIntBits; +import static java.lang.Math.toIntExact; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; +import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getReflectionObjectInspector; +import static org.testng.Assert.assertEquals; + +@SuppressWarnings("PackageVisibleField") +public class TestSerDeUtils +{ + private final BlockEncodingSerde blockEncodingSerde = createTestMetadataManager().getFunctionAndTypeManager().getBlockEncodingSerde(); + + private static class ListHolder + { + List array; + } + + private static class InnerStruct + { + public InnerStruct(Integer intVal, Long longVal) + { + this.intVal = intVal; + this.longVal = longVal; + } + + Integer intVal; + Long longVal; + } + + private static class OuterStruct + { + Byte byteVal; + Short shortVal; + Integer intVal; + Long longVal; + Float floatVal; + Double doubleVal; + String stringVal; + byte[] byteArray; + List structArray; + Map map; + InnerStruct innerStruct; + } + + private static synchronized ObjectInspector getInspector(Type type) + { + // ObjectInspectorFactory.getReflectionObjectInspector is not thread-safe although it + // gives people a first impression that it is. 
This may have been fixed in HIVE-11586. + + // Presto only uses getReflectionObjectInspector here, in a test method. Therefore, we + // choose to work around this issue by synchronizing this method. Before synchronizing + // this method, test in this class fails approximately 1 out of 10 runs on Travis. + + return getReflectionObjectInspector(type, ObjectInspectorOptions.JAVA); + } + + @Test + public void testPrimitiveSlice() + { + // boolean + Block expectedBoolean = VARBINARY.createBlockBuilder(null, 1).writeByte(1).closeEntry().build(); + Block actualBoolean = toBinaryBlock(BOOLEAN, true, getInspector(Boolean.class)); + assertBlockEquals(actualBoolean, expectedBoolean); + + // byte + Block expectedByte = VARBINARY.createBlockBuilder(null, 1).writeByte(5).closeEntry().build(); + Block actualByte = toBinaryBlock(TINYINT, (byte) 5, getInspector(Byte.class)); + assertBlockEquals(actualByte, expectedByte); + + // short + Block expectedShort = VARBINARY.createBlockBuilder(null, 1).writeShort(2).closeEntry().build(); + Block actualShort = toBinaryBlock(SMALLINT, (short) 2, getInspector(Short.class)); + assertBlockEquals(actualShort, expectedShort); + + // int + Block expectedInt = VARBINARY.createBlockBuilder(null, 1).writeInt(1).closeEntry().build(); + Block actualInt = toBinaryBlock(INTEGER, 1, getInspector(Integer.class)); + assertBlockEquals(actualInt, expectedInt); + + // long + Block expectedLong = VARBINARY.createBlockBuilder(null, 1).writeLong(10).closeEntry().build(); + Block actualLong = toBinaryBlock(BIGINT, 10L, getInspector(Long.class)); + assertBlockEquals(actualLong, expectedLong); + + // float + Block expectedFloat = VARBINARY.createBlockBuilder(null, 1).writeInt(floatToRawIntBits(20.0f)).closeEntry().build(); + Block actualFloat = toBinaryBlock(REAL, 20.0f, getInspector(Float.class)); + assertBlockEquals(actualFloat, expectedFloat); + + // double + Block expectedDouble = VARBINARY.createBlockBuilder(null, 1).writeLong(doubleToLongBits(30.12)).closeEntry().build(); + Block actualDouble = toBinaryBlock(DOUBLE, 30.12d, getInspector(Double.class)); + assertBlockEquals(actualDouble, expectedDouble); + + // string + Block expectedString = VARBINARY.createBlockBuilder(null, 1).writeBytes(utf8Slice("abdd"), 0, 4).closeEntry().build(); + Block actualString = toBinaryBlock(createUnboundedVarcharType(), "abdd", getInspector(String.class)); + assertBlockEquals(actualString, expectedString); + + // date + int date = toIntExact(LocalDate.of(2008, 10, 28).toEpochDay()); + Block expectedDate = VARBINARY.createBlockBuilder(null, 1).writeInt(date).closeEntry().build(); + Block actualDate = toBinaryBlock(BIGINT, Date.ofEpochDay(date), getInspector(Date.class)); + assertBlockEquals(actualDate, expectedDate); + + // timestamp + DateTime dateTime = new DateTime(2008, 10, 28, 16, 7, 15, 0); + Block expectedTimestamp = VARBINARY.createBlockBuilder(null, 1).writeLong(dateTime.getMillis()).closeEntry().build(); + Block actualTimestamp = toBinaryBlock(BIGINT, Timestamp.ofEpochMilli(dateTime.getMillis()), getInspector(Timestamp.class)); + assertBlockEquals(actualTimestamp, expectedTimestamp); + + // binary + byte[] byteArray = {81, 82, 84, 85}; + Block expectedBinary = VARBINARY.createBlockBuilder(null, 1).writeBytes(Slices.wrappedBuffer(byteArray), 0, 4).closeEntry().build(); + Block actualBinary = toBinaryBlock(createUnboundedVarcharType(), byteArray, getInspector(byte[].class)); + assertBlockEquals(actualBinary, expectedBinary); + } + + @Test + public void testListBlock() + { + List array = new 
ArrayList<>(2); + array.add(new InnerStruct(8, 9L)); + array.add(new InnerStruct(10, 11L)); + ListHolder listHolder = new ListHolder(); + listHolder.array = array; + + io.prestosql.spi.type.Type rowType = RowType.anonymous(ImmutableList.of(INTEGER, BIGINT)); + io.prestosql.spi.type.Type arrayOfRowType = RowType.anonymous(ImmutableList.of(new ArrayType(rowType))); + Block actual = toBinaryBlock(arrayOfRowType, listHolder, getInspector(ListHolder.class)); + BlockBuilder blockBuilder = rowType.createBlockBuilder(null, 1024); + rowType.writeObject(blockBuilder, rowBlockOf(ImmutableList.of(INTEGER, BIGINT), 8, 9L)); + rowType.writeObject(blockBuilder, rowBlockOf(ImmutableList.of(INTEGER, BIGINT), 10, 11L)); + Block expected = rowBlockOf(ImmutableList.of(new ArrayType(rowType)), blockBuilder.build()); + + assertBlockEquals(actual, expected); + } + + private static class MapHolder + { + Map map; + } + + @Test + public void testMapBlock() + { + MapHolder holder = new MapHolder(); + holder.map = new TreeMap<>(); + holder.map.put("twelve", new InnerStruct(13, 14L)); + holder.map.put("fifteen", new InnerStruct(16, 17L)); + + RowType rowType = RowType.anonymous(ImmutableList.of(INTEGER, BIGINT)); + RowType rowOfMapOfVarcharRowType = RowType.anonymous(ImmutableList.of(HiveTestUtils.mapType(VARCHAR, rowType))); + Block actual = toBinaryBlock(rowOfMapOfVarcharRowType, holder, getInspector(MapHolder.class)); + + Block mapBlock = mapBlockOf( + VARCHAR, + rowType, + new Object[] {utf8Slice("fifteen"), utf8Slice("twelve")}, + new Object[] {rowBlockOf(rowType.getTypeParameters(), 16, 17L), rowBlockOf(rowType.getTypeParameters(), 13, 14L)}); + Block expected = StructuralTestUtil.rowBlockOf(ImmutableList.of(HiveTestUtils.mapType(VARCHAR, rowType)), mapBlock); + + assertBlockEquals(actual, expected); + } + + @Test + public void testStructBlock() + { + // test simple structs + InnerStruct innerStruct = new InnerStruct(13, 14L); + + io.prestosql.spi.type.Type rowType = RowType.anonymous(ImmutableList.of(INTEGER, BIGINT)); + Block actual = toBinaryBlock(rowType, innerStruct, getInspector(InnerStruct.class)); + + Block expected = rowBlockOf(ImmutableList.of(INTEGER, BIGINT), 13, 14L); + assertBlockEquals(actual, expected); + + // test complex structs + OuterStruct outerStruct = new OuterStruct(); + outerStruct.byteVal = (byte) 1; + outerStruct.shortVal = (short) 2; + outerStruct.intVal = 3; + outerStruct.longVal = 4L; + outerStruct.floatVal = 5.01f; + outerStruct.doubleVal = 6.001d; + outerStruct.stringVal = "seven"; + outerStruct.byteArray = new byte[] {'2'}; + InnerStruct is1 = new InnerStruct(2, -5L); + InnerStruct is2 = new InnerStruct(-10, 0L); + outerStruct.structArray = new ArrayList<>(2); + outerStruct.structArray.add(is1); + outerStruct.structArray.add(is2); + outerStruct.map = new TreeMap<>(); + outerStruct.map.put("twelve", new InnerStruct(0, 5L)); + outerStruct.map.put("fifteen", new InnerStruct(-5, -10L)); + outerStruct.innerStruct = new InnerStruct(18, 19L); + + io.prestosql.spi.type.Type innerRowType = RowType.anonymous(ImmutableList.of(INTEGER, BIGINT)); + io.prestosql.spi.type.Type arrayOfInnerRowType = new ArrayType(innerRowType); + io.prestosql.spi.type.Type mapOfInnerRowType = HiveTestUtils.mapType(createUnboundedVarcharType(), innerRowType); + List outerRowParameterTypes = ImmutableList.of(TINYINT, SMALLINT, INTEGER, BIGINT, REAL, DOUBLE, createUnboundedVarcharType(), createUnboundedVarcharType(), arrayOfInnerRowType, mapOfInnerRowType, innerRowType); + io.prestosql.spi.type.Type outerRowType = 
RowType.anonymous(outerRowParameterTypes); + + actual = toBinaryBlock(outerRowType, outerStruct, getInspector(OuterStruct.class)); + + ImmutableList.Builder outerRowValues = ImmutableList.builder(); + outerRowValues.add((byte) 1); + outerRowValues.add((short) 2); + outerRowValues.add(3); + outerRowValues.add(4L); + outerRowValues.add(5.01f); + outerRowValues.add(6.001d); + outerRowValues.add("seven"); + outerRowValues.add(new byte[] {'2'}); + outerRowValues.add(arrayBlockOf(innerRowType, rowBlockOf(innerRowType.getTypeParameters(), 2, -5L), rowBlockOf(ImmutableList.of(INTEGER, BIGINT), -10, 0L))); + outerRowValues.add(mapBlockOf( + VARCHAR, + innerRowType, + new Object[] {utf8Slice("fifteen"), utf8Slice("twelve")}, + new Object[] {rowBlockOf(innerRowType.getTypeParameters(), -5, -10L), rowBlockOf(innerRowType.getTypeParameters(), 0, 5L)})); + outerRowValues.add(rowBlockOf(ImmutableList.of(INTEGER, BIGINT), 18, 19L)); + + assertBlockEquals(actual, rowBlockOf(outerRowParameterTypes, outerRowValues.build().toArray())); + } + + @Test + public void testReuse() + { + BytesWritable value = new BytesWritable(); + + byte[] first = "hello world".getBytes(UTF_8); + value.set(first, 0, first.length); + + byte[] second = "bye".getBytes(UTF_8); + value.set(second, 0, second.length); + + Type type = new TypeToken>() {}.getType(); + ObjectInspector inspector = getInspector(type); + + Block actual = SerDeUtils.getBlockObject(HiveTestUtils.mapType(createUnboundedVarcharType(), BIGINT), ImmutableMap.of(value, 0L), inspector); + Block expected = mapBlockOf(createUnboundedVarcharType(), BIGINT, "bye", 0L); + + assertBlockEquals(actual, expected); + } + + private void assertBlockEquals(Block actual, Block expected) + { + assertEquals(blockToSlice(actual), blockToSlice(expected)); + } + + private Slice blockToSlice(Block block) + { + // This function is strictly for testing use only + SliceOutput sliceOutput = new DynamicSliceOutput(1000); + BlockSerdeUtil.writeBlock(blockEncodingSerde, sliceOutput, block); + return sliceOutput.slice(); + } + + private static Block toBinaryBlock(io.prestosql.spi.type.Type type, Object object, ObjectInspector inspector) + { + if (inspector.getCategory() == Category.PRIMITIVE) { + return getPrimitiveBlock(type, object, inspector); + } + return SerDeUtils.getBlockObject(type, object, inspector); + } + + private static Block getPrimitiveBlock(io.prestosql.spi.type.Type type, Object object, ObjectInspector inspector) + { + BlockBuilder builder = VARBINARY.createBlockBuilder(null, 1); + SerDeUtils.serializeObject(type, builder, object, inspector); + return builder.build(); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestStatistics.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestStatistics.java new file mode 100644 index 00000000..794beea5 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestStatistics.java @@ -0,0 +1,238 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableMap; +import io.prestosql.plugin.hive.HiveBasicStatistics; +import io.prestosql.plugin.hive.metastore.BooleanStatistics; +import io.prestosql.plugin.hive.metastore.DateStatistics; +import io.prestosql.plugin.hive.metastore.DecimalStatistics; +import io.prestosql.plugin.hive.metastore.DoubleStatistics; +import io.prestosql.plugin.hive.metastore.HiveColumnStatistics; +import io.prestosql.plugin.hive.metastore.IntegerStatistics; +import org.testng.annotations.Test; + +import java.math.BigDecimal; +import java.time.LocalDate; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalDouble; +import java.util.OptionalLong; + +import static io.prestosql.plugin.hive.HiveBasicStatistics.createEmptyStatistics; +import static io.prestosql.plugin.hive.HiveBasicStatistics.createZeroStatistics; +import static io.prestosql.plugin.hive.util.Statistics.ReduceOperator.ADD; +import static io.prestosql.plugin.hive.util.Statistics.ReduceOperator.SUBTRACT; +import static io.prestosql.plugin.hive.util.Statistics.merge; +import static io.prestosql.plugin.hive.util.Statistics.reduce; +import static org.assertj.core.api.Assertions.assertThat; + +public class TestStatistics +{ + @Test + public void testReduce() + { + assertThat(reduce(createEmptyStatistics(), createEmptyStatistics(), ADD)).isEqualTo(createEmptyStatistics()); + assertThat(reduce(createZeroStatistics(), createEmptyStatistics(), ADD)).isEqualTo(createEmptyStatistics()); + assertThat(reduce(createEmptyStatistics(), createZeroStatistics(), ADD)).isEqualTo(createEmptyStatistics()); + assertThat(reduce(createEmptyStatistics(), createEmptyStatistics(), SUBTRACT)).isEqualTo(createEmptyStatistics()); + assertThat(reduce(createZeroStatistics(), createEmptyStatistics(), SUBTRACT)).isEqualTo(createEmptyStatistics()); + assertThat(reduce(createEmptyStatistics(), createZeroStatistics(), SUBTRACT)).isEqualTo(createEmptyStatistics()); + assertThat(reduce( + new HiveBasicStatistics(11, 9, 7, 5), + new HiveBasicStatistics(1, 2, 3, 4), ADD)) + .isEqualTo(new HiveBasicStatistics(12, 11, 10, 9)); + assertThat(reduce( + new HiveBasicStatistics(11, 9, 7, 5), + new HiveBasicStatistics(1, 2, 3, 4), SUBTRACT)) + .isEqualTo(new HiveBasicStatistics(10, 7, 4, 1)); + } + + @Test + public void testMergeEmptyColumnStatistics() + { + assertMergeHiveColumnStatistics(HiveColumnStatistics.empty(), HiveColumnStatistics.empty(), HiveColumnStatistics.empty()); + } + + @Test + public void testMergeIntegerColumnStatistics() + { + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty())).build(), + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty())).build(), + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty())).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.of(1), OptionalLong.of(2))).build(), + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty())).build(), + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.of(1), OptionalLong.of(2))).build()); + 
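+            // merging two known integer ranges keeps the wider bounds: the min of the mins and the max of the maxes,
+            // as asserted below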
assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.of(1), OptionalLong.of(2))).build(), + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.of(0), OptionalLong.of(3))).build(), + HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.of(0), OptionalLong.of(3))).build()); + } + + @Test + public void testMergeDoubleColumnStatistics() + { + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty())).build(), + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty())).build(), + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty())).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.of(1), OptionalDouble.of(2))).build(), + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty())).build(), + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.of(1), OptionalDouble.of(2))).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.of(1), OptionalDouble.of(2))).build(), + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.of(0), OptionalDouble.of(3))).build(), + HiveColumnStatistics.builder().setDoubleStatistics(new DoubleStatistics(OptionalDouble.of(0), OptionalDouble.of(3))).build()); + } + + @Test + public void testMergeDecimalColumnStatistics() + { + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.empty(), Optional.empty())).build(), + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.empty(), Optional.empty())).build(), + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.empty(), Optional.empty())).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.of(BigDecimal.valueOf(1)), Optional.of(BigDecimal.valueOf(2)))).build(), + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.empty(), Optional.empty())).build(), + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.of(BigDecimal.valueOf(1)), Optional.of(BigDecimal.valueOf(2)))).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.of(BigDecimal.valueOf(1)), Optional.of(BigDecimal.valueOf(2)))).build(), + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.of(BigDecimal.valueOf(0)), Optional.of(BigDecimal.valueOf(3)))).build(), + HiveColumnStatistics.builder().setDecimalStatistics(new DecimalStatistics(Optional.of(BigDecimal.valueOf(0)), Optional.of(BigDecimal.valueOf(3)))).build()); + } + + @Test + public void testMergeDateColumnStatistics() + { + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.empty(), Optional.empty())).build(), + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.empty(), Optional.empty())).build(), + 
HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.empty(), Optional.empty())).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.of(LocalDate.ofEpochDay(1)), Optional.of(LocalDate.ofEpochDay(2)))).build(), + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.empty(), Optional.empty())).build(), + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.of(LocalDate.ofEpochDay(1)), Optional.of(LocalDate.ofEpochDay(2)))).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.of(LocalDate.ofEpochDay(1)), Optional.of(LocalDate.ofEpochDay(2)))).build(), + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.of(LocalDate.ofEpochDay(0)), Optional.of(LocalDate.ofEpochDay(3)))).build(), + HiveColumnStatistics.builder().setDateStatistics(new DateStatistics(Optional.of(LocalDate.ofEpochDay(0)), Optional.of(LocalDate.ofEpochDay(3)))).build()); + } + + @Test + public void testMergeBooleanColumnStatistics() + { + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())).build(), + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())).build(), + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.of(1), OptionalLong.of(2))).build(), + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())).build(), + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.empty(), OptionalLong.empty())).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.of(1), OptionalLong.of(2))).build(), + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.of(2), OptionalLong.of(3))).build(), + HiveColumnStatistics.builder().setBooleanStatistics(new BooleanStatistics(OptionalLong.of(3), OptionalLong.of(5))).build()); + } + + @Test + public void testMergeStringColumnStatistics() + { + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.empty()).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.of(1)).build(), + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.of(1)).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.of(2)).build(), + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.of(3)).build(), + HiveColumnStatistics.builder().setMaxValueSizeInBytes(OptionalLong.of(3)).build()); + + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.empty()).build(), + 
HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.empty()).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.of(1)).build(), + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.of(1)).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.of(2)).build(), + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.of(3)).build(), + HiveColumnStatistics.builder().setTotalSizeInBytes(OptionalLong.of(5)).build()); + } + + @Test + public void testMergeGenericColumnStatistics() + { + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.empty()).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.of(1)).build(), + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.empty()).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.of(1)).build(), + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.of(2)).build(), + HiveColumnStatistics.builder().setDistinctValuesCount(OptionalLong.of(2)).build()); + + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setNullsCount(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setNullsCount(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setNullsCount(OptionalLong.empty()).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setNullsCount(OptionalLong.of(1)).build(), + HiveColumnStatistics.builder().setNullsCount(OptionalLong.empty()).build(), + HiveColumnStatistics.builder().setNullsCount(OptionalLong.empty()).build()); + assertMergeHiveColumnStatistics( + HiveColumnStatistics.builder().setNullsCount(OptionalLong.of(1)).build(), + HiveColumnStatistics.builder().setNullsCount(OptionalLong.of(2)).build(), + HiveColumnStatistics.builder().setNullsCount(OptionalLong.of(3)).build()); + } + + @Test + public void testMergeHiveColumnStatisticsMap() + { + Map first = ImmutableMap.of( + "column1", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1), OptionalLong.of(2), OptionalLong.of(3), OptionalLong.of(4)), + "column2", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(2), OptionalDouble.of(3), OptionalLong.of(4), OptionalLong.of(5)), + "column3", HiveColumnStatistics.createBinaryColumnStatistics(OptionalLong.of(5), OptionalLong.of(5), OptionalLong.of(10)), + "column4", HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(2), OptionalLong.of(3))); + Map second = ImmutableMap.of( + "column5", HiveColumnStatistics.createIntegerColumnStatistics(OptionalLong.of(1), OptionalLong.of(2), OptionalLong.of(3), OptionalLong.of(4)), + "column2", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(1), OptionalDouble.of(4), OptionalLong.of(4), OptionalLong.of(6)), + "column3", HiveColumnStatistics.createBinaryColumnStatistics(OptionalLong.of(6), 
OptionalLong.of(5), OptionalLong.of(10)), + "column6", HiveColumnStatistics.createBooleanColumnStatistics(OptionalLong.of(1), OptionalLong.of(2), OptionalLong.of(3))); + Map expected = ImmutableMap.of( + "column2", HiveColumnStatistics.createDoubleColumnStatistics(OptionalDouble.of(1), OptionalDouble.of(4), OptionalLong.of(8), OptionalLong.of(6)), + "column3", HiveColumnStatistics.createBinaryColumnStatistics(OptionalLong.of(6), OptionalLong.of(10), OptionalLong.of(20))); + assertThat(merge(first, second)).isEqualTo(expected); + assertThat(merge(ImmutableMap.of(), ImmutableMap.of())).isEqualTo(ImmutableMap.of()); + } + + private static void assertMergeHiveColumnStatistics(HiveColumnStatistics first, HiveColumnStatistics second, HiveColumnStatistics expected) + { + assertThat(merge(first, second)).isEqualTo(expected); + assertThat(merge(second, first)).isEqualTo(expected); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestThrottledAsyncQueue.java b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestThrottledAsyncQueue.java new file mode 100644 index 00000000..511aaa01 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/java/io/prestosql/plugin/hive/util/TestThrottledAsyncQueue.java @@ -0,0 +1,256 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.prestosql.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.util.concurrent.ListenableFuture; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; + +import static io.airlift.concurrent.MoreFutures.getFutureValue; +import static io.airlift.concurrent.Threads.daemonThreadsNamed; +import static java.util.concurrent.Executors.newCachedThreadPool; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +public class TestThrottledAsyncQueue +{ + private ExecutorService executor; + + @BeforeClass + public void setUpClass() + { + executor = newCachedThreadPool(daemonThreadsNamed("test-async-queue-%s")); + } + + @AfterClass(alwaysRun = true) + public void tearDownClass() + { + executor.shutdownNow(); + } + + @Test(timeOut = 10_000) + public void testThrottle() + { + // Make sure that the dequeuing is throttled even if we have enough elements in the queue + + ThrottledAsyncQueue queue = new ThrottledAsyncQueue<>(3, 10, executor); + assertTrue(queue.offer(1).isDone()); + assertTrue(queue.offer(2).isDone()); + assertTrue(queue.offer(3).isDone()); + assertTrue(queue.offer(4).isDone()); + assertTrue(queue.offer(5).isDone()); + assertTrue(queue.offer(6).isDone()); + queue.finish(); + + // no throttling, enough elements in the queue + ListenableFuture> future1 = queue.getBatchAsync(2); + assertTrue(future1.isDone()); + assertEquals(getFutureValue(future1), ImmutableList.of(1, 2)); + assertFalse(queue.isFinished()); + + // we can only dequeue one more element before being throttled + ListenableFuture> future2 = queue.getBatchAsync(2); + assertFalse(future2.isDone()); + assertEquals(getFutureValue(future2), ImmutableList.of(3, 4)); + assertFalse(queue.isFinished()); + + // we are now throttled, this future will not be immediate + ListenableFuture> future3 = queue.getBatchAsync(2); + assertFalse(future3.isDone()); + assertEquals(getFutureValue(future3), ImmutableList.of(5, 6)); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testThrottleEmptyQueue() + throws Exception + { + // Make sure that dequeuing is throttled if we dequeued enough elements before, even if it is empty. + // The future should only complete once the queue becomes non-empty again. 
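+        // Rough timeline this test drives (assuming the first constructor argument is the per-cycle dequeue budget):
+        //   t=0s: offer 1, 2; the first getBatchAsync(2) completes immediately with [1, 2] and uses up the budget
+        //   t=0s: the second getBatchAsync(2) stays pending (throttled, and the queue is empty anyway)
+        //   t>1s: offer 3, 4 and finish(); the pending future completes with [3, 4]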
+
+        ThrottledAsyncQueue<Integer> queue = new ThrottledAsyncQueue<>(2, 10, executor);
+        assertTrue(queue.offer(1).isDone());
+        assertTrue(queue.offer(2).isDone());
+
+        // no throttling, enough elements in the queue
+        ListenableFuture<List<Integer>> future1 = queue.getBatchAsync(2);
+        assertTrue(future1.isDone());
+        assertEquals(getFutureValue(future1), ImmutableList.of(1, 2));
+        assertFalse(queue.isFinished());
+
+        // we are now throttled and the queue is empty
+        ListenableFuture<List<Integer>> future2 = queue.getBatchAsync(2);
+        assertFalse(future2.isDone());
+
+        Thread.sleep(1000L); // wait one second, after which we should not be throttled any more
+
+        // no batch is ready at that point as the queue is still empty
+        assertFalse(future2.isDone());
+
+        assertTrue(queue.offer(3).isDone());
+        assertTrue(queue.offer(4).isDone());
+        queue.finish();
+
+        assertEquals(getFutureValue(future2), ImmutableList.of(3, 4));
+
+        assertTrue(queue.isFinished());
+    }
+
+    @Test(timeOut = 10_000)
+    public void testBorrowThrows()
+            throws Exception
+    {
+        // The exact behavior when the caller-supplied borrow function fails does not matter,
+        // but it must not block pending futures.
+
+        AsyncQueue<Integer> queue = new ThrottledAsyncQueue<>(100, 4, executor);
+        queue.offer(1);
+        queue.offer(2);
+        queue.offer(3);
+        queue.offer(4);
+        queue.offer(5);
+
+        ListenableFuture<?> future1 = queue.offer(6);
+        assertFalse(future1.isDone());
+
+        Runnable runnable = () -> {
+            getFutureValue(queue.borrowBatchAsync(1, elements -> {
+                throw new RuntimeException("test fail");
+            }));
+        };
+
+        assertThatThrownBy(() -> executor.submit(runnable).get())
+                .isInstanceOf(ExecutionException.class)
+                .hasMessageContaining("test fail");
+
+        ListenableFuture<?> future2 = queue.offer(7);
+        assertFalse(future1.isDone());
+        assertFalse(future2.isDone());
+        queue.finish();
+        future1.get();
+        future2.get();
+        assertTrue(queue.offer(8).isDone());
+
+        assertThatThrownBy(() -> executor.submit(runnable).get())
+                .isInstanceOf(ExecutionException.class)
+                .hasMessageContaining("test fail");
+
+        assertTrue(queue.offer(9).isDone());
+
+        assertFalse(queue.isFinished());
+        // 1 and 2 were removed by the borrow calls; 8 and 9 were never inserted because insertion happened after finish.
+ assertEquals(queue.getBatchAsync(100).get(), ImmutableList.of(3, 4, 5, 6, 7)); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testGetPartial() + throws Exception + { + AsyncQueue queue = new ThrottledAsyncQueue<>(100, 4, executor); + + queue.offer("1"); + queue.offer("2"); + queue.offer("3"); + assertEquals(queue.getBatchAsync(100).get(), ImmutableList.of("1", "2", "3")); + + queue.finish(); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testFullQueue() + throws Exception + { + AsyncQueue queue = new ThrottledAsyncQueue<>(100, 4, executor); + + assertTrue(queue.offer("1").isDone()); + assertTrue(queue.offer("2").isDone()); + assertTrue(queue.offer("3").isDone()); + + assertFalse(queue.offer("4").isDone()); + assertFalse(queue.offer("5").isDone()); + ListenableFuture offerFuture = queue.offer("6"); + assertFalse(offerFuture.isDone()); + + assertEquals(queue.getBatchAsync(2).get(), ImmutableList.of("1", "2")); + assertFalse(offerFuture.isDone()); + + assertEquals(queue.getBatchAsync(1).get(), ImmutableList.of("3")); + offerFuture.get(); + + offerFuture = queue.offer("7"); + assertFalse(offerFuture.isDone()); + + queue.finish(); + offerFuture.get(); + assertFalse(queue.isFinished()); + assertEquals(queue.getBatchAsync(4).get(), ImmutableList.of("4", "5", "6", "7")); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testEmptyQueue() + throws Exception + { + AsyncQueue queue = new ThrottledAsyncQueue<>(100, 4, executor); + + assertTrue(queue.offer("1").isDone()); + assertTrue(queue.offer("2").isDone()); + assertTrue(queue.offer("3").isDone()); + assertEquals(queue.getBatchAsync(2).get(), ImmutableList.of("1", "2")); + assertEquals(queue.getBatchAsync(2).get(), ImmutableList.of("3")); + ListenableFuture batchFuture = queue.getBatchAsync(2); + assertFalse(batchFuture.isDone()); + + assertTrue(queue.offer("4").isDone()); + assertEquals(batchFuture.get(), ImmutableList.of("4")); + + batchFuture = queue.getBatchAsync(2); + assertFalse(batchFuture.isDone()); + queue.finish(); + batchFuture.get(); + assertTrue(queue.isFinished()); + } + + @Test(timeOut = 10_000) + public void testOfferAfterFinish() + throws Exception + { + AsyncQueue queue = new ThrottledAsyncQueue<>(100, 4, executor); + + assertTrue(queue.offer("1").isDone()); + assertTrue(queue.offer("2").isDone()); + assertTrue(queue.offer("3").isDone()); + assertFalse(queue.offer("4").isDone()); + + queue.finish(); + assertTrue(queue.offer("5").isDone()); + assertTrue(queue.offer("6").isDone()); + assertTrue(queue.offer("7").isDone()); + assertFalse(queue.isFinished()); + + assertEquals(queue.getBatchAsync(100).get(), ImmutableList.of("1", "2", "3", "4")); + assertTrue(queue.isFinished()); + } +} diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/addressbook.parquet b/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/addressbook.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0f5b8e0c07dc8cdd6b7f92c5cd6c72da1c52ce7a GIT binary patch literal 3956 zcmc&%-EQMV6rQ+gTB7y>340x>T3L}AwNkM9vxOzfMO!w|h_(r7E07Rcxv@hm5<6=< zzzSubfXlt-4fY}SnhTy_UjUwfE6$v;$79ECED$19{d0c4^PL}$$NjUxp-kjGd9Nu; za8Z#fJL=lGdyfZ&z)ynzHTZ)gJAoD~599;EWnBh`UNU+xPYkM7-j-D5#dt)GnPY+~s{cdF(omT?~eZVn2BYsoDkT=nqi_{qsf zA3y!%KozbL6iY)0WxuvgNEObpH6T5fA9EyHO8iG_ILK1%w~Ot*;k}q!!Od`NO$}Kh zCz84`vpnB*d}vFBoRrn=@cwx)hg9WFasmZ&jHxA;iM+KdsnwC~`N4B4+67!HdT(L~ z5(y-dZ8Ft51hOY!2%##ovFlho(iXc71BK`nXFi1vN66*i<;>dMB#KnqsZAQ2JMva- 
zmsDyO5$z;hBq|V;!BH70qsC@CHEj*22vnRHBE3>xHJmn|J$`}hY5ZB*fhtpWPqW!U zQ4q#dOfYV9%p&fE2W5C{8?QBIZ9##qelYgzQJ|ZaKlJQb;Ckw> zAN*X^w7PB9wU$=DxgFendRuQ&(Qs@}OwV$t;v0=xfsl?aT85*JrXU~L*2L^0$RYIx zk!QV_+nzPGoB%zZbv`@0yJX;?rqfB~3>X9cMLQ7Ykie+RL<*2W zlQba|F4u*M=`XWBOeIo8uB;+lnavSF(?uI}zP#%WI?uv7bBHXm2#`-0)mcrAK+%BL zHeRYyvC%9)1yrZgNcKJvFfS9tiL>cme_qGA5}g+*$)d@}DK?A8g)g-$GCE533z__1 z8EM}pauhRO$~Mjv>Xytwd_Ys_t$0wQLM}b9ToE>@dwi7aP(iNvmV1EiaVnEyomfd1 zmb#^xD#)0^3vfxvzAD%2K*VW@7Orf}xW$#f^lOIQ3w2 zlXRlJ{%yB+eRi2wyP}zPuJ5jLFo1nLm(lJaA|#%}*Zx6vcGNOn)vkKy-OEmLk%_T# z>xdw%IT-^U0mNZ0dRPA{NKPA&2A|%qdV?==Gf6G_5yi~u$*@h^CGW+%h2R~@*_j@x zc0ttY%>GXI@w-R;r4(-bVB;3Fv^|;+oZ`(wOB;?2&+t6sWs|>e1>t_vHd~tQ1WjD` z1x=GJQ4uhH%ZO?^mBRi0Vjs9|g*)0I#PZc6_`1}CAMGHpJl!+`L!WtW;OfB~wz{@4 g(L?Jq6E?QKf9|^9)AZPv-G@_;{IN|)AHKK$22x{7AOHXW literal 0 HcmV?d00001 diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/fullacid_delete_delta_test/delete_delta_0000004_0000004_0000/bucket_00000 b/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/fullacid_delete_delta_test/delete_delta_0000004_0000004_0000/bucket_00000 new file mode 100644 index 0000000000000000000000000000000000000000..3cd87d75382afbe709b00f2b94e90066d76f0da9 GIT binary patch literal 646 zcmeYdau#G@;9?VE;b074&;~MvxtJLk7=(B@7=_q4SOi!kSa6G*Ffcrx^gJmoAwgi$ zg9H(wM;Q!EVvGiwjB3mhn1(S3Fi0?9F^ofiLxLlKK^AB^4;Ks2at5Gl7y=lTfdc#x z*Klw!2r+>)vokQbF*ra-UIvCT1_z*WAQwo11shly82&SKF)-wwHsm{?z{7k%z+*v4 zrZ$`Y-~Z;)YkFkYOxCUZ^})N~TrT5^Wup4IH+4(zY)Niat@AToYmsp}FXU1B8`)1U zZd{yS@SkB414C-JAy-2Hk8||O`8OQBmOol)^{=^tp+8TTuSt@rfBK7+On#SS-!LZ$ zO)@#&-=B0mRPc4*yQ15>RTnQfFW%-*yz|LUEr)$Jv!xlB7*c&tdmS+l@x8U>-b$lm zJRT>!xVokUS&7D%J}$cBAt=7NH}`^pvvy|smAUgDi2q>!w|J(K&q40*G3Oo^9awt5 z?*{Aie+hL(Ce}wEGQTmDx7PbuW0(~;b>iH-EW^y6x@%iw?4%;MZ+heUKQmM>MBV1{ zp1Ywy~vpc20Q{sl)9_2fmBJ;kyu$Zyu#a6GBm8(-0+6l#mzna^- z`Ny9(k6sz=otIWvCHyg=UOL60gnYGnc8gnfB&0Huj!FpGg-It*9Y%{bGeKwmWk@;-qbCAj$8)}I9RscNKomhn6yvpxXlv&7nieD+utQ7&WPRF zDxuzCRF!PRHEYAIXNk&gAE&><)5rQ^E?jNyij;km2_o~wzBozD@k`G{;ta2S!K2< zFZE{kz06{8lO>s|-(_v(%>z&#!4e sICPei@yyM%clC15AINibFbOb9G&Be(F){FH1Q@V2ePL$y4|0|O0D2?o9{>OV literal 0 HcmV?d00001 diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/io/prestosql/plugin/hive/security.json b/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/io/prestosql/plugin/hive/security.json new file mode 100644 index 00000000..58bede69 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/io/prestosql/plugin/hive/security.json @@ -0,0 +1,17 @@ +{ + "tables": [ + { + "user": "hive", + "privileges": [ + "SELECT" + ] + } + ], + "schemas": [ + { + "user": "hive", + "owner": true + } + ] +} + diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nationFile25kRowsSortedOnNationKey/bucket_00000 b/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nationFile25kRowsSortedOnNationKey/bucket_00000 new file mode 100644 index 0000000000000000000000000000000000000000..204dd51621e9484f9cd44161653028fc1d04677a GIT binary patch literal 11406 zcmds-XIN9)zOQErEd&SwLR6H{Lk}njqClt#y%P{Ap-EAC5nBQQ>C&W0la3++iVIXU zKtMW(4Lcx31w{}W>f+9z`|h>Q+WX$U&$Bm`gwYyX25qNF#U zl=MY80Z(>|xt}dwvRjOHNiro{^mS=;^YpYt`W(kjr?4{K$zJ1VPtgJwngl-(2sW{s z-E5LG12&N!z5~FOpq_S6Bb6)x&ndeZs@GI(mxWCRi=#VmS2W7Wy9Kzk6&PiAwXoR^ z4Sue|8tB4>W_rx@iJt{HRpeDLocW@|1_rn-hckJx`KgqN9mHd>_oF_wr{C6(uFH>3e&!@6) zb1Geu1*C^ZyQG=YVRUhY(FBygpl3D(Cm;=K_tg?rG_)YhTksv=9#_Z{?`a zR@c`+<09Z4lmpMCUF 
z6HrFmWuUVYB6F#*>glb*9o;j*V&-Yvzh@`2Z1MPj4o9<0n*2feS0+1)k4Rl=#TeAz zaCx93_nAh?vvKr(0aDvFOGu% z7cY&CTB_Y5psST&-C#2ewlewCIlyAJ)$*G}h z%n~vj{yforWKnBLF!G+D*OU{f<%qlUsmi##`!^`om)G4@>UuvpISmVkR^I%!6vJKl z)D(HI?DfJxySdq?5n^BH0IM!N{#J9RMx{^Da7=MY?$cfFr$63OQ^i7n&Wuj>WSMx= z;jQ&{H5FQFz~$!nm*t@|c|unO*bh9HY@@lLsWFi=-&OxW zJ7PWU&8WYMZ-Debzn$>B7Iz2o+>hJj&tIWD0Fa8b0k8m|^QD|Vjgxpf$^?GBOx%3g z4FEgXpWcFSQ?UGYX1^H@LjaSXfB$-FgOeX60Z2;gGoZ=ifFLKX8)~FQ*d846-V^#? zzee|0CsPr)-^bw9^e=k6S)4N)!TMX1HzI%leokO70Pyt^1pxv#yhXA1mfSs@_ajqT zLv_WnwD6S!S7P{i0l)-)#LvbM0EiZ*#&%>P(azM^%7JV}G_WJOk}V90RzxGBv8juV zgTf!XW2F3+z|x|viXs2hN!8Q~p+E(8C+4jld1+xQZeS+@0AK~*Y2oV~M1TOsr)~M6 zYTQPb)VT;p<(4K?1cwk8?>tEh()=COebx)u)>eC?A}gZwe2ByAYRZw{vIHL?Rbc0q zeeRLaPDjH$W3HROPcc*TsM(F|nZ%ABQqabCElFYAEFX&L$(pLjwv7r;;`hxwsBuG8 zKIMv5*d16>=JBp$u|(>UajA-?J#e&DyvVS%_QIP+$DCtQAC`g?_`5bn1l|FFWk#1^ z4>(o(&p|&M0R!n829TA|&l00@FRnj%QnDol_DQ}^GtyjaJkp8vNI3?$#?(2}6b8%L z&x+wwKA>D>P*4Wj-oEa2`BW>d&A-j3b(CLe$RMWF{zscwZ!dZRtz6A}{kz2%h1Ut^ zpE$((xj%#Wu6ygg2Y^E(*ZH7oD8}@M^&@8iQA>n@Fu=oM1M#)t-fkm#oUpsm$e?+> z@rB}(CkLPK=y+IanfDm`x%qY(K6s`4N$1`3*UX*r!|n1~&l4q9dNZyNyt6svT&iNk zW|?JMx9_@JwT>j6K6>gym4(OE$wIA9{x>^ATYk3^{|h+&M>81w7ro5+$!;{8-K-I{ zF6$}xMpnj|4=ylldv!YnOYtD|%~sV1^1S%Y%>4+-W>1{)FeIHJH2&g`nC8Jh-QECu6vw>Q2DSH zLh}tzOb)a`2!tR0fey65SX!WuexOmu?4 z^&QbT$9T8KvJ)!$i);?-RUj-Y5L+q`##sn?6{`xwNxg@j6$)_s83O@t00uDrMMldq zWmkv-;Ms%CXt#Z7KpG2rq&{v7Bd$;jumCR#@$)RW3Z4a`HsGd&R8lQWtR(g&g*J%P)Sj82bx^4G zotK#hEESY2l~v@E3X5_n)m2$8dF2K2M%8;#xI|rO`D+B|p=rqDiDB;6+N2=`({nhv z7HJJRk%8bu*6i%}i5of8M8va7mYHJW#EprW36>c9G;!A!Z)T}FLRPXPSy+vVwhr3j z`^ie=5wf!U1H){E5k+~Y`f+8m#w&LC5k+Mc%i%jjTU<*;+CCaRNK;CbH=&=`?;JT@ zOm*_iHEf4iTAGNRVvc#5eVRy)(B;(#JCW1~g$M<@CT0#cXQpl^fL$-zwk-hZSu}N`tB+LX}UsQ+Qtxo4)aemmYK3b5(IF_o@V{)8&*Y0YU$#8&RCYWt7N8y30muJaxi?4EKyt*0$855 zMaL76MwfD6IIj1aYAQA;0u$s@vS5n+6GROHL6Owqi_)YxIOX8IVw$|H}a=aI}ed}#M zAb<6}=l0*x;!~4nC;A@eIy0?JlgddO(Cdtz>oZc~-7IQ{|Fa~-&|pX~0C05Cd6*yq znT{vom!un#De3nMC*B^FbAyTAR(O_3`p7qU|HP!=t_%jR>Xw%QrO_v00~30yzX(P| zGNzyWHpUZr2W3DrLh$g7!`f^(H0}V(YRG&NcVo!MOKpqK*5kVS^khqX`dlz>KD`I7 zzS{SR>#%P9bhfacn~2z0ch;8~v-xJvY!TB(l>$pOUZy)WTwf87_c_Hq)hQ7dSvNDk znv%OQ{F}4N)fcb%{<6>hk4D%7Ho|_!C^Q(OR9HhTp8QqJ>-W>qyPGt^IS8Qxq2N~C z>6tWef--_{4%uz<>GzPr#&M4JCHk{;=(uU{df zkXFb#b)UeqgTY|X7%ciT0*@erkw?p8G42w0wlEm!v~*U&etjO`DSHNf;tI{i6>F?_ zX3Oj=Ik>D0J86av{6v*Wz5slP%V1W7wz^mU2v61tqKcF1ZJ+{zGNZN$N$$f)PL}`vU&VT?wW{7*pl%++|CG z&@c=HlftoT^HGv=FAcg$+W2Arm!2rkEKhE5Cc%65<@uS-iBOma2cFmCn^){qoNM2Z zRhU~|;9gRcb3UuQ)+Q(WyiJKckFeFyu=%hpiL6W-F(;3hk6I`{ut!N6o@T|nUj;au zI?{hps+JJnxa>6>J~(8-Syb*(;$qjBl~?ZJR+&Y~@c^O?y#oBgHJ^$YHzd|H^vd*p zLJ(Fe@2yRg7)eNu0BrsYf!*NElsCC{)fuleVlg_}08gzhn3pzFI=H~?IhDEqN)0fX z3y-eEACRY+%iOx6RcB`{p}bR7&D2z)t+E2&)=rVwxAS5(MM7ImQBTfM!ZsQ2i00C4 z0hAB=!XM+HSI%L447U^<@oP0O3nZQhlO1QVDjgE5ONMjO9V)J)b63sHg=xVR#>Nc+ zHQJ%SVdRCS5!uy(00IKo{|%6MQr=|fS!Ei1QJoFGVGw-wo8aSD3-47BuAZ;)nXgf- zc9Jf@)L|5Tq-*OIsuilwV0`BbFbg2UH-VQgyx5iZ2DpzAuEB^Z3j55TnN^gok)0P& z%#-tjMa|{a*7!m-M5HkV_&n}A4`Nezf;wQ>e>;*{Z33xF%%nD9LsSWp8^oN&5IVRl zc0h9dnX@8uA0h& z-l)H^=z#ie18H@TMVVfp58*bs)D1~@ey`M6wLM$k+3envp?gZdD}8-_G|}o>t7e5i zkK8G~{X+3u6O`TDo9%WlymtO#soL>2;pw@{RtbqV+jqoTn^ihl)mK>;IvB3fsgFyX zp%)0dmL>D6%rK@d_)E37jW1sSPADF8yfB#5rOj$ z@QaMVd3yzh1pDH=z?g7oLh7ZDA(X z3>#Dg=8S2pXxtGHUH$^rdZrTGVB8a^H{hr@;H-zv*W*rlnCe-ZN<`yfOL(AyeQp0o z{e6If#1BW%L3dkST$(_pk!PTKHrgaNc)ggy=%)DIp`Ze)Tp`cgFBPK9O=Pv5{wWf- z%b;HkI^@X(dU7~b)LzF5DXB!yd=Pn*M8wN)k+#R>#0wsL@$lP3LJXuvt@eb-VnEw* zOsEx+f3kUO_iZW7Y}JERwMwJ|Naay6WmA_2v#!2olewMti*QFVca5+0tGP#OmnMC+ 
zADsODe6Mb(Iziv}d_&_8ySvO2Rx9$HAu;ttF0{3-xOm8M-ofe?S$FBHc4gGfBN&oU9Vjg;q<&JYH8C5x=+_6tEXZHb5deETD^nbn|AVK^a)9Cd(^pO9Mhb&ue7-K z^fX8MSawrx_|5Fq3OZys-ShIYj^oJn!w>GCaBsU5rxTJncI~w?U-A?rc)KvxJp5cT zwEZ^xI7{Lb%ZjuY6=ikJqFp#-qT7F8By4yhQg0{eoAq~I3e8tM>!$)F8Q?+IdfNt%fvE#`NSz0F9!HVd$D`H5y^vyk@0Q*thuN zeQjEg^H}j5dk){aw6rj3$v^0#8$Wn=;_BdeN+ZXMxgW6K*&6gMxWJG`&MQE&NMWz# zY3hAT$&k>Y)F{t2(>;_@2j z|65t|@n85I=E;?-|AI$0W!tt)9CojpHqi?Z!TQZGnXqU9T`Yt73cN)p z4zk1K;n-2>_siRCKCNNYHp66aPmC-*;?OV{OdEv+Kzjct0RGoBc}}820vxyfAJb$$ zEuX~@+Dwy~v`kjQ;%1sGqm{9e7BtW7eybG4e!s)122akCzLISWZjz>6L&tP4smueyXvmjE4Wy%|q^Dp$Q3qS~x5 zO&PyS3wsckmmiDbb?`dqv+YY?*meIraAS!n>JeMMhsiY$&f!r-V%| z(k54yl#3Qsb+8Xj_D&9JMkzQ`y8|jP|4R0>C^4oPQ|Ws*ov-XqC#n)}=r#|1Dkck& zP2v3xK>}di{G98jnkzDyar@T_Y5l(YuflVt>Jl9J}pcyQFjGE)a)%{Z#3&xl)eU6hbUuK7G0 z_uZ{gA#_SNL;q)Pyr>M`7Ja^QK zN;aT-*(JfBXAEx$RPubX$zh5+&Ho9ahKdeog< z);q~59o?7x5p69&HLJ)7P!Ds3?8we-d`lm!rv5qvdDDew)kQdZQy@1WI?eGB*JuL+R>!X5l!co4zu8n*)ZFHCp}iAb zUcBY8ylf`kA4k`OlWCfE1x0dM^l_IYlD90G%9Ko6%GLK8HqhL-|J!N*G#|HN_t<(|+XK#-*16AC5XJBOsym~)(E}=!{>W=A_bUhL_N)*xSV&Fep=M6+ z&UWmLp1F9s>No%#$Y>Gi2Dmv6F#85qt+QO*=^iQ5k&PCoAZx3%nM3f^Gy%YtMU5Cd}Mn6n&&BQOhHAXk++%SHGKBV83g=;`;5 zy{t#VPcU<^!OYJFa~vC%+kgPrpJDljx}onatmx0W;U<`a--d!44=;j`|5GF~8X2|X z5Q+?sT#ZD2{D2ILvVH$)Eh1*+bK+{ep_rjtLeJg5;fv}X)F5&WTGX}wRy>aY?7*f)BIuLp-TzT>!8zt>9TxCB05Let^xr>F`1 z$^L751k{&=bW7Sp9mRJ!*@*5yv7+{nR+0o4o-R4IjUV*CGFH6%YtA_fuPXN_+Wps$ zT(ye$hVr;?NIBAaH6XlX}`@1>7F9zL$) zwID7us_uP;L)0a#KsoO85Bgbwz*yaN>C*r$s(T8XtDaNPIlmujD^rekllg+WxBOJRb$E>Nh2QEF;Cu#)u=^tVY z61YS4ee;+Yd`f7G&U*WG&LeveE!VD@axzG62mK&s-@2Y_9QDMGfTyhfd&V>3dwTR0#tY; zi%}~JYh@>35HBq|SCCI@^TW(7_{u6)&xu0j)`Y<_Q1%*0=}Qori3+2oEb{( z17MY9wJHDKG zLpvR<`JD_M|#zAfV&lR=Q?f8ikopl?m!};8hT)Dzjab<+*59#pZV+RjkZ+mmlbdDF@58Kog-@ zPjyMCI-1L=73Q1QUQIJFi+58hBW9j3XxvoKXucOkv~U%|91?u$NZ>FXne(!^o7?Wi zK)=WZo2=r3oT7rJEZ1x|*AmyiEwU^p*SsPR*Ah@C|5{y*T=wT-4*PB3Kr*gC0)eh%{bne zk6C~vNRrj?;27DFl&2NyE<=-;5s-Qa^;2=mdnY35C z3QLiCtaTwak^)y2g16L7b&0oR#H$~?)adl8 z((>&33q(3T@EOmoJ#+nd`N+mQjfc|Kj-S2#SvL*>=ybC*&J?AcsWJp>w`t-eo|Z5I zH-SWe6O7*fUvE1sskq=J2-3|nwa%!@0B35D*G`^5c?T0irv6Nqrh7#?x>Ftj1e2YKSEgV+euJZ5Dep?EjhJAmSnyZe2LhUx zfS^mlZY+zSB2=6|GWwHkLqi60mGL>nwq1=oxI75;xn8nmc=B5tWj(@dode-n>mlFY z1RX8+61F848X0>qVj1~MgBwvFC>Q51UM;_QQ$0@m#Phlj?^Yk5gd78JKu=8JTZ!JG zE~U@{dn#X=L7T~`Yr^8DI=7gI_)~TdBQ#LI%Wt8d5kspMiJZrM4=RS1x}lFwy{U6S z?48e}@J}mpguUl(QBzmD+}uhIn-{Id$mILXFOYc_M15=h!2&-+SY}>CPBww#QI1*e zWqb2#O4{nsKKp70SW!>4`(T!Jh5)O{E7U5a7jmTd*{X9q?&%)w8j$HN6)oqwDy`FI zm&(T)u;k#u)YKYsXn&Z={SXmBH!;irfISev+nGA!$w`gAcuZwFeYn-3lQlam+_`km ztaJEA-aW)QZmI3|-aC0CVCIHW+l9={&4YX(=8+IXq)ZYvFr68A#i#v?AO=$)oiEHA zwzX(07W)b~mya#L8C+FrtJm78XPtbf-?P`tL{XC;w}t5`15_sdoqjhiGMJLT=oj~h zNe%Z@6U^kqc?tr#XniP93;}{D<3kuGO3+tkv}J}+g%aROV0LKwrEJmJ=Bx(k9aD4V zq09V)11`1m;-o$mf^_P((dLA}QeX>?wD1eqi^9ZsGpgHCrtZ5YVp*oal2=)i#EHzC zXwlJB=$S}$sn;)Q#g9e~wq*!5hKWcOv>2FBadsnLo=! 
zeq^;Q+ni#zN_I^w?-q%u|F4F6wYpM3h%BE;Qh9WC)JxiyePB1F-=prM&vD fxJUqZJO>JbU57|Q0VIBsm%tgn27_7K8Djqzs(~D- literal 0 HcmV?d00001 diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nation_delete_deltas/delete_delta_0000003_0000003_0000/bucket_00000 b/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nation_delete_deltas/delete_delta_0000003_0000003_0000/bucket_00000 new file mode 100644 index 0000000000000000000000000000000000000000..26a94eacfd1e04ce1de384f2ee39e879f042f708 GIT binary patch literal 873 zcmeYdau#M_;9?hI<#@p!z@QIgi*hkDFfa)50R@EEIamZ(Bo@fyld)l7cpUU3DJ>yE zVA7+CgL*1wE;6v_FeL;rx#&&cPhem$2O25~F;oa-C^yFhKY@F(5;s=PXT@ePn*f`{ z2fY9WS)e{1E*79;7=&0j7y=lTfdc#xgE%-CgqS240vP0hV!RMBHVy_Qpa^lQ1Q{3( z{%2u0_?(3Si7COrVEto1BS(Y;pI!R}22qfr34TlmpHB#50xM$!>R@AFWnidcIL5$` zd&Zit!GMSPg4y2yrjC9I9X6GB^`1>94;&EKdu!#QtyfhOt{m@Z?wVb2v1hW_d+o4W ziOYGV-`Z*yZ>eS$HBVe7DXjaZ`L6aES*F~OJ5Stp|6jE){Qm6OThgDT-?O{D|NLvV zQ^3%jb^657Qy-gb8ydYCJUBNNZ{XPAp#u!sjt9q-l$eDCpJgyGi7^^zGAbS1^e0A1 zAu_LzvySO|l4{_Bq;^2I;_pRDh=m-aq9SZ;OGQPEj^F)U5!_c(+t{hQ!AQ|GZdiI1kAjn(b+kv~q3tkB>4t6rKY>Q!KCuw zET-kU)eVhZ%VHnZ&YRwIV>9pO^D|NdHauh!tF-Sqo-Dby!(dv}`k7aEXPmmD(8IRw zPN}k8g8!0BSzGw|`ukU0pQ3;Mv)?Q0XV0>XJ)2DwJIpc`dup!DpFMS|WvbG|vo5(8 zcRqZbkyj!r@2$wjBoUEc9lJFASNof^C0%h(7%k?9nH85`zJKbD(5%uA*SCKC`Eix^ zu8ls85>fZ_<}ppY6k5Hje}WU&G^O3Dg+_m!{nFb%7 literal 0 HcmV?d00001 diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nation_delete_deltas/delete_delta_0000004_0000004_0000/bucket_00000 b/omnidata/omnidata-openlookeng-connector/connector/src/test/resources/nation_delete_deltas/delete_delta_0000004_0000004_0000/bucket_00000 new file mode 100644 index 0000000000000000000000000000000000000000..cf55d2609526270a01006521b21101ade3101e42 GIT binary patch literal 882 zcmeYdau#M_;9?hI<#@p!z@QIgi*hkDFfa)50R@EEIamZ(Bo@fyld)l7cpUU3DJ>yE zVA7+CgL*1wE;6v_FeL;rx#&&cPheoM1{x|2F;oO(C@;r{6-)yAb}&iYSbjzbo6#Hs z91;iY0vKe0I(fKQfX-nMV&Py2U{nSQ@I#E^;9wA9l3)m6kOzwKLd4iO7?gk_#HkWw zU^w`nh2h|H76v4y1kk-d_A_!sNbuRUUto{~DFQl|>EQDZ8jou6%oEwWbaBT3<0S0f!gJVic%tC_CG8mY|7!5QTl@4zD z6O*J68P~_JJ%i=j+k{CwHvZ`mh>*$RYiDRb!f3t0GDn7ou}5%CjEoGAlaCOO!|g^T zF9j)Q7KRB-4387UB2MWXJ$z8-Lv!FES4H2)th<8**!u;fm4%JDwP%Z@*EwbdCV8gS z&6`qC7noL+7MJAb5ttNLm}C?&L3gEzrS(>quB{sT?-;v!Y3$sxLc?@DOC+apfLCW{ zvt&o)gbXp&;03liZ)E1g1l_6pSef9xDOh{y4Tm{*LM9{xW?6+yFxI!Tv^N*eFuB4} zCb4$uN!2+MBd4vJK4Z?zMeHG~r_Y=|DN0c#Wl3^YPFiAKLT1L2TzS!zp*yLGHwIAK?Em*reYbN7^7Gx z>nBDY!8;KyDGo}9WOyA8C^4w1I;ccUyYI!La{BbgMApd%_R0DPnNOQ9$kgq8_MyOo z2^I NY)xO7nf-&DB>*^v9ZdiL literal 0 HcmV?d00001 diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/sql/create-test.sql b/omnidata/omnidata-openlookeng-connector/connector/src/test/sql/create-test.sql new file mode 100644 index 00000000..e68f675d --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/sql/create-test.sql @@ -0,0 +1,340 @@ +CREATE TABLE presto_test_sequence ( + n INT +) +COMMENT 'Presto test data' +; + +CREATE TABLE presto_test_partition_format ( + t_string STRING, + t_tinyint TINYINT, + t_smallint SMALLINT, + t_int INT, + t_bigint BIGINT, + t_float FLOAT, + t_double DOUBLE, + t_boolean BOOLEAN +) +COMMENT 'Presto test data' +PARTITIONED BY (ds STRING, file_format STRING, dummy INT) +; + +CREATE TABLE presto_test_unpartitioned ( + t_string STRING, + t_tinyint TINYINT +) +COMMENT 'Presto test data' +STORED AS TEXTFILE +; + +CREATE TABLE presto_test_offline ( + t_string STRING +) +COMMENT 'Presto test data' +PARTITIONED BY (ds STRING) +TBLPROPERTIES 
('PROTECT_MODE'='OFFLINE') +; + +CREATE TABLE presto_test_offline_partition ( + t_string STRING +) +COMMENT 'Presto test data' +PARTITIONED BY (ds STRING) +; + +CREATE TABLE presto_test_not_readable ( + t_string STRING +) +COMMENT 'Presto test data' +PARTITIONED BY (ds STRING) +TBLPROPERTIES ('object_not_readable'='reason for not readable') +; + +CREATE TABLE presto_test_bucketed_by_string_int ( + t_string STRING, + t_tinyint TINYINT, + t_smallint SMALLINT, + t_int INT, + t_bigint BIGINT, + t_float FLOAT, + t_double DOUBLE, + t_boolean BOOLEAN +) +COMMENT 'Presto test bucketed table' +PARTITIONED BY (ds STRING) +CLUSTERED BY (t_string, t_int) INTO 32 BUCKETS +STORED AS RCFILE +; + +CREATE TABLE presto_test_bucketed_by_bigint_boolean ( + t_string STRING, + t_tinyint TINYINT, + t_smallint SMALLINT, + t_int INT, + t_bigint BIGINT, + t_float FLOAT, + t_double DOUBLE, + t_boolean BOOLEAN +) +COMMENT 'Presto test bucketed table' +PARTITIONED BY (ds STRING) +CLUSTERED BY (t_bigint, t_boolean) INTO 32 BUCKETS +STORED AS RCFILE +; + +CREATE TABLE presto_test_bucketed_by_double_float ( + t_string STRING, + t_tinyint TINYINT, + t_smallint SMALLINT, + t_int INT, + t_bigint BIGINT, + t_float FLOAT, + t_double DOUBLE, + t_boolean BOOLEAN +) +COMMENT 'Presto test bucketed table' +PARTITIONED BY (ds STRING) +CLUSTERED BY (t_double, t_float) INTO 32 BUCKETS +STORED AS RCFILE +; + +CREATE TABLE presto_test_partition_schema_change ( + t_data STRING, + t_extra STRING +) +COMMENT 'Presto test partition schema change' +PARTITIONED BY (ds STRING) +STORED AS TEXTFILE +; + +CREATE TABLE presto_test_partition_schema_change_non_canonical ( + t_data STRING +) +COMMENT 'Presto test non-canonical boolean partition table' +PARTITIONED BY (t_boolean BOOLEAN) +; + +CREATE VIEW presto_test_view +COMMENT 'Presto test view' +AS SELECT * FROM presto_test_unpartitioned +; + +DROP TABLE IF EXISTS tmp_presto_test_load; +CREATE TABLE tmp_presto_test_load (word STRING) STORED AS TEXTFILE; +LOAD DATA LOCAL INPATH '/usr/share/dict/words' +INTO TABLE tmp_presto_test_load +; + +INSERT OVERWRITE TABLE presto_test_sequence +SELECT TRANSFORM(word) +USING 'awk "BEGIN { n = 0 } { print ++n }"' AS n +FROM tmp_presto_test_load +LIMIT 100 +; + +DROP TABLE tmp_presto_test_load; + +DROP TABLE IF EXISTS tmp_presto_test; +CREATE TABLE tmp_presto_test ( + t_string STRING, + t_tinyint TINYINT, + t_smallint SMALLINT, + t_int INT, + t_bigint BIGINT, + t_float FLOAT, + t_double DOUBLE, + t_boolean BOOLEAN +) +; +INSERT INTO TABLE tmp_presto_test +SELECT + CASE n % 19 WHEN 0 THEN NULL WHEN 1 THEN '' ELSE 'test' END +, 1 + n +, 2 + n +, 3 + n +, 4 + n + CASE WHEN n % 13 = 0 THEN NULL ELSE 0 END +, 5.1 + n +, 6.2 + n +, CASE n % 3 WHEN 0 THEN false WHEN 1 THEN true ELSE NULL END +FROM presto_test_sequence +LIMIT 100 +; + +ALTER TABLE presto_test_partition_format SET FILEFORMAT TEXTFILE; +ALTER TABLE presto_test_partition_format SET SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'; +ALTER TABLE presto_test_partition_format ADD PARTITION (ds='2012-12-29', file_format='textfile', dummy=1); +INSERT INTO TABLE presto_test_partition_format PARTITION (ds='2012-12-29', file_format='textfile', dummy=1) +SELECT * FROM tmp_presto_test +; + +ALTER TABLE presto_test_partition_format SET FILEFORMAT SEQUENCEFILE; +ALTER TABLE presto_test_partition_format SET SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'; +ALTER TABLE presto_test_partition_format ADD PARTITION (ds='2012-12-29', file_format='sequencefile', dummy=2); +INSERT INTO TABLE 
presto_test_partition_format PARTITION (ds='2012-12-29', file_format='sequencefile', dummy=2)
+SELECT * FROM tmp_presto_test
+;
+
+ALTER TABLE presto_test_partition_format SET FILEFORMAT RCFILE;
+ALTER TABLE presto_test_partition_format SET SERDE 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe';
+ALTER TABLE presto_test_partition_format ADD PARTITION (ds='2012-12-29', file_format='rctext', dummy=3);
+INSERT INTO TABLE presto_test_partition_format PARTITION (ds='2012-12-29', file_format='rctext', dummy=3)
+SELECT * FROM tmp_presto_test
+;
+
+ALTER TABLE presto_test_partition_format SET FILEFORMAT RCFILE;
+ALTER TABLE presto_test_partition_format SET SERDE 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe';
+ALTER TABLE presto_test_partition_format ADD PARTITION (ds='2012-12-29', file_format='rcbinary', dummy=4);
+INSERT INTO TABLE presto_test_partition_format PARTITION (ds='2012-12-29', file_format='rcbinary', dummy=4)
+SELECT * FROM tmp_presto_test
+;
+
+INSERT INTO TABLE presto_test_unpartitioned
+SELECT
+ CASE n % 19 WHEN 0 THEN NULL WHEN 1 THEN '' ELSE 'unpartitioned' END
+, 1 + n
+FROM presto_test_sequence LIMIT 100;
+
+INSERT INTO TABLE presto_test_offline_partition PARTITION (ds='2012-12-29')
+SELECT 'test' FROM presto_test_sequence LIMIT 100;
+
+INSERT INTO TABLE presto_test_offline_partition PARTITION (ds='2012-12-30')
+SELECT 'test' FROM presto_test_sequence LIMIT 100;
+
+ALTER TABLE presto_test_offline_partition PARTITION (ds='2012-12-30') ENABLE OFFLINE;
+
+SET hive.enforce.bucketing = true;
+
+INSERT OVERWRITE TABLE presto_test_bucketed_by_string_int
+PARTITION (ds='2012-12-29')
+SELECT t_string, t_tinyint, t_smallint, t_int, t_bigint, t_float, t_double, t_boolean
+FROM tmp_presto_test
+;
+
+INSERT OVERWRITE TABLE presto_test_bucketed_by_bigint_boolean
+PARTITION (ds='2012-12-29')
+SELECT t_string, t_tinyint, t_smallint, t_int, t_bigint, t_float, t_double, t_boolean
+FROM tmp_presto_test
+;
+
+INSERT OVERWRITE TABLE presto_test_bucketed_by_double_float
+PARTITION (ds='2012-12-29')
+SELECT t_string, t_tinyint, t_smallint, t_int, t_bigint, t_float, t_double, t_boolean
+FROM tmp_presto_test
+;
+
+DROP TABLE tmp_presto_test;
+
+ALTER TABLE presto_test_partition_schema_change ADD PARTITION (ds='2012-12-29');
+INSERT OVERWRITE TABLE presto_test_partition_schema_change PARTITION (ds='2012-12-29')
+SELECT '123', '456' FROM presto_test_sequence;
+ALTER TABLE presto_test_partition_schema_change REPLACE COLUMNS (t_data DOUBLE);
+
+INSERT OVERWRITE TABLE presto_test_partition_schema_change_non_canonical PARTITION (t_boolean='0')
+SELECT 'test' FROM presto_test_sequence LIMIT 100;
+
+ANALYZE TABLE presto_test_unpartitioned COMPUTE STATISTICS;
+ANALYZE TABLE presto_test_unpartitioned COMPUTE STATISTICS FOR COLUMNS;
+ANALYZE TABLE presto_test_bucketed_by_string_int PARTITION(ds) COMPUTE STATISTICS;
+ANALYZE TABLE presto_test_bucketed_by_string_int PARTITION(ds) COMPUTE STATISTICS FOR COLUMNS;
+
+
+CREATE TABLE presto_test_types_textfile (
+ t_string STRING
+, t_tinyint TINYINT
+, t_smallint SMALLINT
+, t_int INT
+, t_bigint BIGINT
+, t_float FLOAT
+, t_double DOUBLE
+, t_boolean BOOLEAN
+, t_timestamp TIMESTAMP
+, t_binary BINARY
+, t_date DATE
+, t_varchar VARCHAR(50)
+, t_char CHAR(25)
+, t_map MAP<STRING, STRING>
+, t_array_string ARRAY<STRING>
+, t_array_struct ARRAY<STRUCT<s_string: STRING, s_double: DOUBLE>>
+, t_struct STRUCT<s_string: STRING, s_double: DOUBLE>
+, t_complex MAP<INT, ARRAY<STRUCT<s_string: STRING, s_double: DOUBLE>>>
+)
+STORED AS TEXTFILE
+;
+
+INSERT INTO TABLE presto_test_types_textfile
+SELECT
+ CASE n % 19 WHEN 0 THEN NULL WHEN 1 THEN '' ELSE 'test' END
+, 1 + n
+, 2 + n
+, 3 + n
+, 4 + n + CASE 
WHEN n % 13 = 0 THEN NULL ELSE 0 END +, 5.1 + n +, 6.2 + n +, CASE n % 3 WHEN 0 THEN false WHEN 1 THEN true ELSE NULL END +, CASE WHEN n % 17 = 0 THEN NULL ELSE '2011-05-06 07:08:09.1234567' END +, CASE WHEN n % 23 = 0 THEN NULL ELSE CAST('test binary' AS BINARY) END +, CASE WHEN n % 37 = 0 THEN NULL ELSE '2013-08-09' END +, CASE n % 39 WHEN 0 THEN NULL WHEN 1 THEN '' ELSE 'test varchar' END +, CASE n % 41 WHEN 0 THEN NULL WHEN 1 THEN '' ELSE 'test char' END +, CASE WHEN n % 27 = 0 THEN NULL ELSE map('test key', 'test value') END +, CASE WHEN n % 29 = 0 THEN NULL ELSE array('abc', 'xyz', 'data') END +, CASE WHEN n % 31 = 0 THEN NULL ELSE + array(named_struct('s_string', 'test abc', 's_double', 0.1), + named_struct('s_string' , 'test xyz', 's_double', 0.2)) END +, CASE WHEN n % 31 = 0 THEN NULL ELSE + named_struct('s_string', 'test abc', 's_double', 0.1) END +, CASE WHEN n % 33 = 0 THEN NULL ELSE + map(1, array(named_struct('s_string', 'test abc', 's_double', 0.1), + named_struct('s_string' , 'test xyz', 's_double', 0.2))) END +FROM presto_test_sequence +LIMIT 100 +; + + +CREATE TABLE presto_test_types_sequencefile LIKE presto_test_types_textfile; +ALTER TABLE presto_test_types_sequencefile SET FILEFORMAT SEQUENCEFILE; + +INSERT INTO TABLE presto_test_types_sequencefile +SELECT * FROM presto_test_types_textfile +; + + +CREATE TABLE presto_test_types_rctext LIKE presto_test_types_textfile; +ALTER TABLE presto_test_types_rctext SET FILEFORMAT RCFILE; +ALTER TABLE presto_test_types_rctext SET SERDE 'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe'; + +INSERT INTO TABLE presto_test_types_rctext +SELECT * FROM presto_test_types_textfile +; + + +CREATE TABLE presto_test_types_rcbinary LIKE presto_test_types_textfile; +ALTER TABLE presto_test_types_rcbinary SET FILEFORMAT RCFILE; +ALTER TABLE presto_test_types_rcbinary SET SERDE 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'; + +INSERT INTO TABLE presto_test_types_rcbinary +SELECT * FROM presto_test_types_textfile +; + + +CREATE TABLE presto_test_types_orc LIKE presto_test_types_textfile; +ALTER TABLE presto_test_types_orc SET FILEFORMAT ORC; + +INSERT INTO TABLE presto_test_types_orc +SELECT * FROM presto_test_types_textfile +; + + +CREATE TABLE presto_test_types_parquet LIKE presto_test_types_textfile; +ALTER TABLE presto_test_types_parquet SET FILEFORMAT PARQUET; + +INSERT INTO TABLE presto_test_types_parquet +SELECT * FROM presto_test_types_textfile +; + + +ALTER TABLE presto_test_types_textfile ADD COLUMNS (new_column INT); +ALTER TABLE presto_test_types_sequencefile ADD COLUMNS (new_column INT); +ALTER TABLE presto_test_types_rctext ADD COLUMNS (new_column INT); +ALTER TABLE presto_test_types_rcbinary ADD COLUMNS (new_column INT); +ALTER TABLE presto_test_types_orc ADD COLUMNS (new_column INT); +ALTER TABLE presto_test_types_parquet ADD COLUMNS (new_column INT); diff --git a/omnidata/omnidata-openlookeng-connector/connector/src/test/sql/drop-test.sql b/omnidata/omnidata-openlookeng-connector/connector/src/test/sql/drop-test.sql new file mode 100644 index 00000000..e2d96917 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/connector/src/test/sql/drop-test.sql @@ -0,0 +1,34 @@ +DROP TABLE IF EXISTS presto_test_sequence; + +DROP TABLE IF EXISTS presto_test; + +DROP TABLE IF EXISTS presto_test_partition_format; + +DROP TABLE IF EXISTS presto_test_unpartitioned; + +CREATE TABLE IF NOT EXISTS presto_test_offline (x INT); +ALTER TABLE presto_test_offline DISABLE OFFLINE; +DROP TABLE IF EXISTS presto_test_offline; + 
+CREATE TABLE IF NOT EXISTS presto_test_offline_partition (x INT) PARTITIONED BY (ds STRING); +ALTER TABLE presto_test_offline_partition ADD IF NOT EXISTS PARTITION (ds='2012-12-30'); +ALTER TABLE presto_test_offline_partition PARTITION (ds='2012-12-30') DISABLE OFFLINE; +DROP TABLE IF EXISTS presto_test_offline_partition; + +DROP TABLE IF EXISTS presto_test_not_readable; + +DROP TABLE IF EXISTS presto_test_bucketed_by_string_int; +DROP TABLE IF EXISTS presto_test_bucketed_by_bigint_boolean; +DROP TABLE IF EXISTS presto_test_bucketed_by_double_float; + +DROP TABLE IF EXISTS presto_test_partition_schema_change; +DROP TABLE IF EXISTS presto_test_partition_schema_change_non_canonical; + +DROP VIEW IF EXISTS presto_test_view; + +DROP TABLE IF EXISTS presto_test_types_textfile; +DROP TABLE IF EXISTS presto_test_types_sequencefile; +DROP TABLE IF EXISTS presto_test_types_rctext; +DROP TABLE IF EXISTS presto_test_types_rcbinary; +DROP TABLE IF EXISTS presto_test_types_orc; +DROP TABLE IF EXISTS presto_test_types_parquet; diff --git a/omnidata/omnidata-openlookeng-connector/pom.xml b/omnidata/omnidata-openlookeng-connector/pom.xml new file mode 100644 index 00000000..72f43c2e --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/pom.xml @@ -0,0 +1,20 @@ + + + 4.0.0 + + + io.hetu.core + presto-root + 1.4.0 + + + openlookeng-omnidata-connector-root + openLooKeng OmniData Connector root + pom + + + stub + connector + + + diff --git a/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2010.txt b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2010.txt new file mode 100644 index 00000000..8e65f88c --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2010.txt @@ -0,0 +1,13 @@ +Copyright 2010 Proofpoint, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2012.txt b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2012.txt new file mode 100644 index 00000000..3197b0a2 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2012.txt @@ -0,0 +1,13 @@ +Copyright 2012 Proofpoint, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2020.txt b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2020.txt new file mode 100644 index 00000000..0de8ccbb --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2020.txt @@ -0,0 +1,12 @@ +Copyright (C) 2018-2020. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2021.txt b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2021.txt new file mode 100644 index 00000000..6a498fa4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-alternate-2021.txt @@ -0,0 +1,12 @@ +Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-third.txt b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-third.txt new file mode 100644 index 00000000..a4928795 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header-third.txt @@ -0,0 +1,12 @@ +Copyright (C) 2018-2020. Autohome Technologies Co., Ltd. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header.txt b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header.txt new file mode 100644 index 00000000..6a498fa4 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/src/main/resource/license/license-header.txt @@ -0,0 +1,12 @@ +Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/src/modernizer/violations.xml b/omnidata/omnidata-openlookeng-connector/src/modernizer/violations.xml new file mode 100644 index 00000000..67dee5e1 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/src/modernizer/violations.xml @@ -0,0 +1,32 @@ + + + + java/lang/Class.newInstance:()Ljava/lang/Object; + 1.1 + Prefer Class.getConstructor().newInstance() + + + + java/lang/String.toLowerCase:()Ljava/lang/String; + 1.1 + Prefer String.toLowerCase(java.util.Locale) + + + + com/google/common/primitives/Ints.checkedCast:(J)I + 1.8 + Prefer Math.toIntExact(long) + + + + org/testng/Assert.assertEquals:(Ljava/lang/Iterable;Ljava/lang/Iterable;)V + 1.8 + Use io.prestosql.testing.assertions.Assert.assertEquals due to TestNG #543 + + + + org/testng/Assert.assertEquals:(Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/String;)V + 1.8 + Use io.prestosql.testing.assertions.Assert.assertEquals due to TestNG #543 + + diff --git a/omnidata/omnidata-openlookeng-connector/stub/client/pom.xml b/omnidata/omnidata-openlookeng-connector/stub/client/pom.xml new file mode 100644 index 00000000..c9edcd45 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/client/pom.xml @@ -0,0 +1,27 @@ + + + + + com.huawei.boostkit + omniDataStub + 1.0.0 + + + 4.0.0 + boostkit-omnidata-client + jar + + + + com.huawei.boostkit + boostkit-omnidata-core + ${dep.os.arch} + compile + + + + + + \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/block/BlockDeserializer.java b/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/block/BlockDeserializer.java new file mode 100644 index 00000000..c36b4847 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/block/BlockDeserializer.java @@ -0,0 +1,20 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.huawei.boostkit.omnidata.block;
+
+import com.huawei.boostkit.omnidata.decode.Deserializer;
+
+public class BlockDeserializer implements Deserializer {
+}
diff --git a/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReader.java b/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReader.java
new file mode 100644
index 00000000..df206b14
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReader.java
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.huawei.boostkit.omnidata.reader;
+
+public interface DataReader<T> extends AutoCloseable {
+    T getNextPage() throws Exception;
+    T getNextPageBlocking() throws Exception;
+    boolean isFinished();
+}
diff --git a/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReaderFactory.java b/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReaderFactory.java
new file mode 100644
index 00000000..c1e4394c
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/DataReaderFactory.java
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package com.huawei.boostkit.omnidata.reader; + +import com.huawei.boostkit.omnidata.decode.Deserializer; +import com.huawei.boostkit.omnidata.model.TaskSource; + +import java.util.Properties; + +public class DataReaderFactory { + public static DataReader create(Properties properties, TaskSource taskSource, Deserializer deserializer) { + return null; + } + } diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/pom.xml b/omnidata/omnidata-openlookeng-connector/stub/core/pom.xml new file mode 100644 index 00000000..451302ff --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/pom.xml @@ -0,0 +1,57 @@ + + + + + com.huawei.boostkit + omniDataStub + 1.0.0 + + + 4.0.0 + jar + boostkit-omnidata-core + + + 1.4.0 + 1.68 + 1.9.3 + 1.14 + 1.4 + 2.8.0 + + + + + io.hetu.core + hetu-filesystem-client + ${dep.hetu.version} + + + org.bouncycastle + bcpkix-jdk15on + ${dep.bouncycastle.version} + + + com.melloware + jasypt + ${dep.melloware.version} + + + commons-codec + commons-codec + ${dep.commons.codc.version} + + + commons-cli + commons-cli + ${dep.commons.cli.version} + + + commons-io + commons-io + ${dep.commons.io.version} + + + \ No newline at end of file diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/OmniDataProperty.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/OmniDataProperty.java new file mode 100644 index 00000000..5d5fae72 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/OmniDataProperty.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata; + +/** + * Property for omniData communication + * + * @since 2021-08-27 + */ +public class OmniDataProperty { + private OmniDataProperty() {} + + /** + * constant string for "grpc.client.target.list" + */ + public static final String GRPC_CLIENT_TARGET_LIST = "grpc.client.target.list"; + + /** + * delimiter string for "grpc.client.target.list" + * examples: 192.0.2.1:80,192.0.2.2:80 + */ + public static final String HOSTADDRESS_DELIMITER = ","; + + /** + * constant string for "grpc.client.target" + */ + public static final String GRPC_CLIENT_TARGET = "grpc.client.target"; + + /** + * constant string for "grpc.ssl.enabled" + */ + public static final String GRPC_SSL_ENABLED = "grpc.ssl.enabled"; + + /** + * Directory of Public Key Infrastructure. + */ + public static final String PKI_DIR = "pki.dir"; + + /** + * Path to the SSL client certificate file. + */ + public static final String GRPC_CLIENT_CERT_PATH = "grpc.client.cert.file.path"; + + /** + * Path to the SSL private key file. + */ + public static final String GRPC_CLIENT_PRIVATE_KEY_PATH = "grpc.client.private.key.file.path"; + + /** + * Path to the SSL trust certificate file. + */ + public static final String GRPC_TRUST_CA_PATH = "grpc.trust.ca.file.path"; + + /** + * Path to the SSL Certificate Revocation List file. 
+ */ + public static final String GRPC_CRL_PATH = "grpc.crl.file.path"; + +} + diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java new file mode 100644 index 00000000..ea5c2957 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.decode; + +public interface Deserializer { +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java new file mode 100644 index 00000000..98ca5c44 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.exception; + +import static com.huawei.boostkit.omnidata.exception.OmniErrorCode.OMNIDATA_GENERIC_ERROR; + +public class OmniDataException { + public OmniErrorCode getErrorCode() { + return OMNIDATA_GENERIC_ERROR; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java new file mode 100644 index 00000000..9ae2b2b6 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.exception; + +public enum OmniErrorCode { + OMNIDATA_GENERIC_ERROR, + OMNIDATA_UNSUPPORTED_OPERATOR, + OMNIDATA_INSUFFICIENT_RESOURCES, + OMNIDATA_INVALID_ARGUMENT, + OMNIDATA_IO_ERROR, + OMNIDATA_NOT_FOUND + + } diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/expression/OmniExpressionChecker.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/expression/OmniExpressionChecker.java new file mode 100644 index 00000000..8bc17c68 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/expression/OmniExpressionChecker.java @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.expression; + +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.type.Type; + +public class OmniExpressionChecker { + public static boolean checkType(Type type) { + return true; + } + + public static boolean checkExpression(RowExpression expression) + { + return true; + } + + public static boolean checkAggregateFunction(CallExpression callExpression) { + return true; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java new file mode 100644 index 00000000..93b8cdc0 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.huawei.boostkit.omnidata.model;
+
+import com.google.common.base.Objects;
+import io.prestosql.spi.relation.CallExpression;
+import io.prestosql.spi.relation.RowExpression;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+public class AggregationInfo {
+    private final Map<String, AggregateFunction> aggregations;
+    List<RowExpression> groupingKeys;
+    public AggregationInfo(Map<String, AggregateFunction> aggregations, List<RowExpression> groupingKeys) {
+        this.aggregations = aggregations;
+        this.groupingKeys = groupingKeys;
+    }
+
+    public Map<String, AggregateFunction> getAggregations() {
+        return aggregations;
+    }
+
+    public List<RowExpression> getGroupingKeys() {
+        return groupingKeys;
+    }
+
+    @Override
+    public boolean equals(Object object) {
+        if (this == object) {
+            return true;
+        }
+        if (!(object instanceof AggregationInfo)) {
+            return false;
+        }
+        AggregationInfo that = (AggregationInfo) object;
+        return aggregations.equals(that.aggregations) && groupingKeys.equals(that.groupingKeys);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hashCode(aggregations, groupingKeys);
+    }
+
+    public static class AggregateFunction {
+        private CallExpression callExpression;
+        boolean isDistinct;
+        public AggregateFunction(CallExpression callExpression, boolean isDistinct) {
+            this.callExpression = callExpression;
+            this.isDistinct = isDistinct;
+        }
+        public CallExpression getCall() {
+            return callExpression;
+        }
+        public boolean isDistinct() {
+            return isDistinct;
+        }
+
+        @Override
+        public boolean equals(Object object) {
+            if (this == object) {
+                return true;
+            }
+            if (!(object instanceof AggregateFunction)) {
+                return false;
+            }
+            AggregateFunction that = (AggregateFunction) object;
+            return callExpression.equals(that.callExpression) && isDistinct == that.isDistinct;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hashCode(callExpression, isDistinct);
+        }
+    }
+}
diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java
new file mode 100644
index 00000000..99043f20
--- /dev/null
+++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package com.huawei.boostkit.omnidata.model; + + +import io.prestosql.spi.type.Type; + +public class Column { + public Column(int fieldId, String name, Type type, boolean isPartiontionKey, Object partiontionKeyValues) { + } + + public Column(int fieldId, String name, Type type) { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java new file mode 100644 index 00000000..9250bfff --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model; + +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.type.Type; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; + +public class Predicate { + public Predicate(List types, List columns, Optional filter, List projections, + Map domainMap, Map bloomFilters, Optional aggregations, + OptionalLong limit) { + } + + public List getTypes() { + return Collections.emptyList(); + } + + public List getColumns() { + return Collections.emptyList(); + } + + public Optional getFilter() { + return Optional.empty(); + } + + public List getProjections() { + return Collections.emptyList(); + } + + public Optional getAggregations() { + return Optional.empty(); + } + +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java new file mode 100644 index 00000000..079403ca --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.huawei.boostkit.omnidata.model; + + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; + +public class TaskSource { + public static final int ONE_MEGABYTES = 1 * 1024 * 1024; + public TaskSource(DataSource dataSource, Predicate predicate, int maxPageSizeInBytes) { + } + + public DataSource getDataSource() { + return null; + } + + public Predicate getPredicate() { + return null; + } + + public int getMaxPageSizeInBytes() { + return 0; + } +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java new file mode 100644 index 00000000..33fd3430 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model.datasource; + +public class DataSource { +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java new file mode 100644 index 00000000..3c913590 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.huawei.boostkit.omnidata.model.datasource.hdfs; + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; + +public class HdfsOrcDataSource extends DataSource { + public HdfsOrcDataSource(String path, long start, long length, boolean useColumnNames) { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java new file mode 100644 index 00000000..da8d0382 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model.datasource.hdfs; + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; + +public class HdfsParquetDataSource extends DataSource { + public HdfsParquetDataSource(String path, long start, long length, boolean useColumnNames) { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsRecordDataSource.java b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsRecordDataSource.java new file mode 100644 index 00000000..a2dacfee --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsRecordDataSource.java @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
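+ *
+ * Stub of the OmniData record-based (row-oriented) HDFS data source. Unlike the
+ * ORC and Parquet variants above, its constructor also takes the total file size
+ * and a java.util.Properties object describing the table schema, presumably for
+ * formats that are read through a record reader rather than a columnar reader.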
+ */ +package com.huawei.boostkit.omnidata.model.datasource.hdfs; + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; + +import java.util.Properties; + +public class HdfsRecordDataSource extends DataSource { + public HdfsRecordDataSource(String path, long start, long length, long fileSize, Properties schema) { + } +} diff --git a/omnidata/omnidata-openlookeng-connector/stub/pom.xml b/omnidata/omnidata-openlookeng-connector/stub/pom.xml new file mode 100644 index 00000000..66f8aff7 --- /dev/null +++ b/omnidata/omnidata-openlookeng-connector/stub/pom.xml @@ -0,0 +1,83 @@ + + + 4.0.0 + + com.huawei.boostkit + omniDataStub + pom + 1.0.0 + + + core + client + + + + 1.4.0 + ${os.detected.arch} + + + + + + com.huawei.boostkit + boostkit-omnidata-client + ${project.version} + ${dep.os.arch} + + + com.huawei.boostkit + boostkit-omnidata-core + ${project.version} + ${dep.os.arch} + + + + + + + + io.hetu.core + presto-spi + ${dep.hetu.version} + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.1.2 + + + + true + true + + + + + + + jar + + + ${dep.os.arch} + + false + + + + + default-jar + none + + + + + + + \ No newline at end of file diff --git a/omnidata/omnidata-spark-connector/.gitignore b/omnidata/omnidata-spark-connector/.gitignore new file mode 100644 index 00000000..d869f46c --- /dev/null +++ b/omnidata/omnidata-spark-connector/.gitignore @@ -0,0 +1,5 @@ +.idea +target +*.iml +*-pom.xml +*.DS_Store \ No newline at end of file diff --git a/omnidata/omnidata-spark-connector/LICENSE b/omnidata/omnidata-spark-connector/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/omnidata/omnidata-spark-connector/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/omnidata/omnidata-spark-connector/README.md b/omnidata/omnidata-spark-connector/README.md index adad9fcd..99aed9af 100644 --- a/omnidata/omnidata-spark-connector/README.md +++ b/omnidata/omnidata-spark-connector/README.md @@ -1 +1,28 @@ -# omnidata-spark-connector \ No newline at end of file +# omnidata-spark-connector + + + +Introduction +============ + +The omnidata spark connector library running on Kunpeng processors is a Spark SQL plugin that pushes computing-side operators to storage nodes for computing. It is developed based on original APIs of Apache [Spark 3.0.0](https://github.com/apache/spark/tree/v3.0.0). This library applies to the big data storage separation scenario or large-scale fusion scenario where a large number of compute nodes read data from remote nodes. In this scenario, a large amount of raw data is transferred from storage nodes to compute nodes over the network for processing, resulting in a low proportion of valid data and a huge waste of network bandwidth. 
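+
+As a minimal sketch of what a pushed-down request looks like (illustrative only:
+it uses the stub model classes bundled with this patch, not necessarily the
+production OmniData API, and the file path and sizes below are made up), the
+connector describes each file split, the pushed-down predicate and the page size
+per fetch roughly as follows:
+
+    import java.util.Collections;
+    import java.util.Optional;
+    import java.util.OptionalLong;
+    import com.google.common.collect.ImmutableMap;
+    import com.huawei.boostkit.omnidata.model.Predicate;
+    import com.huawei.boostkit.omnidata.model.TaskSource;
+    import com.huawei.boostkit.omnidata.model.datasource.DataSource;
+    import com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource;
+
+    // Hypothetical ORC split; real values come from the Spark scan (see DataIoAdapter).
+    DataSource dataSource = new HdfsOrcDataSource("/warehouse/t/part-00000.orc", 0L, 134217728L, false);
+    Predicate predicate = new Predicate(
+        Collections.emptyList(),   // output types
+        Collections.emptyList(),   // columns to read
+        Optional.empty(),          // pushed-down filter expression
+        Collections.emptyList(),   // projections
+        ImmutableMap.of(),         // column domains
+        ImmutableMap.of(),         // bloom filters
+        Optional.empty(),          // partial aggregations
+        OptionalLong.empty());     // limit
+    TaskSource taskSource = new TaskSource(dataSource, predicate, TaskSource.ONE_MEGABYTES);
+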
You can find the latest documentation, including a programming guide, on the project web page. This README file only contains basic setup instructions. + + +Building And Packageing +==================== + +(1) Build the project under the "omnidata-spark-connector" directory: + + mvn clean package + +(2) Obtain "boostkit-omnidata-spark-sql_2.12-3.0.0-1.0.0.jar" under the "omnidata-spark-connector/connector/target" directory. + +Contribution Guidelines +======== + +Track the bugs and feature requests via GitHub [issues](https://github.com/kunpengcompute/omnidata-spark-connector/issues). + +More Information +======== + +For further assistance, send an email to kunpengcompute@huawei.com. diff --git a/omnidata/omnidata-spark-connector/connector/pom.xml b/omnidata/omnidata-spark-connector/connector/pom.xml new file mode 100644 index 00000000..fa632881 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/pom.xml @@ -0,0 +1,144 @@ + + + + org.apache.spark + omnidata-spark-connector-root + 1.0.0 + + + 4.0.0 + boostkit-omnidata-spark-sql_2.12-3.0.0 + 1.0.0 + boostkit omnidata spark sql + 2021 + jar + + + 1.8 + 1.8 + UTF-8 + 2.12 + 3.0.0 + 2.12.0 + 1.4.0 + 1.35.0 + + + + org.apache.spark + spark-hive_2.12 + ${spark.version} + compile + + + io.hetu.core + presto-spi + ${dep.hetu.version} + + + io.hetu.core + presto-main + ${dep.hetu.version} + + + org.slf4j + slf4j-jdk14 + + + org.slf4j + log4j-over-slf4j + + + + + com.huawei.boostkit + boostkit-omnidata-client-stub + 1.0.0 + compile + + + com.huawei.boostkit + boostkit-omnidata-core-stub + 1.0.0 + compile + + + io.airlift + slice + 0.38 + + + org.apache.curator + curator-framework + ${dep.curator.version} + + + org.apache.curator + curator-recipes + ${dep.curator.version} + + + + src/main/scala + + + org.codehaus.mojo + build-helper-maven-plugin + 3.0.0 + + + generate-sources + + add-source + + + + src/main/java + + + + + + + org.scala-tools + maven-scala-plugin + 2.15.2 + + + scala-compile-first + process-resources + + add-source + compile + + + + compile + + compile + + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.5 + + + + jar + + + + default-jar + none + + + + + + + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/DataIoAdapter.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/DataIoAdapter.java new file mode 100644 index 00000000..eeb83700 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/DataIoAdapter.java @@ -0,0 +1,883 @@ +package org.apache.spark.sql; + +import static io.prestosql.spi.function.FunctionKind.AGGREGATE; +import static io.prestosql.spi.function.FunctionKind.SCALAR; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; + +import com.huawei.boostkit.omnidata.exception.OmniDataException; +import com.huawei.boostkit.omnidata.exception.OmniErrorCode; +import com.huawei.boostkit.omnidata.model.AggregationInfo; +import com.huawei.boostkit.omnidata.model.Column; +import com.huawei.boostkit.omnidata.model.Predicate; +import com.huawei.boostkit.omnidata.model.TaskSource; +import com.huawei.boostkit.omnidata.model.datasource.DataSource; +import com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource; +import com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource; +import com.huawei.boostkit.omnidata.reader.impl.DataReaderImpl; +import com.huawei.boostkit.omnidata.spark.SparkDeserializer; +import 
com.huawei.boostkit.omnidata.type.DecodeType; +import com.huawei.boostkit.omnidata.type.LongDecodeType; +import com.huawei.boostkit.omnidata.type.RowDecodeType; + +import com.google.common.collect.ImmutableMap; + +import io.prestosql.spi.connector.QualifiedObjectName; +import io.prestosql.spi.function.BuiltInFunctionHandle; +import io.prestosql.spi.function.FunctionHandle; +import io.prestosql.spi.function.FunctionKind; +import io.prestosql.spi.function.Signature; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.ConstantExpression; +import io.prestosql.spi.relation.InputReferenceExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.SpecialForm; +import io.prestosql.spi.type.BigintType; +import io.prestosql.spi.type.DoubleType; +import io.prestosql.spi.type.RowType; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignature; +import scala.collection.JavaConverters; +import scala.collection.Seq; + +import org.apache.hadoop.hive.ql.exec.TaskExecutionException; +import org.apache.spark.sql.catalyst.expressions.Add; +import org.apache.spark.sql.catalyst.expressions.And; +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.catalyst.expressions.AttributeReference; +import org.apache.spark.sql.catalyst.expressions.BinaryArithmetic; +import org.apache.spark.sql.catalyst.expressions.Cast; +import org.apache.spark.sql.catalyst.expressions.Divide; +import org.apache.spark.sql.catalyst.expressions.EqualTo; +import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.catalyst.expressions.GreaterThan; +import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual; +import org.apache.spark.sql.catalyst.expressions.In; +import org.apache.spark.sql.catalyst.expressions.IsNotNull; +import org.apache.spark.sql.catalyst.expressions.IsNull; +import org.apache.spark.sql.catalyst.expressions.LessThan; +import org.apache.spark.sql.catalyst.expressions.LessThanOrEqual; +import org.apache.spark.sql.catalyst.expressions.Literal; +import org.apache.spark.sql.catalyst.expressions.Multiply; +import org.apache.spark.sql.catalyst.expressions.NamedExpression; +import org.apache.spark.sql.catalyst.expressions.Not; +import org.apache.spark.sql.catalyst.expressions.Or; +import org.apache.spark.sql.catalyst.expressions.Remainder; +import org.apache.spark.sql.catalyst.expressions.Subtract; +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction; +import org.apache.spark.sql.execution.datasources.PartitionedFile; +import org.apache.spark.sql.execution.ndp.AggExeInfo; +import org.apache.spark.sql.execution.ndp.FilterExeInfo; +import org.apache.spark.sql.execution.ndp.PushDownInfo; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; +import org.apache.spark.sql.hive.HiveSimpleUDF; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.sql.Date; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.Properties; +import java.util.Set; + +/** + * DataIoAdapter + * + * @since 2021-03-31 + */ +public class DataIoAdapter { + private int TASK_FAILED_TIMES = 3; + + private List 
omnidataTypes = new ArrayList<>(); + + private List omnidataColumns = new ArrayList<>(); + + private Optional prestoFilter = Optional.empty(); + + private Set columnNameSet = new HashSet<>(); + + private List omnidataProjections = new ArrayList<>(); + + private boolean hasNextPage = false; + + private DataReaderImpl orcDataReader = null; + + private List columnTypesList = new ArrayList<>(); + + private List columnOrdersList = new ArrayList<>(); + + private List filterTypesList = new ArrayList<>(); + + private List filterOrdersList = new ArrayList<>(); + + private Map fieldMap = new HashMap<>(); + + private Map columnNameMap = new HashMap<>(); + + private Set partitionColumnName = new HashSet<>(); + + private List listAtt = new ArrayList<>(); + + private int columnOffset = 0; + + private String filePath = ""; + + private int columnOrder = 0; + + private NdpUdfExpressions ndpUdfExpressions = new NdpUdfExpressions(); + + private static final Logger LOG = LoggerFactory.getLogger(DataIoAdapter.class); + + /** + * Contact with Omni-Data-Server + * @param pageCandidate file split info + * @param sparkOutPut data schema + * @param partitionColumn partition column + * @param filterOutPut filter schema + * @param pushDownOperators push down expressions + * @param omniDataProperties auth properties + * @return WritableColumnVector data result info + * @throws TaskExecutionException connect to omni-data-server failed exception + * @notice 3rd parties api throws Exception, function has to catch basic Exception + */ + public Iterator getPageIterator( + PageCandidate pageCandidate, + Seq sparkOutPut, + Seq partitionColumn, + Seq filterOutPut, + PushDownInfo pushDownOperators, + OmniDataProperties omniDataProperties) throws TaskExecutionException, UnknownHostException { + // initCandidates + initCandidates(pageCandidate, filterOutPut); + + // create AggregationInfo + // init agg candidates + List partitionColumnBatch = JavaConverters.seqAsJavaList(partitionColumn); + for (Attribute attribute : partitionColumnBatch) { + partitionColumnName.add(attribute.name()); + } + List aggExecutionList = + JavaConverters.seqAsJavaList(pushDownOperators.aggExecutions()); + if (aggExecutionList.size() == 0) { + initColumnInfo(sparkOutPut); + } + DataSource dataSource = initDataSource(pageCandidate); + RowExpression rowExpression = initFilter(pushDownOperators.filterExecutions()); + prestoFilter = rowExpression == null ? 
+ Optional.empty() : Optional.of(rowExpression); + Optional aggregations = + initAggAndGroupInfo(aggExecutionList); + // create limitLong + OptionalLong limitLong = NdpUtils.convertLimitExeInfo(pushDownOperators.limitExecution()); + + Predicate predicate = new Predicate( + omnidataTypes, omnidataColumns, prestoFilter, omnidataProjections, + ImmutableMap.of(), ImmutableMap.of(), aggregations, limitLong); + TaskSource taskSource = new TaskSource(dataSource, predicate, 1048576); + SparkDeserializer deserializer = initSparkDesializer(); + Properties properties = NdpUtils.getProperties(omniDataProperties); + WritableColumnVector[] page = null; + int failedTimes = 0; + String[] sdiHostArray = pageCandidate.getSdiHosts().split(","); + int random_index = (int) (Math.random() * sdiHostArray.length); + Iterator sdiHosts = Arrays.stream(sdiHostArray).iterator(); + Set sdiHostSet = new HashSet<>(); + sdiHostSet.add(sdiHostArray[random_index]); + while (sdiHosts.hasNext()) { + String sdiHost; + if (failedTimes == 0) { + sdiHost = sdiHostArray[random_index]; + } else { + sdiHost = sdiHosts.next(); + if (sdiHostSet.contains(sdiHost)) { + continue; + } + } + String ipAddress = InetAddress.getByName(sdiHost).getHostAddress(); + properties.put("grpc.client.target", ipAddress + ":" + pageCandidate.getSdiPort()); + try { + orcDataReader = new DataReaderImpl( + properties, taskSource, deserializer); + hasNextPage = true; + page = (WritableColumnVector[]) orcDataReader.getNextPageBlocking(); + if (orcDataReader.isFinished()) { + orcDataReader.close(); + hasNextPage = false; + } + break; + } catch (OmniDataException omniDataException) { + OmniErrorCode errorCode = omniDataException.getErrorCode(); + switch (errorCode) { + case OMNIDATA_INSUFFICIENT_RESOURCES: + LOG.warn("OMNIDATA_INSUFFICIENT_RESOURCES: " + + "OmniData-server's push down queue is full, " + + "begin to find next OmniData-server"); + break; + case OMNIDATA_UNSUPPORTED_OPERATOR: + LOG.warn("OMNIDATA_UNSUPPORTED_OPERATOR: " + + "OmniDataException: exist unsupported operator"); + break; + case OMNIDATA_GENERIC_ERROR: + LOG.warn("OMNIDATA_GENERIC_ERROR: Current OmniData-server unavailable, " + + "begin to find next OmniData-server"); + break; + case OMNIDATA_NOT_FOUND: + LOG.warn("OMNIDATA_NOT_FOUND: Current OmniData-Server not found, " + + "begin to find next OmniData-server"); + break; + case OMNIDATA_INVALID_ARGUMENT: + LOG.warn("OMNIDATA_INVALID_ARGUMENT: INVALID_ARGUMENT, " + + "exist unsupported operator or dataType"); + break; + case OMNIDATA_IO_ERROR: + LOG.warn("OMNIDATA_IO_ERROR: Current OmniData-Server io exception, " + + "begin to find next OmniData-server"); + break; + default: + LOG.warn("OmniDataException: OMNIDATA_ERROR."); + } + LOG.warn("Failed host name is : {}.", sdiHost); + ++failedTimes; + } catch (Exception e) { + LOG.warn("Failed host name is : {}.", sdiHost); + ++failedTimes; + } + } + int retryTime = Math.min(TASK_FAILED_TIMES, sdiHostArray.length); + if (failedTimes >= retryTime) { + LOG.warn("No Omni-data-server to Connect, Task has tried {} times.", retryTime); + throw new TaskExecutionException("No Omni-data-server to Connect"); + } + List l = new ArrayList<>(); + l.add(page); + return l.iterator(); + } + + public boolean hasnextIterator(List pageList, PageToColumnar pageToColumnarClass, + PartitionedFile partitionFile, boolean isVectorizedReader) + throws Exception { + if (!hasNextPage) { + return hasNextPage; + } + WritableColumnVector[] page = (WritableColumnVector[]) orcDataReader.getNextPageBlocking(); + if 
(orcDataReader.isFinished()) { + orcDataReader.close(); + return false; + } + List l = new ArrayList<>(); + l.add(page); + pageList.addAll(pageToColumnarClass + .transPageToColumnar(l.iterator(), isVectorizedReader)); + return true; + } + + private void initCandidates(PageCandidate pageCandidate, Seq filterOutPut) { + omnidataTypes.clear(); + omnidataColumns.clear(); + omnidataProjections.clear(); + fieldMap.clear(); + columnNameSet.clear(); + columnTypesList.clear(); + columnOrdersList.clear(); + filterTypesList.clear(); + filterOrdersList.clear(); + partitionColumnName.clear(); + columnNameMap.clear(); + columnOrder = 0; + filePath = pageCandidate.getFilePath(); + columnOffset = pageCandidate.getColumnOffset(); + listAtt = JavaConverters.seqAsJavaList(filterOutPut); + } + + private RowExpression extractNamedExpression(Expression namedExpression) { + Type prestoType = NdpUtils.transOlkDataType(namedExpression.dataType(), false); + int aggProjectionId; + String aggColumnName = namedExpression.toString().split("#")[0]; + columnOrdersList.add(columnOrder++); + columnTypesList.add(NdpUtils.transDataIoDataType(namedExpression.dataType())); + + if (null != fieldMap.get(aggColumnName)) { + aggProjectionId = fieldMap.get(aggColumnName); + } else { + int columnId = NdpUtils + .getColumnId(namedExpression.toString()) - columnOffset; + aggProjectionId = fieldMap.size(); + fieldMap.put(aggColumnName, aggProjectionId); + omnidataTypes.add(prestoType); + boolean isPartitionKey = partitionColumnName.contains(aggColumnName); + String partitionValue = NdpUtils.getPartitionValue(filePath, aggColumnName); + omnidataColumns.add(new Column(columnId, aggColumnName, + prestoType, isPartitionKey, partitionValue)); + columnNameSet.add(aggColumnName); + if (null == columnNameMap.get(aggColumnName)) { + columnNameMap.put(aggColumnName, columnNameMap.size()); + } + omnidataProjections.add(new InputReferenceExpression(aggProjectionId, prestoType)); + } + + return new InputReferenceExpression(aggProjectionId, prestoType); + } + + private void extractSumMaxMinAggregation(Type prestoType, String expressionName, + Map aggregationMap) { + String operatorName = expressionName.split("\\(")[0]; + List arguments = new ArrayList<>(); + Type returnType = NdpUtils.transAggRetType(prestoType); + if(operatorName.equals("sum")) { + columnTypesList.add(NdpUtils.transAggDecodeType(returnType)); + } else { + columnTypesList.add(NdpUtils.transAggDecodeType(prestoType)); + } + FunctionHandle functionHandle = new BuiltInFunctionHandle( + new Signature(QualifiedObjectName.valueOfDefaultFunction(operatorName), AGGREGATE, + prestoType.getTypeSignature(), prestoType.getTypeSignature())); + int aggProjectionId = fieldMap.get(expressionName); + RowExpression rowExpression = new InputReferenceExpression(aggProjectionId, prestoType); + arguments.add(rowExpression); + CallExpression callExpression = new CallExpression(operatorName, + functionHandle, returnType, arguments, Optional.empty()); + aggregationMap.put(String.format("%s_%s", operatorName, columnOrder), + new AggregationInfo.AggregateFunction(callExpression, false)); + } + + private void extractAvgAggregation(Type prestoType, String expressionName, + Map aggregationMap) { + List arguments = new ArrayList<>(); + int aggProjectionId = fieldMap.get(expressionName); + RowExpression rowExpression = new InputReferenceExpression(aggProjectionId, prestoType); + arguments.add(rowExpression); + FunctionHandle functionHandle = new BuiltInFunctionHandle( + new 
Signature(QualifiedObjectName.valueOfDefaultFunction("avg"), AGGREGATE, + DoubleType.DOUBLE.getTypeSignature(), prestoType.getTypeSignature())); + List rowType = Arrays.asList(DoubleType.DOUBLE, BigintType.BIGINT); + RowType returnType = RowType.anonymous(rowType); + CallExpression callExpression = new CallExpression("avg", + functionHandle, returnType, arguments, Optional.empty()); + aggregationMap.put(String.format("%s_%s", "avg", columnOrder), + new AggregationInfo.AggregateFunction(callExpression, false)); + } + + private void extractCountAggregation(Type prestoType, String expressionName, + Map aggregationMap) { + List arguments = new ArrayList<>(); + Signature signature = new Signature(QualifiedObjectName.valueOfDefaultFunction("count"), + AGGREGATE, BIGINT.getTypeSignature()); + if (!expressionName.equals("count(1)")) { + int aggProjectionId = fieldMap.get(expressionName); + RowExpression rowExpression = new InputReferenceExpression(aggProjectionId, prestoType); + arguments.add(rowExpression); + signature = new Signature(QualifiedObjectName.valueOfDefaultFunction("count"), + AGGREGATE, BIGINT.getTypeSignature(), prestoType.getTypeSignature()); + } + FunctionHandle functionHandle = new BuiltInFunctionHandle(signature); + CallExpression callExpression = new CallExpression("count", + functionHandle, BIGINT, arguments, Optional.empty()); + aggregationMap.put(String.format("%s_%s", "count", columnOrder), + new AggregationInfo.AggregateFunction(callExpression, false)); + } + + private CallExpression createAggBinCall(BinaryArithmetic expression, + String operatorName, Type prestoType) { + List arguments = new ArrayList<>(); + Type leftPrestoType = NdpUtils.transOlkDataType( + expression.left().dataType(), false); + Type rightPrestoType = NdpUtils.transOlkDataType( + expression.right().dataType(), false); + FunctionHandle functionHandle = new BuiltInFunctionHandle( + new Signature(QualifiedObjectName.valueOfDefaultFunction("$operator$" + operatorName), + SCALAR, prestoType.getTypeSignature(), + leftPrestoType.getTypeSignature(), + rightPrestoType.getTypeSignature())); + arguments.add(createAggProjection(expression.left())); + arguments.add(createAggProjection(expression.right())); + return new CallExpression(operatorName, functionHandle, + prestoType, arguments, Optional.empty()); + } + + private RowExpression createAggProjection(Expression expression) { + Type prestoType = NdpUtils.transOlkDataType(expression.dataType(), false); + AggExpressionType aggExpressionType = AggExpressionType + .valueOf(expression.getClass().getSimpleName()); + switch (aggExpressionType) { + case Add: + return createAggBinCall((Add) expression, "Add", prestoType); + case Subtract: + return createAggBinCall((Subtract) expression, "Subtract", prestoType); + case Multiply: + return createAggBinCall((Multiply) expression, "Multiply", prestoType); + case Divide: + return createAggBinCall((Divide) expression, "Divide", prestoType); + case Remainder: + return createAggBinCall((Remainder) expression, "Remainder", prestoType); + case Literal: + Object value = NdpUtils.transData( + expression.dataType().toString(), expression.toString()); + return new ConstantExpression(value, prestoType); + case AttributeReference: + String aggColumnName = expression.toString().split("#")[0]; + int field; + if (null == columnNameMap.get(aggColumnName)) { + field = columnNameMap.size(); + columnNameMap.put(aggColumnName, field); + int columnId = NdpUtils + .getColumnId(expression.toString()) - columnOffset; + boolean isPartitionKey = 
partitionColumnName.contains(aggColumnName); + String partitionValue = NdpUtils + .getPartitionValue(filePath, aggColumnName); + omnidataColumns.add( + new Column(columnId, aggColumnName, + prestoType, isPartitionKey, partitionValue)); + } else { + field = columnNameMap.get(aggColumnName); + } + return new InputReferenceExpression(field, prestoType); + + default: + throw new UnsupportedOperationException("unsupported agg operation type"); + } + } + + enum AggExpressionType { + Multiply, + Add, + Subtract, + Divide, + Remainder, + Literal, + AttributeReference + } + + private void extractAggregateFunction(AggregateFunction aggregateFunction, + Map aggregationMap) { + List expressions = JavaConverters.seqAsJavaList(aggregateFunction.children()); + String aggregateFunctionName = aggregateFunction.toString(); + Type prestoType = NdpUtils.transOlkDataType(aggregateFunction.dataType(), false); + AggregateFunctionType aggregateFunctionType = AggregateFunctionType.valueOf( + aggregateFunction.getClass().getSimpleName()); + for (Expression expression : expressions) { + if(!(expression instanceof Literal)){ + omnidataProjections.add(createAggProjection(expression)); + int projectionId = fieldMap.size(); + fieldMap.put(aggregateFunctionName, projectionId); + if (aggregateFunctionType.equals(AggregateFunctionType.Count)) { + prestoType = NdpUtils.transOlkDataType(expression.dataType(), false); + } + omnidataTypes.add(prestoType); + break; + } + } + columnOrdersList.add(columnOrder++); + switch (aggregateFunctionType) { + case Sum: + case Max: + case Min: + extractSumMaxMinAggregation(prestoType, aggregateFunctionName, aggregationMap); + break; + case Average: + columnTypesList.add(new RowDecodeType()); + columnOrdersList.add(columnOrder++); + extractAvgAggregation(prestoType, aggregateFunctionName, aggregationMap); + break; + case Count: + columnTypesList.add(new LongDecodeType()); + extractCountAggregation(prestoType, aggregateFunctionName, aggregationMap); + break; + } + } + + enum AggregateFunctionType { + Sum, + Average, + Max, + Min, + Count + } + + enum ExpressionOperator { + And, + Or, + Not, + EqualTo, + IsNotNull, + LessThan, + GreaterThan, + GreaterThanOrEqual, + LessThanOrEqual, + In, + HiveSimpleUDF, + IsNull + } + + private Optional createAggregationInfo( + List aggregateFunctions, + List namedExpressions) { + List groupingKeys = new ArrayList<>(); + Map aggregationMap = new LinkedHashMap<>(); + boolean isEmpty = true; + for (NamedExpression namedExpression : namedExpressions) { + RowExpression groupingKey = extractNamedExpression((Expression) namedExpression); + groupingKeys.add(groupingKey); + isEmpty = false; + } + for (AggregateFunction aggregateFunction : aggregateFunctions) { + extractAggregateFunction(aggregateFunction, aggregationMap); + isEmpty = false; + } + return isEmpty ? 
Optional.empty() : Optional.of( + new AggregationInfo(aggregationMap, groupingKeys)); + } + + private Optional extractAggAndGroupExpression( + List aggExecutionList) { + Optional resAggregationInfo = Optional.empty(); + for (AggExeInfo aggExeInfo : aggExecutionList) { + List aggregateExpressions = JavaConverters.seqAsJavaList( + aggExeInfo.aggregateExpressions()); + List namedExpressions = JavaConverters.seqAsJavaList( + aggExeInfo.groupingExpressions()); + resAggregationInfo = createAggregationInfo(aggregateExpressions, namedExpressions); + } + return resAggregationInfo; + } + + private RowExpression extractFilterExpression(Seq filterExecution) { + List filterExecutionList = JavaConverters.seqAsJavaList(filterExecution); + RowExpression resRowExpression = null; + for (FilterExeInfo filterExeInfo : filterExecutionList) { + resRowExpression = reverseExpressionTree(filterExeInfo.filter()); + } + return resRowExpression; + } + + private RowExpression reverseExpressionTree(Expression filterExpression) { + RowExpression resRowExpression = null; + if (filterExpression == null) { + return resRowExpression; + } + List tempRowExpression = new ArrayList<>(); + if (filterExpression instanceof Or) { + RowExpression a1 = getExpression(((Or) filterExpression).left()); + RowExpression a2 = getExpression(((Or) filterExpression).right()); + tempRowExpression.add(a1); + tempRowExpression.add(a2); + resRowExpression = new SpecialForm(SpecialForm.Form.valueOf("OR"), + BOOLEAN, tempRowExpression); + } else if (filterExpression instanceof And) { + RowExpression a1 = getExpression(((And) filterExpression).left()); + RowExpression a2 = getExpression(((And) filterExpression).right()); + tempRowExpression.add(a2); + tempRowExpression.add(a1); + resRowExpression = new SpecialForm(SpecialForm.Form.valueOf("AND"), + BOOLEAN, tempRowExpression); + } else { + resRowExpression = getExpression(filterExpression); + } + return resRowExpression; + } + + private RowExpression getExpression(Expression filterExpression) { + RowExpression resRowExpression = null; + List rightExpressions = new ArrayList<>(); + ExpressionOperator expressionOperType = + ExpressionOperator.valueOf(filterExpression.getClass().getSimpleName()); + Expression left; + Expression right; + String operatorName; + switch (expressionOperType) { + case Or: + case And: + return reverseExpressionTree(filterExpression); + case Not: + Signature notSignature = new Signature( + QualifiedObjectName.valueOfDefaultFunction("not"), + FunctionKind.SCALAR, new TypeSignature("boolean"), + new TypeSignature("boolean")); + RowExpression tempRowExpression = getExpression(((Not) filterExpression).child()); + List notArguments = new ArrayList<>(); + notArguments.add(tempRowExpression); + return new CallExpression("not", new BuiltInFunctionHandle(notSignature), + BOOLEAN, notArguments, Optional.empty()); + case EqualTo: + if (((EqualTo) filterExpression).left() instanceof Literal) { + rightExpressions.add(((EqualTo) filterExpression).left()); + left = ((EqualTo) filterExpression).right(); + } else { + rightExpressions.add(((EqualTo) filterExpression).right()); + left = ((EqualTo) filterExpression).left(); + } + return getRowExpression(left, + "equal", rightExpressions); + case IsNotNull: + Signature isnullSignature = new Signature( + QualifiedObjectName.valueOfDefaultFunction("not"), + FunctionKind.SCALAR, new TypeSignature("boolean"), + new TypeSignature("boolean")); + RowExpression isnullRowExpression = + getRowExpression(((IsNotNull) filterExpression).child(), + "is_null", 
null); + List isnullArguments = new ArrayList<>(); + isnullArguments.add(isnullRowExpression); + return new CallExpression("not", new BuiltInFunctionHandle(isnullSignature), + BOOLEAN, isnullArguments, Optional.empty()); + case IsNull: + return getRowExpression(((IsNull) filterExpression).child(), + "is_null", null); + case LessThan: + if (((LessThan) filterExpression).left() instanceof Literal) { + rightExpressions.add(((LessThan) filterExpression).left()); + left = ((LessThan) filterExpression).right(); + operatorName = "greater_than"; + } else { + rightExpressions.add(((LessThan) filterExpression).right()); + left = ((LessThan) filterExpression).left(); + operatorName = "less_than"; + } + return getRowExpression(left, + operatorName, rightExpressions); + case GreaterThan: + if (((GreaterThan) filterExpression).left() instanceof Literal) { + rightExpressions.add(((GreaterThan) filterExpression).left()); + left = ((GreaterThan) filterExpression).right(); + operatorName = "less_than"; + } else { + rightExpressions.add(((GreaterThan) filterExpression).right()); + left = ((GreaterThan) filterExpression).left(); + operatorName = "greater_than"; + } + return getRowExpression(left, + operatorName, rightExpressions); + case GreaterThanOrEqual: + if (((GreaterThanOrEqual) filterExpression).left() instanceof Literal) { + rightExpressions.add(((GreaterThanOrEqual) filterExpression).left()); + left = ((GreaterThanOrEqual) filterExpression).right(); + operatorName = "less_than_or_equal"; + } else { + rightExpressions.add(((GreaterThanOrEqual) filterExpression).right()); + left = ((GreaterThanOrEqual) filterExpression).left(); + operatorName = "greater_than_or_equal"; + } + return getRowExpression(left, + operatorName, rightExpressions); + case LessThanOrEqual: + if (((LessThanOrEqual) filterExpression).left() instanceof Literal) { + rightExpressions.add(((LessThanOrEqual) filterExpression).left()); + left = ((LessThanOrEqual) filterExpression).right(); + operatorName = "greater_than_or_equal"; + } else { + rightExpressions.add(((LessThanOrEqual) filterExpression).right()); + left = ((LessThanOrEqual) filterExpression).left(); + operatorName = "less_than_or_equal"; + } + return getRowExpression(left, + operatorName, rightExpressions); + case In: + List rightExpression = + JavaConverters.seqAsJavaList(((In) filterExpression).list()); + return getRowExpression(((In) filterExpression).value(), "in", rightExpression); + case HiveSimpleUDF: + return getRowExpression(filterExpression, + ((HiveSimpleUDF) filterExpression).name(), rightExpressions); + default: + return resRowExpression; + } + } + + private RowExpression getRowExpression(Expression leftExpression, String operatorName, + List rightExpression) { + String signatureName = operatorName; + PrestoExpressionInfo expressionInfo = new PrestoExpressionInfo(); + Type prestoType; + int filterProjectionId; + // deal with left expression only UDF and Attribute + if (leftExpression instanceof AttributeReference) { + prestoType = NdpUtils.transOlkDataType(leftExpression.dataType(), false); + filterProjectionId = putFilterValue(leftExpression, prestoType); + } else if (leftExpression instanceof Cast && (operatorName.equals("in") + || leftExpression.dataType().toString().toLowerCase(Locale.ENGLISH).equals("datetype"))) { + prestoType = NdpUtils.transOlkDataType(((Cast) leftExpression).child().dataType(), false); + filterProjectionId = putFilterValue(((Cast) leftExpression).child(), prestoType); + } else { + ndpUdfExpressions.createNdpUdf(leftExpression, 
expressionInfo, fieldMap); + putFilterValue(expressionInfo.getChildExpression(), expressionInfo.getFieldDataType()); + prestoType = expressionInfo.getReturnType(); + filterProjectionId = expressionInfo.getProjectionId(); + } + // deal with right expression + List argumentValues = new ArrayList<>(); + List multiArguments = new ArrayList<>(); + int rightProjectionId = -1; + RowExpression rowExpression; + if (rightExpression != null && rightExpression.size() > 0 && + rightExpression.get(0) instanceof AttributeReference) { + rightProjectionId = putFilterValue(rightExpression.get(0), prestoType); + multiArguments.add(new InputReferenceExpression(filterProjectionId, prestoType)); + multiArguments.add(new InputReferenceExpression(rightProjectionId, prestoType)); + rowExpression = NdpFilterUtils.generateRowExpression( + signatureName, expressionInfo, prestoType, filterProjectionId, + null, multiArguments, "multy_columns"); + } else { + // get right value + if (NdpUtils.isInDateExpression(leftExpression, operatorName)) { + argumentValues = getDateValue(rightExpression); + } else { + argumentValues = getValue(rightExpression, signatureName, + leftExpression.dataType().toString()); + } + rowExpression = NdpFilterUtils.generateRowExpression( + signatureName, expressionInfo, prestoType, filterProjectionId, + argumentValues, null, signatureName); + } + return rowExpression; + } + + // column projection赋值 + private int putFilterValue(Expression valueExpression, Type prestoType) { + // Filter赋值 + int columnId = NdpUtils.getColumnId(valueExpression.toString()) - columnOffset; + String filterColumnName = valueExpression.toString().split("#")[0]; + if (null != fieldMap.get(filterColumnName)) { + return fieldMap.get(filterColumnName); + } + boolean isPartitionKey = partitionColumnName.contains(filterColumnName); + int filterProjectionId = fieldMap.size(); + fieldMap.put(filterColumnName, filterProjectionId); + filterTypesList.add(NdpUtils.transDataIoDataType(valueExpression.dataType())); + filterOrdersList.add(filterProjectionId); + String partitionValue = NdpUtils.getPartitionValue(filePath, filterColumnName); + columnNameSet.add(filterColumnName); + omnidataProjections.add(new InputReferenceExpression(filterProjectionId, prestoType)); + omnidataColumns.add(new Column(columnId, filterColumnName, + prestoType, isPartitionKey, partitionValue)); + omnidataTypes.add(prestoType); + if (null == columnNameMap.get(filterColumnName)) { + columnNameMap.put(filterColumnName, columnNameMap.size()); + } + return filterProjectionId; + } + + // for date parse + private List getDateValue(List rightExpression) { + long DAY_TO_MILL_SECS = 24L * 3600L * 1000L; + List dateTimes = new ArrayList<>(); + for (Expression rExpression: rightExpression) { + String dateStr = rExpression.toString(); + if (NdpUtils.isValidDateFormat(dateStr)) { + String[] dateStrArray = dateStr.split("-"); + int year = Integer.parseInt(dateStrArray[0]) - 1900; + int month = Integer.parseInt(dateStrArray[1]) - 1; + int day = Integer.parseInt(dateStrArray[2]); + Date date = new Date(year, month, day); + dateTimes.add(String.valueOf((date.getTime() - date.getTimezoneOffset() * 60000L) / DAY_TO_MILL_SECS)); + } else { + throw new UnsupportedOperationException("decode date failed: " + dateStr); + } + } + return dateTimes; + } + + private List getValue(List rightExpression, + String operatorName, + String sparkType) { + Object objectValue; + List argumentValues = new ArrayList<>(); + if (null == rightExpression || rightExpression.size() == 0) { + return 
argumentValues; + } + switch (operatorName.toLowerCase(Locale.ENGLISH)) { + case "in": + List inValue = new ArrayList<>(); + for (Expression rExpression : rightExpression) { + inValue.add(rExpression.toString()); + } + argumentValues = inValue; + break; + default: + argumentValues.add(rightExpression.get(0).toString()); + break; + } + return argumentValues; + } + + private SparkDeserializer initSparkDesializer() { + DecodeType[] columnTypes = columnTypesList.toArray(new DecodeType[0]); + int[] columnOrders = columnOrdersList.stream().mapToInt(Integer::intValue).toArray(); + DecodeType[] filterTypes = filterTypesList.toArray(new DecodeType[0]); + int[] filterOrders = filterOrdersList.stream().mapToInt(Integer::intValue).toArray(); + SparkDeserializer deserializer; + if (columnTypes.length == 0) { + deserializer = new SparkDeserializer(filterTypes, filterOrders); + } else { + deserializer = new SparkDeserializer(columnTypes, columnOrders); + } + return deserializer; + } + + private DataSource initDataSource(PageCandidate pageCandidate) + throws UnsupportedOperationException { + DataSource dataSource; + String fileFormat = pageCandidate.getFileFormat(); + Long fileStartPos = pageCandidate.getStartPos(); + Long fileLen = pageCandidate.getSplitLen(); + if ("ORC".equalsIgnoreCase(fileFormat)) { + dataSource = new HdfsOrcDataSource(filePath, fileStartPos, fileLen, false); + } else if ("PARQUET".equalsIgnoreCase(fileFormat)) { + dataSource = new HdfsParquetDataSource(filePath, fileStartPos, fileLen, false); + } else { + throw new UnsupportedOperationException("unsupported data format : " + fileFormat); + } + return dataSource; + } + + private RowExpression initFilter(Seq filterExecutions) { + RowExpression rowExpression = extractFilterExpression(filterExecutions); + return rowExpression; + } + + private Optional initAggAndGroupInfo( + List aggExecutionList) { + // create AggregationInfo + Optional aggregationInfo = extractAggAndGroupExpression(aggExecutionList); + return aggregationInfo; + } + + private void initColumnInfo(Seq sparkOutPut) { + if (listAtt == null || listAtt.size() == 0) { + return; + } + + List outputColumnList = JavaConverters.seqAsJavaList(sparkOutPut); + boolean isPartitionKey; + int filterColumnId = 0; + for (int p = 0; p < outputColumnList.size(); p++) { + Attribute resAttribute = NdpUtils.getColumnAttribute(outputColumnList.get(p), listAtt); + String columnName = resAttribute.name(); + Type type = NdpUtils.transOlkDataType(resAttribute.dataType(), false); + int columnId = NdpUtils.getColumnId(resAttribute.toString()) - columnOffset; + isPartitionKey = partitionColumnName.contains(columnName); + String partitionValue = NdpUtils.getPartitionValue(filePath, columnName); + omnidataColumns.add(new Column(columnId, + columnName, type, isPartitionKey, partitionValue)); + omnidataTypes.add(type); + filterTypesList.add(NdpUtils.transDataIoDataType(resAttribute.dataType())); + filterOrdersList.add(filterColumnId); + omnidataProjections.add(new InputReferenceExpression(filterColumnId, type)); + fieldMap.put(columnName, filterColumnId); + ++filterColumnId; + } + } +} + + + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpFilterUtils.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpFilterUtils.java new file mode 100644 index 00000000..bf61913c --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpFilterUtils.java @@ -0,0 +1,120 @@ +package 
org.apache.spark.sql; + +import static io.prestosql.spi.relation.SpecialForm.Form.IN; +import static io.prestosql.spi.relation.SpecialForm.Form.IS_NULL; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; + +import io.prestosql.spi.connector.QualifiedObjectName; +import io.prestosql.spi.function.BuiltInFunctionHandle; +import io.prestosql.spi.function.FunctionKind; +import io.prestosql.spi.function.Signature; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.InputReferenceExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.relation.SpecialForm; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignature; + +import org.apache.spark.sql.catalyst.expressions.Expression; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +/** + * NdpFilterUtils + * + * @since 2021-06-30 + */ +public class NdpFilterUtils { + + public static int getFilterProjectionId(Expression expression, Map fieldMap) { + String filterColumnName = expression.toString().split("#")[0]; + if (fieldMap.containsKey(filterColumnName)) { + return fieldMap.get(filterColumnName); + } else { + return fieldMap.size(); + } + } + + public static RowExpression generateRowExpression( + String signatureName, PrestoExpressionInfo expressionInfo, + Type prestoType, int filterProjectionId, + List argumentValues, + List multiArguments, String operatorName) { + RowExpression rowExpression; + List rowArguments; + Signature signature = new Signature( + QualifiedObjectName.valueOfDefaultFunction("$operator$" + + signatureName.toLowerCase(Locale.ENGLISH)), + FunctionKind.SCALAR, new TypeSignature("boolean"), + new TypeSignature(prestoType.toString()), new TypeSignature(prestoType.toString())); + switch (operatorName.toLowerCase(Locale.ENGLISH)) { + case "is_null": + List notnullArguments = new ArrayList<>(); + notnullArguments.add(new InputReferenceExpression(filterProjectionId, prestoType)); + rowExpression = new SpecialForm(IS_NULL, BOOLEAN, notnullArguments); + break; + case "in": + rowArguments = getConstantArguments(prestoType, argumentValues, filterProjectionId); + rowExpression = new SpecialForm(IN, BOOLEAN, rowArguments); + break; + case "multy_columns": + Signature signatureMulti = new Signature( + QualifiedObjectName.valueOfDefaultFunction("$operator$" + + signatureName.toLowerCase(Locale.ENGLISH)), + FunctionKind.SCALAR, new TypeSignature("boolean"), + new TypeSignature(prestoType.toString()), + new TypeSignature(prestoType.toString())); + rowExpression = new CallExpression(signatureName, + new BuiltInFunctionHandle(signatureMulti), BOOLEAN, multiArguments); + break; + case "isempty": + case "isdeviceidlegal": + case "ismessycode": + rowExpression = expressionInfo.getPrestoRowExpression(); + break; + default: + if (expressionInfo.getReturnType() != null) { + rowArguments = getUdfArguments(prestoType, + argumentValues, expressionInfo.getPrestoRowExpression()); + } else { + rowArguments = getConstantArguments(prestoType, + argumentValues, filterProjectionId); + } + rowExpression = new CallExpression(signatureName, + new BuiltInFunctionHandle(signature), BOOLEAN, rowArguments); + break; + } + return rowExpression; + } + + public static List getConstantArguments(Type typeStr, + List argumentValues, + int columnId) { + List arguments = new ArrayList<>(); + arguments.add(new InputReferenceExpression(columnId, typeStr)); + if (null != argumentValues && argumentValues.size() > 0) { + for (Object 
argumentValue : argumentValues) { + arguments.add(NdpUtils + .transArgumentData(argumentValue.toString(), typeStr)); + } + } + return arguments; + } + + public static List getUdfArguments(Type typeStr, List argumentValues, + RowExpression callExpression) { + List arguments = new ArrayList<>(); + arguments.add(callExpression); + if (null != argumentValues && argumentValues.size() > 0) { + for (Object argumentValue : argumentValues) { + arguments.add(NdpUtils + .transArgumentData(argumentValue.toString(), typeStr)); + } + } + return arguments; + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfEnum.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfEnum.java new file mode 100644 index 00000000..ebaab92a --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfEnum.java @@ -0,0 +1,45 @@ +package org.apache.spark.sql; + +/** + * udf enum + * + * @since 2021-06-26 + */ +public enum NdpUdfEnum { + // Supported push-down Udf + SUBSTRING("substr","substr"), + LENGTH("length","length"), + UPPER("upper","upper"), + LOWER("lower","lower"), + CAST("cast","$operator$cast"), + REPLACE("replace","replace"), + INSTR("instr","instr"), + SUBSCRIPT("SUBSCRIPT","$operator$subscript"), + SPLIT("split","split"), + STRINGINSTR("instr","instr"); + + private String signatureName; + private String operatorName; + + NdpUdfEnum(String signatureName, String operatorName) { + this.signatureName = signatureName; + this.operatorName = operatorName; + } + + public String getSignatureName() { + return signatureName; + } + + public void setSignatureName(String signatureName) { + this.signatureName = signatureName; + } + + public String getOperatorName() { + return operatorName; + } + + public void setOperatorName(String operatorName) { + this.operatorName = operatorName; + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfExpressions.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfExpressions.java new file mode 100644 index 00000000..d20a6caa --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUdfExpressions.java @@ -0,0 +1,292 @@ +package org.apache.spark.sql; + +import io.prestosql.spi.connector.QualifiedObjectName; +import io.prestosql.spi.function.BuiltInFunctionHandle; +import io.prestosql.spi.function.FunctionKind; +import io.prestosql.spi.function.Signature; +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.InputReferenceExpression; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.type.Type; +import io.prestosql.spi.type.TypeSignature; +import scala.collection.JavaConverters; + +import org.apache.spark.sql.catalyst.expressions.AttributeReference; +import org.apache.spark.sql.catalyst.expressions.Cast; +import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.catalyst.expressions.GetArrayItem; +import org.apache.spark.sql.catalyst.expressions.Length; +import org.apache.spark.sql.catalyst.expressions.Literal; +import org.apache.spark.sql.catalyst.expressions.Lower; +import org.apache.spark.sql.catalyst.expressions.StringInstr; +import org.apache.spark.sql.catalyst.expressions.StringReplace; +import org.apache.spark.sql.catalyst.expressions.StringSplit; +import org.apache.spark.sql.catalyst.expressions.Substring; +import 
org.apache.spark.sql.catalyst.expressions.Upper; +import org.apache.spark.sql.hive.HiveSimpleUDF; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +/** + * Used to process Spark`s UDF, which is converted to presto. + * + * @since 2021-06-24 + */ +public class NdpUdfExpressions { + + /** + * + * @param childExpression + * @param prestoExpressionInfo + * @param fieldMap + * @param childType + * @param rowArguments + */ + public void checkAttributeReference(Expression childExpression, + PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap, Type childType, List rowArguments) { + if ((childExpression instanceof AttributeReference)) { + int lengthProjectId = NdpFilterUtils.getFilterProjectionId(childExpression, fieldMap); + rowArguments.add(new InputReferenceExpression(lengthProjectId, childType)); + prestoExpressionInfo.setProjectionId(lengthProjectId); + prestoExpressionInfo.setFieldDataType( + NdpUtils.transOlkDataType(childExpression.dataType(), false)); + prestoExpressionInfo.setChildExpression(childExpression); + } else { + createNdpUdf(childExpression, prestoExpressionInfo, fieldMap); + rowArguments.add(prestoExpressionInfo.getPrestoRowExpression()); + } + } + + public void createNdpUdf(Expression udfExpression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + if (udfExpression instanceof Length) { + createNdpLength((Length) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof Upper) { + createNdpUpper((Upper) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof Lower) { + createNdpLower((Lower) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof Cast) { + createNdpCast((Cast) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof Substring) { + createNdpSubstring((Substring) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof StringReplace) { + createNdpReplace((StringReplace) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof StringInstr) { + createNdpInstr((StringInstr) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof StringSplit) { + createNdpSplit((StringSplit) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof GetArrayItem) { + createNdpSubscript((GetArrayItem) udfExpression, prestoExpressionInfo, fieldMap); + } else if (udfExpression instanceof org.apache.spark.sql.hive.HiveSimpleUDF) { + createHiveSimpleUdf((HiveSimpleUDF) udfExpression, prestoExpressionInfo, fieldMap); + } else { + throw new RuntimeException("unsupported this UDF:" + udfExpression.toString()); + } + } + + /** + * Used to create UDF with only a single parameter + */ + public void createNdpSingleParameter(NdpUdfEnum udfEnum, + Expression expression, Expression childExpression, + PrestoExpressionInfo prestoExpressionInfo, Map fieldMap) { + String signatureName = udfEnum.getSignatureName(); + Type childType = NdpUtils.transOlkDataType(childExpression.dataType(), true); + Type returnType = NdpUtils.transOlkDataType(expression.dataType(), true); + List rowArguments = new ArrayList<>(); + checkAttributeReference(childExpression, + prestoExpressionInfo, fieldMap, childType, rowArguments); + Signature signature = new Signature( + QualifiedObjectName.valueOfDefaultFunction(udfEnum.getOperatorName()), + FunctionKind.SCALAR, new TypeSignature( + returnType.toString()), new 
TypeSignature(childType.toString())); + RowExpression resExpression = new CallExpression( + signatureName, new BuiltInFunctionHandle(signature), + returnType, rowArguments); + prestoExpressionInfo.setReturnType(returnType); + prestoExpressionInfo.setPrestoRowExpression(resExpression); + } + + public void createNdpLength(Length expression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + createNdpSingleParameter(NdpUdfEnum.LENGTH, + expression, expression.child(), prestoExpressionInfo, fieldMap); + } + + public void createNdpUpper(Upper expression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + createNdpSingleParameter(NdpUdfEnum.UPPER, + expression, expression.child(), prestoExpressionInfo, fieldMap); + } + + public void createNdpLower(Lower expression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + createNdpSingleParameter(NdpUdfEnum.LOWER, + expression, expression.child(), prestoExpressionInfo, fieldMap); + } + + public void createNdpCast(Cast expression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + createNdpSingleParameter(NdpUdfEnum.CAST, + expression, expression.child(), prestoExpressionInfo, fieldMap); + } + + public void createHiveSimpleUdf(HiveSimpleUDF hiveSimpleUDFExpression, + PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + String signatureName = hiveSimpleUDFExpression.name(); + List hiveSimpleUdf = JavaConverters.seqAsJavaList( + hiveSimpleUDFExpression.children()); + Type returnType = NdpUtils.transOlkDataType( + hiveSimpleUDFExpression.dataType(), true); + List rowArguments = new ArrayList<>(); + Type strTypeCandidate = returnType; + for (Expression hiveUdf : hiveSimpleUdf) { + strTypeCandidate = NdpUtils.transOlkDataType(hiveUdf.dataType(), true); + checkAttributeReference(hiveUdf, prestoExpressionInfo, + fieldMap, strTypeCandidate, rowArguments); + } + Signature signature = new Signature( + QualifiedObjectName.valueOfDefaultFunction(signatureName), + FunctionKind.SCALAR, new TypeSignature(returnType.toString()), + new TypeSignature(strTypeCandidate.toString())); + RowExpression resExpression = new CallExpression(signatureName.toLowerCase(Locale.ENGLISH), + new BuiltInFunctionHandle(signature), returnType, rowArguments); + prestoExpressionInfo.setReturnType(returnType); + prestoExpressionInfo.setPrestoRowExpression(resExpression); + } + + public void createNdpSubstring(Substring expression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + String signatureName = NdpUdfEnum.SUBSTRING.getSignatureName(); + Type strType = NdpUtils.transOlkDataType(expression.str().dataType(), true); + Type lenType = NdpUtils.transOlkDataType(expression.len().dataType(), true); + Type posType = NdpUtils.transOlkDataType(expression.pos().dataType(), true); + Type returnType = NdpUtils.transOlkDataType(expression.dataType(), true); + List rowArguments = new ArrayList<>(); + checkAttributeReference(expression.str(), + prestoExpressionInfo, fieldMap, strType, rowArguments); + rowArguments.add(NdpUtils.transArgumentData( + expression.pos().toString(), posType)); + rowArguments.add(NdpUtils.transArgumentData( + expression.len().toString(), lenType)); + Signature signature = new Signature( + QualifiedObjectName.valueOfDefaultFunction( + NdpUdfEnum.SUBSTRING.getOperatorName()), FunctionKind.SCALAR, + new TypeSignature(returnType.toString()), new TypeSignature(strType.toString()), + new TypeSignature(posType.toString()), new TypeSignature(lenType.toString())); + RowExpression resExpression = new CallExpression( + 
signatureName, new BuiltInFunctionHandle(signature), + returnType, rowArguments); + prestoExpressionInfo.setPrestoRowExpression(resExpression); + prestoExpressionInfo.setReturnType(returnType); + } + + public void createNdpReplace(StringReplace expression, + PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + String signatureName = NdpUdfEnum.REPLACE.getSignatureName(); + Type srcType = NdpUtils.transOlkDataType(expression.srcExpr().dataType(), true); + Type searchType = NdpUtils.transOlkDataType( + expression.searchExpr().dataType(), true); + Type replaceType = NdpUtils.transOlkDataType( + expression.replaceExpr().dataType(), true); + Type returnType = NdpUtils.transOlkDataType(expression.dataType(), true); + List rowArguments = new ArrayList<>(); + checkAttributeReference(expression.srcExpr(), + prestoExpressionInfo, fieldMap, srcType, rowArguments); + rowArguments.add(NdpUtils.transArgumentData( + expression.searchExpr().toString(), searchType)); + rowArguments.add(NdpUtils.transArgumentData( + expression.replaceExpr().toString(), replaceType)); + Signature signature = new Signature( + QualifiedObjectName.valueOfDefaultFunction( + NdpUdfEnum.REPLACE.getOperatorName()), FunctionKind.SCALAR, + new TypeSignature(returnType.toString()), new TypeSignature(srcType.toString()), + new TypeSignature(searchType.toString()), new TypeSignature(replaceType.toString())); + RowExpression resExpression = new CallExpression( + signatureName, new BuiltInFunctionHandle(signature), + returnType, rowArguments); + prestoExpressionInfo.setReturnType(returnType); + prestoExpressionInfo.setPrestoRowExpression(resExpression); + } + + public void createNdpInstr(StringInstr expression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + String signatureName = NdpUdfEnum.INSTR.getSignatureName(); + Type strType = NdpUtils.transOlkDataType(expression.str().dataType(), true); + Type substrType = NdpUtils.transOlkDataType(expression.substr().dataType(), true); + Type returnType = NdpUtils.transOlkDataType(expression.dataType(), true); + List rowArguments = new ArrayList<>(); + checkAttributeReference(expression.str(), + prestoExpressionInfo, fieldMap, strType, rowArguments); + rowArguments.add(NdpUtils.transArgumentData( + expression.substr().toString(), substrType)); + Signature signature = new Signature( + QualifiedObjectName.valueOfDefaultFunction( + NdpUdfEnum.INSTR.getOperatorName()), FunctionKind.SCALAR, + new TypeSignature(returnType.toString()), new TypeSignature(strType.toString()), + new TypeSignature(substrType.toString())); + RowExpression resExpression = new CallExpression( + signatureName, new BuiltInFunctionHandle(signature), + returnType, rowArguments); + prestoExpressionInfo.setReturnType(returnType); + prestoExpressionInfo.setPrestoRowExpression(resExpression); + } + + public void createNdpSplit(StringSplit expression, PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + String signatureName = NdpUdfEnum.SPLIT.getSignatureName(); + Type strType = NdpUtils.transOlkDataType(expression.str().dataType(), true); + Type regexType = NdpUtils.transOlkDataType(expression.regex().dataType(), true); + Type returnType = NdpUtils.transOlkDataType(expression.dataType(), true); + List rowArguments = new ArrayList<>(); + checkAttributeReference(expression.str(), + prestoExpressionInfo, fieldMap, strType, rowArguments); + rowArguments.add(NdpUtils.transArgumentData( + expression.regex().toString(), regexType)); + Signature signature = new Signature( + 
QualifiedObjectName.valueOfDefaultFunction( + NdpUdfEnum.SPLIT.getOperatorName()), FunctionKind.SCALAR, + new TypeSignature(returnType.toString()), new TypeSignature(strType.toString()), + new TypeSignature(regexType.toString())); + RowExpression resExpression = new CallExpression( + signatureName, new BuiltInFunctionHandle(signature), + returnType, rowArguments); + prestoExpressionInfo.setReturnType(returnType); + prestoExpressionInfo.setPrestoRowExpression(resExpression); + } + + public void createNdpSubscript(GetArrayItem expression, + PrestoExpressionInfo prestoExpressionInfo, + Map fieldMap) { + String signatureName = NdpUdfEnum.SUBSCRIPT.getSignatureName(); + Type strType = NdpUtils.transOlkDataType(expression.child().dataType(), true); + Type ordinalType = NdpUtils.transOlkDataType( + expression.ordinal().dataType(), true); + Type returnType = NdpUtils.transOlkDataType(expression.dataType(), true); + List rowArguments = new ArrayList<>(); + checkAttributeReference(expression.child(), + prestoExpressionInfo, fieldMap, strType, rowArguments); + // The presto`s array subscript is initially 1. + int argumentValue = Integer.parseInt( + ((Literal) expression.ordinal()).value().toString()) + 1; + rowArguments.add(NdpUtils.transArgumentData( + Integer.toString(argumentValue), ordinalType)); + Signature signature = new Signature( + QualifiedObjectName.valueOfDefaultFunction( + NdpUdfEnum.SUBSCRIPT.getOperatorName()), FunctionKind.SCALAR, + new TypeSignature(returnType.toString()), new TypeSignature(strType.toString()), + new TypeSignature(ordinalType.toString())); + RowExpression resExpression = new CallExpression( + signatureName, new BuiltInFunctionHandle(signature), + returnType, rowArguments); + prestoExpressionInfo.setReturnType(returnType); + prestoExpressionInfo.setPrestoRowExpression(resExpression); + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUtils.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUtils.java new file mode 100644 index 00000000..ce8a4946 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/NdpUtils.java @@ -0,0 +1,361 @@ +package org.apache.spark.sql; + +import com.huawei.boostkit.omnidata.type.*; + +import io.airlift.slice.Slice; +import io.prestosql.spi.relation.ConstantExpression; +import io.prestosql.spi.type.ArrayType; +import io.prestosql.spi.type.Type; + +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.catalyst.expressions.Cast; +import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.catalyst.expressions.NamedExpression; +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction; +import org.apache.spark.sql.execution.ndp.AggExeInfo; +import org.apache.spark.sql.execution.ndp.LimitExeInfo; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DateType; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import scala.Option; +import scala.collection.JavaConverters; +import scala.collection.Seq; + +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.time.format.ResolverStyle; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.OptionalLong; +import java.util.Properties; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + 
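+/*
+ * Quick reference for the NdpUtils helpers defined below (the paths, column names and
+ * literal values in these examples are made up for illustration):
+ *   getColumnId("c_name#24")              -> 24 (a trailing 'L' on the id is stripped first)
+ *   transOlkDataType(IntegerType, false)  -> INTEGER; with isUdfOperator = true it widens to BIGINT
+ *   getPartitionValue("/warehouse/tbl/dt=2021-06-30/part-0.orc", "dt") -> "2021-06-30",
+ *       while the __HIVE_DEFAULT_PARTITION__ marker is returned as null
+ */
+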
+import static io.airlift.slice.Slices.utf8Slice; +import static io.prestosql.spi.type.BigintType.BIGINT; +import static io.prestosql.spi.type.BooleanType.BOOLEAN; +import static io.prestosql.spi.type.DateType.DATE; +import static io.prestosql.spi.type.DoubleType.DOUBLE; +import static io.prestosql.spi.type.IntegerType.INTEGER; +import static io.prestosql.spi.type.RealType.REAL; +import static io.prestosql.spi.type.SmallintType.SMALLINT; +import static io.prestosql.spi.type.TinyintType.TINYINT; +import static io.prestosql.spi.type.VarcharType.VARCHAR; +import static java.lang.Float.floatToIntBits; +import static java.lang.Float.parseFloat; + +/** + * NdpUtils + * + * @since 2021-03-30 + */ +public class NdpUtils { + + public static int getColumnOffset(StructType dataSchema, Seq outPut) { + List attributeList = JavaConverters.seqAsJavaList(outPut); + String columnName = ""; + int columnTempId = 0; + if (attributeList.size() > 0) { + columnName = attributeList.get(0).name(); + columnTempId = NdpUtils.getColumnId(attributeList.get(0).toString()); + } + Map columnMap = new HashMap<>(); + scala.collection.Iterator allTableSchemas = dataSchema.iterator(); + int dataSchemaColumnNum = 0; + while (allTableSchemas.hasNext()) { + StructField structField = allTableSchemas.next(); + columnMap.put(structField.name(), dataSchemaColumnNum++); + } + int columnOffset = columnTempId - columnMap.getOrDefault(columnName, columnMap.size()); + return Math.abs(columnOffset); + } + + public static int getColumnOffsetByAggExeInfo(StructType dataSchema, + Seq aggExeInfo) { + String columnName = ""; + int columnTempId = 0; + if (aggExeInfo != null && aggExeInfo.size() > 0) { + List aggExecutionList = JavaConverters.seqAsJavaList(aggExeInfo); + for (AggExeInfo aggExeInfoTemp : aggExecutionList) { + List aggregateExpressions = JavaConverters.seqAsJavaList( + aggExeInfoTemp.aggregateExpressions()); + for (AggregateFunction aggregateFunction : aggregateExpressions) { + List expressions = JavaConverters + .seqAsJavaList(aggregateFunction.children()); + for (Expression expression : expressions) { + columnName = expression.toString().split("#")[0]; + columnTempId = NdpUtils.getColumnId(expression.toString()); + break; + } + break; + } + List namedExpressions = JavaConverters.seqAsJavaList( + aggExeInfoTemp.groupingExpressions()); + for (NamedExpression namedExpression : namedExpressions) { + columnName = namedExpression.toString().split("#")[0]; + columnTempId = NdpUtils.getColumnId(namedExpression.toString()); + break; + } + } + } + Map columnMap = new HashMap<>(); + scala.collection.Iterator allTableSchemas = dataSchema.iterator(); + int dataSchemaColumnNum = 0; + while (allTableSchemas.hasNext()) { + StructField structField = allTableSchemas.next(); + columnMap.put(structField.name(), dataSchemaColumnNum++); + } + int columnOffset = columnTempId - columnMap.getOrDefault(columnName, columnMap.size()); + return Math.abs(columnOffset); + } + + public static int getColumnId(String attribute) { + if (null == attribute) { + return -1; + } + int columnTempId = 0; + String[] columnArray = attribute.split("#"); + if (columnArray.length < 2) { + return -1; + } + String columnArrayId = columnArray[1]; + if ('L' == columnArrayId.charAt(columnArrayId.length() - 1)) { + String adf = columnArrayId.substring(0, columnArrayId.length() - 1); + columnTempId = Integer.parseInt(adf); + } else { + columnTempId = Integer.parseInt(columnArrayId); + } + return columnTempId; + } + + public static Type transOlkDataType(DataType dataType, boolean 
isUdfOperator) { + String strType = dataType.toString().toLowerCase(Locale.ENGLISH); + if (isUdfOperator && "integertype".equalsIgnoreCase(strType)) { + strType = "longtype"; + } + switch (strType) { + case "longtype": + return BIGINT; + case "integertype": + return INTEGER; + case "bytetype": + return TINYINT; + case "shorttype": + return SMALLINT; + case "floattype": + return REAL; + case "doubletype": + return DOUBLE; + case "booleantype": + return BOOLEAN; + case "stringtype": + return VARCHAR; + case "datetype": + return DATE; + case "arraytype(stringtype,true)": + case "arraytype(stringtype,false)": + return new ArrayType<>(VARCHAR); + case "arraytype(integertype,true)": + case "arraytype(integertype,false)": + case "arraytype(longtype,true)": + case "arraytype(longtype,false)": + return new ArrayType<>(BIGINT); + case "arraytype(floattype,true)": + case "arraytype(floattype,false)": + return new ArrayType<>(REAL); + case "arraytype(doubletype,true)": + case "arraytype(doubletype,false)": + return new ArrayType<>(DOUBLE); + default: + throw new UnsupportedOperationException("unsupported this type:" + strType); + } + } + + public static Type transAggRetType(Type prestoType) { + if (BIGINT.equals(prestoType) || INTEGER.equals(prestoType) || + SMALLINT.equals(prestoType) || TINYINT.equals(prestoType) || REAL.equals(prestoType)) { + return BIGINT; + } else { + return prestoType; + } + } + + public static DecodeType transAggDecodeType(Type prestoType) { + if (BIGINT.equals(prestoType)) { + return new LongDecodeType(); + } + if (INTEGER.equals(prestoType)) { + return new LongToIntDecodeType(); + } + if (SMALLINT.equals(prestoType)) { + return new LongToShortDecodeType(); + } + if (TINYINT.equals(prestoType)) { + return new LongToByteDecodeType(); + } + if (DOUBLE.equals(prestoType)) { + return new DoubleDecodeType(); + } + if (REAL.equals(prestoType)) { + return new LongToFloatDecodeType(); + } + if (BOOLEAN.equals(prestoType)) { + return new BooleanDecodeType(); + } + if (VARCHAR.equals(prestoType)) { + return new VarcharDecodeType(); + } + if (DATE.equals(prestoType)) { + return new DateDecodeType(); + } + throw new RuntimeException("unsupported this prestoType:" + prestoType); + } + + public static DecodeType transDataIoDataType(DataType dataType) { + String strType = dataType.toString().toLowerCase(Locale.ENGLISH); + switch (strType) { + case "integertype": + return new IntDecodeType(); + case "shorttype": + return new ShortDecodeType(); + case "longtype": + return new LongDecodeType(); + case "floattype": + return new FloatDecodeType(); + case "doubletype": + return new DoubleDecodeType(); + case "booleantype": + return new BooleanDecodeType(); + case "bytetype": + return new ByteDecodeType(); + case "stringtype": + return new VarcharDecodeType(); + case "datetype": + return new DateDecodeType(); + default: + throw new RuntimeException("unsupported this type:" + strType); + } + } + + public static ConstantExpression transArgumentData(String argumentValue, Type argumentType) { + String strType = argumentType.toString().toLowerCase(Locale.ENGLISH); + switch (strType) { + case "bigint": + case "integer": + case "date": + case "tinyint": + case "smallint": + long longValue = Long.parseLong(argumentValue); + return new ConstantExpression(longValue, argumentType); + case "real": + return new ConstantExpression( + (long)floatToIntBits(parseFloat(argumentValue)), argumentType); + case "double": + return new ConstantExpression(Double.valueOf(argumentValue), argumentType); + case "boolean": + 
return new ConstantExpression(Boolean.valueOf(argumentValue), argumentType); + case "varchar": + Slice charValue = utf8Slice(argumentValue); + return new ConstantExpression(charValue, argumentType); + default: + throw new UnsupportedOperationException("unsupported data type " + strType); + } + } + + public static Attribute getColumnAttribute(Attribute inputAttribute, List listAtt) { + String columnName = inputAttribute.name(); + Attribute resAttribute = inputAttribute; + if (columnName.contains("(")) { + for (Attribute att : listAtt) { + if (columnName.contains(att.name())) { + resAttribute = att; + break; + } + } + } + return resAttribute; + } + + public static Object transData(String sparkType, String columnValue) { + String strType = sparkType.toLowerCase(Locale.ENGLISH); + switch (strType) { + case "integertype": + return Integer.valueOf(columnValue); + case "bytetype": + return Byte.valueOf(columnValue); + case "shorttype": + return Short.valueOf(columnValue); + case "longtype": + return Long.valueOf(columnValue); + case "floattype": + return (long)floatToIntBits(parseFloat(columnValue)); + case "doubletype": + return Double.valueOf(columnValue); + case "booleantype": + return Boolean.valueOf(columnValue); + case "stringtype": + case "datetype": + return columnValue; + default: + return ""; + } + } + + public static OptionalLong convertLimitExeInfo(Option limitExeInfo) { + return limitExeInfo.isEmpty() ? OptionalLong.empty() + : OptionalLong.of(limitExeInfo.get().limit()); + } + + public static String getPartitionValue(String filePath, String columnName) { + String[] filePathStrArray = filePath.split("\\/"); + String partitionValue = ""; + Pattern pn = Pattern.compile(columnName + "\\="); + for (String strColumn : filePathStrArray) { + Matcher matcher = pn.matcher(strColumn); + if (matcher.find()) { + partitionValue = strColumn.split("\\=")[1]; + if (partitionValue.contains("__HIVE_DEFAULT_PARTITION__")) { + partitionValue = null; + } + break; + } + } + return partitionValue; + } + + public static Properties getProperties(OmniDataProperties omniDataProperties) { + Properties omniProperties = new Properties(); + omniProperties.put("grpc.ssl.enabled", omniDataProperties.isGrpcSslEnabled()); + omniProperties.put("grpc.client.cert.file.path", omniDataProperties.getGrpcCertPath()); + omniProperties.put("grpc.client.private.key.file.path", + omniDataProperties.getGrpcKeyPath()); + omniProperties.put("grpc.trust.ca.file.path", omniDataProperties.getGrpcCaPath()); + omniProperties.put("pki.dir", omniDataProperties.getPkiDir()); + return omniProperties; + } + + public static boolean isValidDateFormat(String dateString) { + boolean isValid = true; + String pattern = "yyyy-MM-dd"; + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern).withResolverStyle(ResolverStyle.STRICT); + try { + formatter.parse(dateString); + } catch (DateTimeParseException e) { + isValid = false; + } + return isValid; + } + + public static boolean isInDateExpression(Expression expression, String Operator) { + boolean isInDate = false; + if (expression instanceof Cast && Operator.equals("in")) { + isInDate = ((Cast) expression).child().dataType() instanceof DateType; + } + return isInDate; + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/OmniDataProperties.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/OmniDataProperties.java new file mode 100644 index 00000000..91340389 --- /dev/null +++ 
b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/OmniDataProperties.java @@ -0,0 +1,49 @@ +package org.apache.spark.sql; + +/** + * OmniDataProperties + * + * @date 2021/8/6 16:15 + */ +public class OmniDataProperties { + + private String grpcSslEnabled; + + private String grpcCertPath; + + private String grpcKeyPath; + + private String grpcCaPath; + + private String pkiDir; + + public OmniDataProperties(String grpcSslEnabled, String grpcCertPath, + String grpcKeyPath, String grpcCaPath, String pkiDir) { + this.grpcSslEnabled = grpcSslEnabled; + this.grpcCertPath = grpcCertPath; + this.grpcKeyPath = grpcKeyPath; + this.grpcCaPath = grpcCaPath; + this.pkiDir = pkiDir; + } + + public String isGrpcSslEnabled() { + return grpcSslEnabled; + } + + public String getGrpcCertPath() { + return grpcCertPath; + } + + public String getGrpcKeyPath() { + return grpcKeyPath; + } + + public String getGrpcCaPath() { + return grpcCaPath; + } + + public String getPkiDir() { + return pkiDir; + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageCandidate.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageCandidate.java new file mode 100644 index 00000000..fb0a296e --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageCandidate.java @@ -0,0 +1,62 @@ +package org.apache.spark.sql; + +/** + * 构造Page传输参数 + * @date 2021/5/18 15:25 + */ +public class PageCandidate { + + public String filePath; + + public Long startPos; + + public Long splitLen; + + public int columnOffset; + + public String sdiHosts; + + private String fileFormat; + + private String sdiPort; + + public PageCandidate(String filePath, Long startPos, Long splitLen, int columnOffset, + String sdiHosts, String fileFormat, String sdiPort) { + this.filePath = filePath; + this.startPos = startPos; + this.splitLen = splitLen; + this.columnOffset = columnOffset; + this.sdiHosts = sdiHosts; + this.fileFormat = fileFormat; + this.sdiPort = sdiPort; + } + + public Long getStartPos() { + return startPos; + } + + public Long getSplitLen() { + return splitLen; + } + + public String getFilePath() { + return filePath; + } + + public int getColumnOffset() { + return columnOffset; + } + + public String getSdiHosts() { + return sdiHosts; + } + + public String getFileFormat() { + return fileFormat; + } + + public String getSdiPort() { + return sdiPort; + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageToColumnar.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageToColumnar.java new file mode 100644 index 00000000..ff895e05 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PageToColumnar.java @@ -0,0 +1,67 @@ +package org.apache.spark.sql; + +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import scala.collection.Seq; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * PageToColumnar + * + * @since 2021-03-30 + */ +public class PageToColumnar 
implements Serializable {
+    StructType structType = null;
+    Seq<Attribute> outPut = null;
+    public PageToColumnar(StructType structType, Seq<Attribute> outPut) {
+        this.structType = structType;
+        this.outPut = outPut;
+    }
+
+    public List<Object> transPageToColumnar(Iterator<WritableColumnVector[]> writableColumnVectors,
+            boolean isVectorizedReader) {
+        scala.collection.Iterator<StructField> structFieldIterator = structType.iterator();
+        List<DataType> columnType = new ArrayList<>();
+
+        while (structFieldIterator.hasNext()) {
+            columnType.add(structFieldIterator.next().dataType());
+        }
+        List<Object> internalRowList = new ArrayList<>();
+        while (writableColumnVectors.hasNext()) {
+            WritableColumnVector[] columnVector = writableColumnVectors.next();
+            if (columnVector == null) {
+                continue;
+            }
+            int positionCount = columnVector[0].getElementsAppended();
+            if (positionCount > 0) {
+                if (isVectorizedReader) {
+                    ColumnarBatch columnarBatch = new ColumnarBatch(columnVector);
+                    columnarBatch.setNumRows(positionCount);
+                    internalRowList.add(columnarBatch);
+                } else {
+                    for (int j = 0; j < positionCount; j++) {
+                        MutableColumnarRow mutableColumnarRow =
+                                new MutableColumnarRow(columnVector);
+                        mutableColumnarRow.rowId = j;
+                        internalRowList.add(mutableColumnarRow);
+                    }
+                }
+            }
+        }
+        return internalRowList;
+    }
+}
+
+
+
+
+
diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PrestoExpressionInfo.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PrestoExpressionInfo.java
new file mode 100644
index 00000000..cc52f834
--- /dev/null
+++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PrestoExpressionInfo.java
@@ -0,0 +1,74 @@
+package org.apache.spark.sql;
+
+import io.prestosql.spi.relation.RowExpression;
+import io.prestosql.spi.type.Type;
+
+import org.apache.spark.sql.catalyst.expressions.Expression;
+
+/**
+ * PrestoExpressionInfo
+ *
+ * @date 2021/6/18 15:49
+ */
+public class PrestoExpressionInfo {
+
+    private int projectionId;
+
+    private Type returnType;
+
+    /**
+     * Data type of the field referenced in the child expression.
+     */
+    private Type fieldDataType;
+
+    /**
+     * Child expression that carries the field information, for example: c_name#24.
+     */
+    private Expression childExpression;
+
+    private RowExpression prestoRowExpression;
+
+    public PrestoExpressionInfo() {
+    }
+
+    public int getProjectionId() {
+        return projectionId;
+    }
+
+    public void setProjectionId(int projectionId) {
+        this.projectionId = projectionId;
+    }
+
+    public RowExpression getPrestoRowExpression() {
+        return prestoRowExpression;
+    }
+
+    public void setPrestoRowExpression(RowExpression prestoRowExpression) {
+        this.prestoRowExpression = prestoRowExpression;
+    }
+
+    public Type getReturnType() {
+        return returnType;
+    }
+
+    public void setReturnType(Type returnType) {
+        this.returnType = returnType;
+    }
+
+    public Type getFieldDataType() {
+        return fieldDataType;
+    }
+
+    public void setFieldDataType(Type fieldDataType) {
+        this.fieldDataType = fieldDataType;
+    }
+
+    public Expression getChildExpression() {
+        return childExpression;
+    }
+
+    public void setChildExpression(Expression childExpression) {
+        this.childExpression = childExpression;
+    }
+}
+
diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownData.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownData.java
new file mode 100644
index 00000000..b9dbe297
--- /dev/null
+++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownData.java
@@ -0,0 +1,55 @@
+package org.apache.spark.sql;
+
+/**
+ * PushDownData
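+ *
+ * <p>Each OmniData server publishes its status as a JSON document on a ZooKeeper child
+ * node, and Jackson maps that document onto this class (see PushDownManager). An
+ * illustrative payload, with made-up values, might look like:
+ * <pre>{@code
+ * {"datanodeHost":"agent1","version":"1.0.0","threshold":0.5,"runningTasks":3,"maxTasks":10}
+ * }</pre>
+ *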
* @date 2021/3/23 20:30 + */ +public class PushDownData { + private String datanodeHost; + private String version; + private double threshold; + private int runningTasks; + private int maxTasks; + + public String getDatanodeHost() { + return datanodeHost; + } + + public void setDatanodeHost(String datanodeHost) { + this.datanodeHost = datanodeHost; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public double getThreshold() { + return threshold; + } + + public void setThreshold(double threshold) { + this.threshold = threshold; + } + + public int getRunningTasks() { + return runningTasks; + } + + public void setRunningTasks(int runningTasks) { + this.runningTasks = runningTasks; + } + + public int getMaxTasks() { + return maxTasks; + } + + public void setMaxTasks(int maxTasks) { + this.maxTasks = maxTasks; + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownManager.java b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownManager.java new file mode 100644 index 00000000..b54d732b --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/java/org/apache/spark/sql/PushDownManager.java @@ -0,0 +1,112 @@ +package org.apache.spark.sql; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import org.apache.curator.framework.CuratorFramework; +import org.apache.curator.framework.CuratorFrameworkFactory; +import org.apache.curator.framework.recipes.locks.InterProcessMutex; +import org.apache.curator.retry.RetryForever; +import org.apache.zookeeper.KeeperException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import scala.collection.JavaConverters; +import scala.collection.Map$; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * PushDownManager Operate zookeeper data. 
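+ * <p>A minimal usage sketch (the timeout, status path and ZooKeeper address below are
+ * placeholder values, not defaults defined by this class):
+ * <pre>{@code
+ * PushDownManager manager = new PushDownManager();
+ * // getZookeeperData declares "throws Exception", so handle or propagate it.
+ * scala.collection.Map hostMap =
+ *     manager.getZookeeperData(15000, "/sdi/status", "zk1:2181,zk2:2181", 3);
+ * // An empty map means push-down should be skipped, e.g. because no OmniData server
+ * // is registered or the servers are close to their running-task limit.
+ * }</pre>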
+ * + * @date 2021/3/13 20:26 + */ +public class PushDownManager { + private static final Logger LOG = LoggerFactory.getLogger(PushDownManager.class); + + private int aliveOmniDataServerNum = 0; + + private static final double TASK_THRESHOLD = 0.8; + + private static final int ZOOKEEPER_RETRY_INTERVAL_MS = 1000; + + private CuratorFramework zkClient; + + public scala.collection.Map getZookeeperData( + int timeOut, String parentPath, String zkAddress, + int aliveOmniDataServerNum) throws Exception { + this.aliveOmniDataServerNum = aliveOmniDataServerNum; + Map fpuMap = new HashMap<>(); + zkClient = CuratorFrameworkFactory.builder() + .connectString(zkAddress) + .sessionTimeoutMs(timeOut) + .connectionTimeoutMs(timeOut) + .retryPolicy(new RetryForever(ZOOKEEPER_RETRY_INTERVAL_MS)) + .build(); + zkClient.start(); + Map pushDownInfoMap = new HashMap<>(); + InterProcessMutex lock = new InterProcessMutex(zkClient, parentPath); + if (lock.acquire(ZOOKEEPER_RETRY_INTERVAL_MS, TimeUnit.MILLISECONDS)) { + try { + List childrenPaths = zkClient.getChildren().forPath(parentPath); + ObjectMapper mapper = new ObjectMapper(); + for (String path : childrenPaths) { + if (!path.contains("-lock-")) { + byte[] data = zkClient.getData().forPath(parentPath + "/" + path); + PushDownData statusInfo = mapper.readValue(data, PushDownData.class); + fpuMap.put(path, statusInfo.getDatanodeHost()); + pushDownInfoMap.put(path, statusInfo); + } + } + if (checkAllPushDown(pushDownInfoMap)) { + return javaMapToScala(fpuMap); + } else { + return javaMapToScala(new HashMap<>()); + } + } catch (InterruptedException | IOException | KeeperException e) { + LOG.error("Fail to connect ZooKeeper"); + } finally { + lock.release(); + zkClient.close(); + } + } return javaMapToScala(new HashMap<>()); + } + + /** + * OmniDataServer: Determine whether push-down is required. + */ + private boolean checkAllPushDown(Map fpuStatusInfoMap) { + if (fpuStatusInfoMap.size() == 0) { + LOG.info("Fail to Push Down, the number of omni-data-server is 0."); + return false; + } + for (Map.Entry fpuStatusInfo : fpuStatusInfoMap.entrySet()) { + if (fpuStatusInfoMap.size() >= aliveOmniDataServerNum - 2 + && checkPushDown(fpuStatusInfo.getValue())) { + return true; + } + } + return false; + } + + private boolean checkPushDown(PushDownData pushDownData) { + int runningTask = pushDownData.getRunningTasks(); + if (runningTask > pushDownData.getMaxTasks() * TASK_THRESHOLD) { + LOG.info("Fail to Push Down, the number of runningTask is {}.", runningTask); + return false; + } + return true; + } + + private static scala.collection.Map javaMapToScala(Map kafkaParams) { + scala.collection.Map scalaMap = JavaConverters.mapAsScalaMap(kafkaParams); + Object objTest = Map$.MODULE$.newBuilder().$plus$plus$eq(scalaMap.toSeq()); + Object resultTest = ((scala.collection.mutable.Builder) objTest).result(); + scala.collection.Map retMap = (scala.collection.Map) resultTest; + return retMap; + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala new file mode 100644 index 00000000..36fb382e --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -0,0 +1,624 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.util.concurrent.TimeUnit._ + +import scala.collection.mutable.HashMap + +import org.apache.commons.lang3.StringUtils +import org.apache.hadoop.fs.Path + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, UnknownPartitioning} +import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.execution.datasources.{FileScanRDDPushDown, _} +import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource} +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.execution.ndp.NdpSupport +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.{BaseRelation, Filter} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.Utils +import org.apache.spark.util.collection.BitSet + +trait DataSourceScanExec extends LeafExecNode { + val relation: BaseRelation + val tableIdentifier: Option[TableIdentifier] + + protected val nodeNamePrefix: String = "" + + override val nodeName: String = { + s"Scan $relation ${tableIdentifier.map(_.unquotedString).getOrElse("")}" + } + + // Metadata that describes more details of this scan. + protected def metadata: Map[String, String] + + override def simpleString(maxFields: Int): String = { + val metadataEntries = metadata.toSeq.sorted.map { + case (key, value) => + key + ": " + StringUtils.abbreviate(redact(value), 100) + } + val metadataStr = truncatedString(metadataEntries, " ", ", ", "", maxFields) + redact( + s"$nodeNamePrefix$nodeName${truncatedString(output, "[", ",", "]", maxFields)}$metadataStr") + } + + override def verboseStringWithOperatorId(): String = { + val metadataStr = metadata.toSeq.sorted.filterNot { + case (_, value) if (value.isEmpty || value.equals("[]")) => true + case (key, _) if (key.equals("DataFilters") || key.equals("Format")) => true + case (_, _) => false + }.map { + case (key, value) => s"$key: ${redact(value)}" + } + + s""" + |$formattedNodeName + |${ExplainUtils.generateFieldString("Output", output)} + |${metadataStr.mkString("\n")} + |""".stripMargin + } + + /** + * Shorthand for calling redactString() without specifying redacting rules + */ + protected def redact(text: String): String = { + Utils.redact(sqlContext.sessionState.conf.stringRedactionPattern, text) + } + + /** + * The data being read in. 
This is to provide input to the tests in a way compatible with + * [[InputRDDCodegen]] which all implementations used to extend. + */ + def inputRDDs(): Seq[RDD[InternalRow]] +} + +/** Physical plan node for scanning data from a relation. */ +case class RowDataSourceScanExec( + fullOutput: Seq[Attribute], + requiredColumnsIndex: Seq[Int], + filters: Set[Filter], + handledFilters: Set[Filter], + rdd: RDD[InternalRow], + @transient relation: BaseRelation, + override val tableIdentifier: Option[TableIdentifier]) + extends DataSourceScanExec with InputRDDCodegen { + + def output: Seq[Attribute] = requiredColumnsIndex.map(fullOutput) + + override lazy val metrics = + Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + + protected override def doExecute(): RDD[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + + rdd.mapPartitionsWithIndexInternal { (index, iter) => + val proj = UnsafeProjection.create(schema) + proj.initialize(index) + iter.map( r => { + numOutputRows += 1 + proj(r) + }) + } + } + + // Input can be InternalRow, has to be turned into UnsafeRows. + override protected val createUnsafeProjection: Boolean = true + + override def inputRDD: RDD[InternalRow] = rdd + + override val metadata: Map[String, String] = { + val markedFilters = for (filter <- filters) yield { + if (handledFilters.contains(filter)) s"*$filter" else s"$filter" + } + Map( + "ReadSchema" -> output.toStructType.catalogString, + "PushedFilters" -> markedFilters.mkString("[", ", ", "]")) + } + + // Don't care about `rdd` and `tableIdentifier` when canonicalizing. + override def doCanonicalize(): SparkPlan = + copy( + fullOutput.map(QueryPlan.normalizeExpressions(_, fullOutput)), + rdd = null, + tableIdentifier = None) +} + +/** + * Physical plan node for scanning data from HadoopFsRelations. + * + * @param relation The file-based relation to scan. + * @param output Output attributes of the scan, including data attributes and partition attributes. + * @param requiredSchema Required schema of the underlying relation, excluding partition columns. + * @param partitionFilters Predicates to use for partition pruning. + * @param optionalBucketSet Bucket ids for bucket pruning + * @param dataFilters Filters on non-partition columns. + * @param tableIdentifier identifier for the table in the metastore. + */ +case class FileSourceScanExec( + @transient relation: HadoopFsRelation, + output: Seq[Attribute], + requiredSchema: StructType, + partitionFilters: Seq[Expression], + optionalBucketSet: Option[BitSet], + dataFilters: Seq[Expression], + override val tableIdentifier: Option[TableIdentifier], + partiTionColumn: Seq[Attribute]) + extends DataSourceScanExec with NdpSupport { + + // Note that some vals referring the file-based relation are lazy intentionally + // so that this plan can be canonicalized on executor side too. See SPARK-23731. 
+ override lazy val supportsColumnar: Boolean = { + relation.fileFormat.supportBatch(relation.sparkSession, schema) + } + + private lazy val needsUnsafeRowConversion: Boolean = { + if (relation.fileFormat.isInstanceOf[ParquetSource]) { + SparkSession.getActiveSession.get.sessionState.conf.parquetVectorizedReaderEnabled + } else { + false + } + } + + override def vectorTypes: Option[Seq[String]] = + relation.fileFormat.vectorTypes( + requiredSchema = output.toStructType, + partitionSchema = new StructType(), + relation.sparkSession.sessionState.conf) + + private lazy val driverMetrics: HashMap[String, Long] = HashMap.empty + + /** + * Send the driver-side metrics. Before calling this function, selectedPartitions has + * been initialized. See SPARK-26327 for more details. + */ + private def sendDriverMetrics(): Unit = { + driverMetrics.foreach(e => metrics(e._1).add(e._2)) + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, + metrics.filter(e => driverMetrics.contains(e._1)).values.toSeq) + } + + private def isDynamicPruningFilter(e: Expression): Boolean = + e.find(_.isInstanceOf[PlanExpression[_]]).isDefined + + @transient private lazy val selectedPartitions: Array[PartitionDirectory] = { + val optimizerMetadataTimeNs = relation.location.metadataOpsTimeNs.getOrElse(0L) + val startTime = System.nanoTime() + val ret = + relation.location.listFiles( + partitionFilters.filterNot(isDynamicPruningFilter), dataFilters) + if (relation.partitionSchemaOption.isDefined) { + driverMetrics("numPartitions") = ret.length + } + setFilesNumAndSizeMetric(ret, true) + val timeTakenMs = NANOSECONDS.toMillis( + (System.nanoTime() - startTime) + optimizerMetadataTimeNs) + driverMetrics("metadataTime") = timeTakenMs + ret + }.toArray + + // We can only determine the actual partitions at runtime when a dynamic partition filter is + // present. This is because such a filter relies on information that is only available at run + // time (for instance the keys used in the other side of a join). 
+ @transient private lazy val dynamicallySelectedPartitions: Array[PartitionDirectory] = { + val dynamicPartitionFilters = partitionFilters.filter(isDynamicPruningFilter) + + if (dynamicPartitionFilters.nonEmpty) { + val startTime = System.nanoTime() + // call the file index for the files matching all filters except dynamic partition filters + val predicate = dynamicPartitionFilters.reduce(And) + val partitionColumns = relation.partitionSchema + val boundPredicate = Predicate.create(predicate.transform { + case a: AttributeReference => + val index = partitionColumns.indexWhere(a.name == _.name) + BoundReference(index, partitionColumns(index).dataType, nullable = true) + }, Nil) + val ret = selectedPartitions.filter(p => boundPredicate.eval(p.values)) + setFilesNumAndSizeMetric(ret, false) + val timeTakenMs = (System.nanoTime() - startTime) / 1000 / 1000 + driverMetrics("pruningTime") = timeTakenMs + driverMetrics("numPartitions") = ret.length + ret + } else { + selectedPartitions + } + } + + /** + * [[partitionFilters]] can contain subqueries whose results are available only at runtime so + * accessing [[selectedPartitions]] should be guarded by this method during planning + */ + private def hasPartitionsAvailableAtRunTime: Boolean = { + partitionFilters.exists(ExecSubqueryExpression.hasSubquery) + } + + private def toAttribute(colName: String): Option[Attribute] = + output.find(_.name == colName) + + // exposed for testing + lazy val bucketedScan: Boolean = { + if (relation.sparkSession.sessionState.conf.bucketingEnabled && relation.bucketSpec.isDefined) { + val spec = relation.bucketSpec.get + val bucketColumns = spec.bucketColumnNames.flatMap(n => toAttribute(n)) + bucketColumns.size == spec.bucketColumnNames.size + } else { + false + } + } + + override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = { + if (bucketedScan) { + // For bucketed columns: + // ----------------------- + // `HashPartitioning` would be used only when: + // 1. ALL the bucketing columns are being read from the table + // + // For sorted columns: + // --------------------- + // Sort ordering should be used when ALL these criteria's match: + // 1. `HashPartitioning` is being used + // 2. A prefix (or all) of the sort columns are being read from the table. + // + // Sort ordering would be over the prefix subset of `sort columns` being read + // from the table. + // eg. + // Assume (col0, col2, col3) are the columns read from the table + // If sort columns are (col0, col1), then sort ordering would be considered as (col0) + // If sort columns are (col1, col0), then sort ordering would be empty as per rule #2 + // above + val spec = relation.bucketSpec.get + val bucketColumns = spec.bucketColumnNames.flatMap(n => toAttribute(n)) + val partitioning = HashPartitioning(bucketColumns, spec.numBuckets) + val sortColumns = + spec.sortColumnNames.map(x => toAttribute(x)).takeWhile(x => x.isDefined).map(_.get) + val shouldCalculateSortOrder = + conf.getConf(SQLConf.LEGACY_BUCKETED_TABLE_SCAN_OUTPUT_ORDERING) && + sortColumns.nonEmpty && + !hasPartitionsAvailableAtRunTime + + val sortOrder = if (shouldCalculateSortOrder) { + // In case of bucketing, its possible to have multiple files belonging to the + // same bucket in a given relation. Each of these files are locally sorted + // but those files combined together are not globally sorted. 
Given that, + // the RDD partition will not be sorted even if the relation has sort columns set + // Current solution is to check if all the buckets have a single file in it + + val files = selectedPartitions.flatMap(partition => partition.files) + val bucketToFilesGrouping = + files.map(_.getPath.getName).groupBy(file => BucketingUtils.getBucketId(file)) + val singleFilePartitions = bucketToFilesGrouping.forall(p => p._2.length <= 1) + + if (singleFilePartitions) { + // TODO Currently Spark does not support writing columns sorting in descending order + // so using Ascending order. This can be fixed in future + sortColumns.map(attribute => SortOrder(attribute, Ascending)) + } else { + Nil + } + } else { + Nil + } + (partitioning, sortOrder) + } else { + (UnknownPartitioning(0), Nil) + } + } + + @transient + private lazy val pushedDownFilters = { + val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation) + dataFilters.flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown)) + } + + override lazy val metadata: Map[String, String] = { + def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]") + val location = relation.location + val locationDesc = + location.getClass.getSimpleName + seqToString(location.rootPaths) + val metadata = + Map( + "Format" -> relation.fileFormat.toString, + "ReadSchema" -> requiredSchema.catalogString, + "Batched" -> supportsColumnar.toString, + "PartitionFilters" -> seqToString(partitionFilters), + "PushedFilters" -> seqToString(pushedDownFilters), + "DataFilters" -> seqToString(dataFilters), + "Location" -> locationDesc) + + val withSelectedBucketsCount = relation.bucketSpec.map { spec => + val numSelectedBuckets = optionalBucketSet.map { b => + b.cardinality() + } getOrElse { + spec.numBuckets + } + metadata + ("SelectedBucketsCount" -> + s"$numSelectedBuckets out of ${spec.numBuckets}") + } getOrElse { + metadata + } + + withSelectedBucketsCount + } + + override def verboseStringWithOperatorId(): String = { + val metadataStr = metadata.toSeq.sorted.filterNot { + case (_, value) if (value.isEmpty || value.equals("[]")) => true + case (key, _) if (key.equals("DataFilters") || key.equals("Format")) => true + case (_, _) => false + }.map { + case (key, _) if (key.equals("Location")) => + val location = relation.location + val numPaths = location.rootPaths.length + val abbreviatedLoaction = if (numPaths <= 1) { + location.rootPaths.mkString("[", ", ", "]") + } else { + "[" + location.rootPaths.head + s", ... 
${numPaths - 1} entries]" + } + s"$key: ${location.getClass.getSimpleName} ${redact(abbreviatedLoaction)}" + case (key, value) => s"$key: ${redact(value)}" + } + + s""" + |$formattedNodeName + |${ExplainUtils.generateFieldString("Output", output)} + |${metadataStr.mkString("\n")} + |""".stripMargin + } + + lazy val inputRDD: RDD[InternalRow] = { + val readFile: (PartitionedFile) => Iterator[InternalRow] = + relation.fileFormat.buildReaderWithPartitionValues( + sparkSession = relation.sparkSession, + dataSchema = relation.dataSchema, + partitionSchema = relation.partitionSchema, + requiredSchema = requiredSchema, + filters = pushedDownFilters, + options = relation.options, + hadoopConf = relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options)) + + val readRDD = if (bucketedScan) { + createBucketedReadRDD(relation.bucketSpec.get, readFile, dynamicallySelectedPartitions, + relation) + } else { + createNonBucketedReadRDD(readFile, dynamicallySelectedPartitions, relation) + } + sendDriverMetrics() + readRDD + } + + override def inputRDDs(): Seq[RDD[InternalRow]] = { + inputRDD :: Nil + } + + /** SQL metrics generated only for scans using dynamic partition pruning. */ + private lazy val staticMetrics = if (partitionFilters.filter(isDynamicPruningFilter).nonEmpty) { + Map("staticFilesNum" -> SQLMetrics.createMetric(sparkContext, "static number of files read"), + "staticFilesSize" -> SQLMetrics.createSizeMetric(sparkContext, "static size of files read")) + } else { + Map.empty[String, SQLMetric] + } + + /** Helper for computing total number and size of files in selected partitions. */ + private def setFilesNumAndSizeMetric( + partitions: Seq[PartitionDirectory], + static: Boolean): Unit = { + val filesNum = partitions.map(_.files.size.toLong).sum + val filesSize = partitions.map(_.files.map(_.getLen).sum).sum + if (!static || partitionFilters.filter(isDynamicPruningFilter).isEmpty) { + driverMetrics("numFiles") = filesNum + driverMetrics("filesSize") = filesSize + } else { + driverMetrics("staticFilesNum") = filesNum + driverMetrics("staticFilesSize") = filesSize + } + } + + override lazy val metrics = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of files read"), + "metadataTime" -> SQLMetrics.createTimingMetric(sparkContext, "metadata time"), + "filesSize" -> SQLMetrics.createSizeMetric(sparkContext, "size of files read"), + "pruningTime" -> + SQLMetrics.createTimingMetric(sparkContext, "dynamic partition pruning time") + ) ++ { + // Tracking scan time has overhead, we can't afford to do it for each row, and can only do + // it for each batch. 
+ if (supportsColumnar) { + Some("scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time")) + } else { + None + } + } ++ { + if (relation.partitionSchemaOption.isDefined) { + Some("numPartitions" -> SQLMetrics.createMetric(sparkContext, "number of partitions read")) + } else { + None + } + } ++ staticMetrics + + protected override def doExecute(): RDD[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + if (isPushDown || needsUnsafeRowConversion) { + inputRDD.mapPartitionsWithIndexInternal { (index, iter) => + val toUnsafe = UnsafeProjection.create(schema) + toUnsafe.initialize(index) + iter.map { row => + numOutputRows += 1 + toUnsafe(row) + } + } + } else { + inputRDD.mapPartitionsInternal { iter => + iter.map { row => + numOutputRows += 1 + row + } + } + } + } + + protected override def doExecuteColumnar(): RDD[ColumnarBatch] = { + val numOutputRows = longMetric("numOutputRows") + val scanTime = longMetric("scanTime") + inputRDD.asInstanceOf[RDD[ColumnarBatch]].mapPartitionsInternal { batches => + new Iterator[ColumnarBatch] { + + override def hasNext: Boolean = { + // The `FileScanRDD` returns an iterator which scans the file during the `hasNext` call. + val startNs = System.nanoTime() + val res = batches.hasNext + scanTime += NANOSECONDS.toMillis(System.nanoTime() - startNs) + res + } + + override def next(): ColumnarBatch = { + val batch = batches.next() + numOutputRows += batch.numRows() + batch + } + } + } + } + + override val nodeNamePrefix: String = "File" + + /** + * Create an RDD for bucketed reads. + * The non-bucketed variant of this function is [[createNonBucketedReadRDD]]. + * + * The algorithm is pretty simple: each RDD partition being returned should include all the files + * with the same bucket id from all the given Hive partitions. + * + * @param bucketSpec the bucketing spec. + * @param readFile a function to read each (part of a) file. + * @param selectedPartitions Hive-style partition that are part of the read. + * @param fsRelation [[HadoopFsRelation]] associated with the read. + */ + private def createBucketedReadRDD( + bucketSpec: BucketSpec, + readFile: (PartitionedFile) => Iterator[InternalRow], + selectedPartitions: Array[PartitionDirectory], + fsRelation: HadoopFsRelation): RDD[InternalRow] = { + logInfo(s"Planning with ${bucketSpec.numBuckets} buckets") + val filesGroupedToBuckets = + selectedPartitions.flatMap { p => + p.files.map { f => + PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values) + } + }.groupBy { f => + BucketingUtils + .getBucketId(new Path(f.filePath).getName) + .getOrElse(sys.error(s"Invalid bucket file ${f.filePath}")) + } + + val prunedFilesGroupedToBuckets = if (optionalBucketSet.isDefined) { + val bucketSet = optionalBucketSet.get + filesGroupedToBuckets.filter { + f => bucketSet.get(f._1) + } + } else { + filesGroupedToBuckets + } + + val filePartitions = Seq.tabulate(bucketSpec.numBuckets) { bucketId => + FilePartition(bucketId, prunedFilesGroupedToBuckets.getOrElse(bucketId, Array.empty)) + } + + if (isPushDown) { + new FileScanRDDPushDown(fsRelation.sparkSession, filePartitions, requiredSchema, output, + relation.dataSchema, ndpOperators, partiTionColumn, supportsColumnar, fsRelation.fileFormat) + } else { + new FileScanRDD(fsRelation.sparkSession, readFile, filePartitions) + } + } + + /** + * Create an RDD for non-bucketed reads. + * The bucketed variant of this function is [[createBucketedReadRDD]]. + * + * @param readFile a function to read each (part of a) file. 
+ * @param selectedPartitions Hive-style partition that are part of the read. + * @param fsRelation [[HadoopFsRelation]] associated with the read. + */ + private def createNonBucketedReadRDD( + readFile: (PartitionedFile) => Iterator[InternalRow], + selectedPartitions: Array[PartitionDirectory], + fsRelation: HadoopFsRelation): RDD[InternalRow] = { + val openCostInBytes = fsRelation.sparkSession.sessionState.conf.filesOpenCostInBytes + val maxSplitBytes = + FilePartition.maxSplitBytes(fsRelation.sparkSession, selectedPartitions) + logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " + + s"open cost is considered as scanning $openCostInBytes bytes.") + + val splitFiles = selectedPartitions.flatMap { partition => + partition.files.flatMap { file => + // getPath() is very expensive so we only want to call it once in this block: + val filePath = file.getPath + val isSplitable = relation.fileFormat.isSplitable( + relation.sparkSession, relation.options, filePath) + PartitionedFileUtil.splitFiles( + sparkSession = relation.sparkSession, + file = file, + filePath = filePath, + isSplitable = isSplitable, + maxSplitBytes = maxSplitBytes, + partitionValues = partition.values + ) + } + }.sortBy(_.length)(implicitly[Ordering[Long]].reverse) + + val partitions = + FilePartition.getFilePartitions(relation.sparkSession, splitFiles, maxSplitBytes) + + if (isPushDown) { + new FileScanRDDPushDown(fsRelation.sparkSession, partitions, requiredSchema, output, + relation.dataSchema, ndpOperators, partiTionColumn, supportsColumnar, fsRelation.fileFormat) + } else { + // TODO 重写一个FileScanRDD 重新调用 + new FileScanRDD(fsRelation.sparkSession, readFile, partitions) + } + } + + override def doCanonicalize(): FileSourceScanExec = { + val ref = dataFilters.flatMap(_.references) + val exprIds = ref.map(x => (x.name, x.exprId)).toMap + val filterOutput = relation.dataSchema.fields.filter { x => + ref.map(_.name).contains(x.name) + }.map { x => + AttributeReference(x.name, x.dataType, x.nullable, x.metadata)(exprIds.getOrElse(x.name, + NamedExpression.newExprId)) + }.toSeq + FileSourceScanExec( + relation, + output.map(QueryPlan.normalizeExpressions(_, output)), + requiredSchema, + QueryPlan.normalizePredicates(partitionFilters, output), + optionalBucketSet, + QueryPlan.normalizePredicates(dataFilters, filterOutput), + None, partiTionColumn.map(QueryPlan.normalizeExpressions(_, output))) + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala new file mode 100644 index 00000000..52f36875 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.io.{BufferedWriter, OutputStreamWriter} +import java.util.UUID + +import org.apache.hadoop.fs.Path + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{AnalysisException, Row, SparkSession} +import org.apache.spark.sql.catalyst.{InternalRow, QueryPlanningTracker} +import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker +import org.apache.spark.sql.catalyst.expressions.codegen.ByteCodeStats +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat +import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan} +import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters +import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange} +import org.apache.spark.sql.execution.ndp.NdpPushDown +import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.util.Utils + +/** + * The primary workflow for executing relational queries using Spark. Designed to allow easy + * access to the intermediate phases of query execution for developers. + * + * While this is not a public class, we should avoid changing the function names for the sake of + * changing them, because a lot of developers use the feature for debugging. + */ +class QueryExecution( + val sparkSession: SparkSession, + val logical: LogicalPlan, + val tracker: QueryPlanningTracker = new QueryPlanningTracker) { + + // TODO: Move the planner an optimizer into here from SessionState. + protected def planner = sparkSession.sessionState.planner + + def assertAnalyzed(): Unit = analyzed + + def assertSupported(): Unit = { + if (sparkSession.sessionState.conf.isUnsupportedOperationCheckEnabled) { + UnsupportedOperationChecker.checkForBatch(analyzed) + } + } + + lazy val analyzed: LogicalPlan = executePhase(QueryPlanningTracker.ANALYSIS) { + // We can't clone `logical` here, which will reset the `_analyzed` flag. + sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker) + } + + lazy val withCachedData: LogicalPlan = sparkSession.withActive { + assertAnalyzed() + assertSupported() + // clone the plan to avoid sharing the plan instance between different stages like analyzing, + // optimizing and planning. + sparkSession.sharedState.cacheManager.useCachedData(analyzed.clone()) + } + + lazy val optimizedPlan: LogicalPlan = executePhase(QueryPlanningTracker.OPTIMIZATION) { + // clone the plan to avoid sharing the plan instance between different stages like analyzing, + // optimizing and planning. 
+ sparkSession.sessionState.optimizer.executeAndTrack(withCachedData.clone(), tracker) + } + + private def assertOptimized(): Unit = optimizedPlan + + lazy val sparkPlan: SparkPlan = { + // We need to materialize the optimizedPlan here because sparkPlan is also tracked under + // the planning phase + assertOptimized() + executePhase(QueryPlanningTracker.PLANNING) { + // Clone the logical plan here, in case the planner rules change the states of the logical + // plan. + QueryExecution.createSparkPlan(sparkSession, planner, optimizedPlan.clone()) + } + } + + // executedPlan should not be used to initialize any SparkPlan. It should be + // only used for execution. + lazy val executedPlan: SparkPlan = { + // We need to materialize the optimizedPlan here, before tracking the planning phase, to ensure + // that the optimization time is not counted as part of the planning phase. + assertOptimized() + executePhase(QueryPlanningTracker.PLANNING) { + // clone the plan to avoid sharing the plan instance between different stages like analyzing, + // optimizing and planning. + QueryExecution.prepareForExecution(preparations, sparkPlan.clone()) + } + } + + /** + * Internal version of the RDD. Avoids copies and has no schema. + * Note for callers: Spark may apply various optimization including reusing object: this means + * the row is valid only for the iteration it is retrieved. You should avoid storing row and + * accessing after iteration. (Calling `collect()` is one of known bad usage.) + * If you want to store these rows into collection, please apply some converter or copy row + * which produces new object per iteration. + * Given QueryExecution is not a public class, end users are discouraged to use this: please + * use `Dataset.rdd` instead where conversion will be applied. + */ + lazy val toRdd: RDD[InternalRow] = new SQLExecutionRDD( + executedPlan.execute(), sparkSession.sessionState.conf) + + /** Get the metrics observed during the execution of the query plan. */ + def observedMetrics: Map[String, Row] = CollectMetricsExec.collect(executedPlan) + + protected def preparations: Seq[Rule[SparkPlan]] = { + QueryExecution.preparations(sparkSession, + Option(InsertAdaptiveSparkPlan(AdaptiveExecutionContext(sparkSession, this)))) + } + + private def executePhase[T](phase: String)(block: => T): T = sparkSession.withActive { + tracker.measurePhase(phase)(block) + } + + def simpleString: String = simpleString(false) + + def simpleString(formatted: Boolean): String = withRedaction { + val concat = new PlanStringConcat() + concat.append("== Physical Plan ==\n") + if (formatted) { + try { + ExplainUtils.processPlan(executedPlan, concat.append) + } catch { + case e: AnalysisException => concat.append(e.toString) + case e: IllegalArgumentException => concat.append(e.toString) + } + } else { + QueryPlan.append(executedPlan, concat.append, verbose = false, addSuffix = false) + } + concat.append("\n") + concat.toString + } + + def explainString(mode: ExplainMode): String = { + val queryExecution = if (logical.isStreaming) { + // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the + // output mode does not matter since there is no `Sink`. 
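// A minimal usage sketch of the phases defined above, assuming a local SparkSession;
// the object name is illustrative. Each lazy val (analyzed, optimizedPlan, executedPlan)
// can be inspected independently through Dataset.queryExecution, and explainString backs
// Dataset.explain ("formatted" corresponds to FormattedMode).
import org.apache.spark.sql.SparkSession

object QueryExecutionPhasesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("qe-phases-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 2, 3).toDF("id").filter($"id" > 1)
    val qe = df.queryExecution

    println(qe.analyzed)      // resolved logical plan
    println(qe.optimizedPlan) // after the optimizer has run
    println(qe.executedPlan)  // physical plan with the preparation rules applied

    df.explain("formatted")   // goes through explainString(FormattedMode)

    spark.stop()
  }
}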
+ new IncrementalExecution( + sparkSession, logical, OutputMode.Append(), "", + UUID.randomUUID, UUID.randomUUID, 0, OffsetSeqMetadata(0, 0)) + } else { + this + } + + mode match { + case SimpleMode => + queryExecution.simpleString + case ExtendedMode => + queryExecution.toString + case CodegenMode => + try { + org.apache.spark.sql.execution.debug.codegenString(queryExecution.executedPlan) + } catch { + case e: AnalysisException => e.toString + } + case CostMode => + queryExecution.stringWithStats + case FormattedMode => + queryExecution.simpleString(formatted = true) + } + } + + private def writePlans(append: String => Unit, maxFields: Int): Unit = { + val (verbose, addSuffix) = (true, false) + append("== Parsed Logical Plan ==\n") + QueryPlan.append(logical, append, verbose, addSuffix, maxFields) + append("\n== Analyzed Logical Plan ==\n") + try { + append( + truncatedString( + analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}"), ", ", maxFields) + ) + append("\n") + QueryPlan.append(analyzed, append, verbose, addSuffix, maxFields) + append("\n== Optimized Logical Plan ==\n") + QueryPlan.append(optimizedPlan, append, verbose, addSuffix, maxFields) + append("\n== Physical Plan ==\n") + QueryPlan.append(executedPlan, append, verbose, addSuffix, maxFields) + } catch { + case e: AnalysisException => append(e.toString) + } + } + + override def toString: String = withRedaction { + val concat = new PlanStringConcat() + writePlans(concat.append, SQLConf.get.maxToStringFields) + concat.toString + } + + def stringWithStats: String = withRedaction { + val concat = new PlanStringConcat() + val maxFields = SQLConf.get.maxToStringFields + + // trigger to compute stats for logical plans + try { + optimizedPlan.stats + } catch { + case e: AnalysisException => concat.append(e.toString + "\n") + } + // only show optimized logical plan and physical plan + concat.append("== Optimized Logical Plan ==\n") + QueryPlan.append(optimizedPlan, concat.append, verbose = true, addSuffix = true, maxFields) + concat.append("\n== Physical Plan ==\n") + QueryPlan.append(executedPlan, concat.append, verbose = true, addSuffix = false, maxFields) + concat.append("\n") + concat.toString + } + + /** + * Redact the sensitive information in the given string. + */ + private def withRedaction(message: String): String = { + Utils.redact(sparkSession.sessionState.conf.stringRedactionPattern, message) + } + + /** A special namespace for commands that can be used to debug query execution. */ + // scalastyle:off + object debug { + // scalastyle:on + + /** + * Prints to stdout all the generated code found in this plan (i.e. the output of each + * WholeStageCodegen subtree). + */ + def codegen(): Unit = { + // scalastyle:off println + println(org.apache.spark.sql.execution.debug.codegenString(executedPlan)) + // scalastyle:on println + } + + /** + * Get WholeStageCodegenExec subtrees and the codegen in a query plan + * + * @return Sequence of WholeStageCodegen subtrees and corresponding codegen + */ + def codegenToSeq(): Seq[(String, String, ByteCodeStats)] = { + org.apache.spark.sql.execution.debug.codegenStringSeq(executedPlan) + } + + /** + * Dumps debug information about query execution into the specified file. + * + * @param maxFields maximum number of fields converted to string representation. 
+ */ + def toFile(path: String, maxFields: Int = Int.MaxValue): Unit = { + val filePath = new Path(path) + val fs = filePath.getFileSystem(sparkSession.sessionState.newHadoopConf()) + val writer = new BufferedWriter(new OutputStreamWriter(fs.create(filePath))) + val append = (s: String) => { + writer.write(s) + } + try { + writePlans(append, maxFields) + writer.write("\n== Whole Stage Codegen ==\n") + org.apache.spark.sql.execution.debug.writeCodegen(writer.write, executedPlan) + } finally { + writer.close() + } + } + } +} + +object QueryExecution { + /** + * Construct a sequence of rules that are used to prepare a planned [[SparkPlan]] for execution. + * These rules will make sure subqueries are planned, make use the data partitioning and ordering + * are correct, insert whole stage code gen, and try to reduce the work done by reusing exchanges + * and subqueries. + */ + private[execution] def preparations( + sparkSession: SparkSession, + adaptiveExecutionRule: Option[InsertAdaptiveSparkPlan] = None): Seq[Rule[SparkPlan]] = { + // `AdaptiveSparkPlanExec` is a leaf node. If inserted, all the following rules will be no-op + // as the original plan is hidden behind `AdaptiveSparkPlanExec`. + adaptiveExecutionRule.toSeq ++ + Seq( + PlanDynamicPruningFilters(sparkSession), + PlanSubqueries(sparkSession), + NdpPushDown(sparkSession), + EnsureRequirements(sparkSession.sessionState.conf), + ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf, + sparkSession.sessionState.columnarRules), + CollapseCodegenStages(sparkSession.sessionState.conf), + ReuseExchange(sparkSession.sessionState.conf), + ReuseSubquery(sparkSession.sessionState.conf) + ) + } + + /** + * Prepares a planned [[SparkPlan]] for execution by inserting shuffle operations and internal + * row format conversions as needed. + */ + private[execution] def prepareForExecution( + preparations: Seq[Rule[SparkPlan]], + plan: SparkPlan): SparkPlan = { + preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) } + } + + /** + * Transform a [[LogicalPlan]] into a [[SparkPlan]]. + * + * Note that the returned physical plan still needs to be prepared for execution. + */ + def createSparkPlan( + sparkSession: SparkSession, + planner: SparkPlanner, + plan: LogicalPlan): SparkPlan = { + // TODO: We use next(), i.e. take the first plan returned by the planner, here for now, + // but we will implement to choose the best plan. + planner.plan(ReturnAnswer(plan)).next() + } + + /** + * Prepare the [[SparkPlan]] for execution. + */ + def prepareExecutedPlan(spark: SparkSession, plan: SparkPlan): SparkPlan = { + prepareForExecution(preparations(spark), plan) + } + + /** + * Transform the subquery's [[LogicalPlan]] into a [[SparkPlan]] and prepare the resulting + * [[SparkPlan]] for execution. + */ + def prepareExecutedPlan(spark: SparkSession, plan: LogicalPlan): SparkPlan = { + val sparkPlan = createSparkPlan(spark, spark.sessionState.planner, plan.clone()) + prepareExecutedPlan(spark, sparkPlan) + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala new file mode 100644 index 00000000..1b8bc4a5 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.adaptive.LogicalQueryStageStrategy +import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, FileSourceStrategy} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Strategy +import org.apache.spark.sql.internal.SQLConf + +class SparkPlanner( + val session: SparkSession, + val conf: SQLConf, + val experimentalMethods: ExperimentalMethods) + extends SparkStrategies { + + def numPartitions: Int = conf.numShufflePartitions + + override def strategies: Seq[Strategy] = + experimentalMethods.extraStrategies ++ + extraPlanningStrategies ++ ( + LogicalQueryStageStrategy :: + PythonEvals :: + new DataSourceV2Strategy(session) :: + FileSourceStrategy :: + DataSourceStrategy(conf) :: + SpecialLimits :: + Aggregation :: + Window :: + JoinSelection :: + InMemoryScans :: + BasicOperators :: Nil) + + /** + * Override to add extra planning strategies to the planner. These strategies are tried after + * the strategies defined in [[ExperimentalMethods]], and before the regular strategies. + */ + def extraPlanningStrategies: Seq[Strategy] = Nil + + override protected def collectPlaceholders(plan: SparkPlan): Seq[(SparkPlan, LogicalPlan)] = { + plan.collect { + case placeholder @ PlanLater(logicalPlan) => placeholder -> logicalPlan + } + } + + override protected def prunePlans(plans: Iterator[SparkPlan]): Iterator[SparkPlan] = { + // TODO: We will need to prune bad plans when we improve plan space exploration + // to prevent combinatorial explosion. + plans + } + + /** + * Used to build table scan operators where complex projection and filtering are done using + * separate physical operators. This function returns the given scan operator with Project and + * Filter nodes added only when needed. For example, a Project operator is only used when the + * final desired output requires complex expressions to be evaluated or when columns can be + * further eliminated out after filtering has been done. + * + * The `prunePushedDownFilters` parameter is used to remove those filters that can be optimized + * away by the filter pushdown optimization. + * + * The required attributes for both filtering and expression evaluation are passed to the + * provided `scanBuilder` function so that it can avoid unnecessary column materialization. 
+ */ + def pruneFilterProject( + projectList: Seq[NamedExpression], + filterPredicates: Seq[Expression], + prunePushedDownFilters: Seq[Expression] => Seq[Expression], + scanBuilder: Seq[Attribute] => SparkPlan, + selectivity: Option[Double]): SparkPlan = { + + val projectSet = AttributeSet(projectList.flatMap(_.references)) + val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) + val filterCondition: Option[Expression] = + prunePushedDownFilters(filterPredicates).reduceLeftOption(catalyst.expressions.And) + + // Right now we still use a projection even if the only evaluation is applying an alias + // to a column. Since this is a no-op, it could be avoided. However, using this + // optimization with the current implementation would change the output schema. + // TODO: Decouple final output schema from expression evaluation so this copy can be + // avoided safely. + + if (AttributeSet(projectList.map(_.toAttribute)) == projectSet && + filterSet.subsetOf(projectSet)) { + // When it is possible to just use column pruning to get the right projection and + // when the columns of this projection are enough to evaluate all filter conditions, + // just do a scan followed by a filter, with no extra project. + val scan = scanBuilder(projectList.asInstanceOf[Seq[Attribute]]) + filterCondition.map(FilterExec(_, scan, selectivity)).getOrElse(scan) + } else { + val scan = scanBuilder((projectSet ++ filterSet).toSeq) + ProjectExec(projectList, filterCondition.map(FilterExec(_, scan, selectivity)) + .getOrElse(scan)) + } + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala new file mode 100644 index 00000000..8240e7d7 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -0,0 +1,787 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{execution, AnalysisException, Strategy} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression +import org.apache.spark.sql.catalyst.optimizer.NormalizeFloatingNumbers +import org.apache.spark.sql.catalyst.planning._ +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.FilterEstimation +import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.catalyst.streaming.InternalOutputModes +import org.apache.spark.sql.execution.aggregate.AggUtils +import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} +import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec +import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} +import org.apache.spark.sql.execution.python._ +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.sources.MemoryPlan +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} +import org.apache.spark.sql.types.StructType + +/** + * Converts a logical plan into zero or more SparkPlans. This API is exposed for experimenting + * with the query planner and is not designed to be stable across spark releases. Developers + * writing libraries should instead consider using the stable APIs provided in + * [[org.apache.spark.sql.sources]] + */ +abstract class SparkStrategy extends GenericStrategy[SparkPlan] { + + override protected def planLater(plan: LogicalPlan): SparkPlan = PlanLater(plan) +} + +case class PlanLater(plan: LogicalPlan) extends LeafExecNode { + + override def output: Seq[Attribute] = plan.output + + protected override def doExecute(): RDD[InternalRow] = { + throw new UnsupportedOperationException() + } +} + +abstract class SparkStrategies extends QueryPlanner[SparkPlan] { + self: SparkPlanner => + + override def plan(plan: LogicalPlan): Iterator[SparkPlan] = { + super.plan(plan).map { p => + val logicalPlan = plan match { + case ReturnAnswer(rootPlan) => rootPlan + case _ => plan + } + p.setLogicalLink(logicalPlan) + p + } + } + + /** + * Plans special cases of limit operators. 
+ */ + object SpecialLimits extends Strategy { + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ReturnAnswer(rootPlan) => rootPlan match { + case Limit(IntegerLiteral(limit), Sort(order, true, child)) + if limit < conf.topKSortFallbackThreshold => + TakeOrderedAndProjectExec(limit, order, child.output, planLater(child)) :: Nil + case Limit(IntegerLiteral(limit), Project(projectList, Sort(order, true, child))) + if limit < conf.topKSortFallbackThreshold => + TakeOrderedAndProjectExec(limit, order, projectList, planLater(child)) :: Nil + case Limit(IntegerLiteral(limit), child) => + CollectLimitExec(limit, planLater(child)) :: Nil + case Tail(IntegerLiteral(limit), child) => + CollectTailExec(limit, planLater(child)) :: Nil + case other => planLater(other) :: Nil + } + case Limit(IntegerLiteral(limit), Sort(order, true, child)) + if limit < conf.topKSortFallbackThreshold => + TakeOrderedAndProjectExec(limit, order, child.output, planLater(child)) :: Nil + case Limit(IntegerLiteral(limit), Project(projectList, Sort(order, true, child))) + if limit < conf.topKSortFallbackThreshold => + TakeOrderedAndProjectExec(limit, order, projectList, planLater(child)) :: Nil + case _ => Nil + } + } + + /** + * Select the proper physical plan for join based on join strategy hints, the availability of + * equi-join keys and the sizes of joining relations. Below are the existing join strategies, + * their characteristics and their limitations. + * + * - Broadcast hash join (BHJ): + * Only supported for equi-joins, while the join keys do not need to be sortable. + * Supported for all join types except full outer joins. + * BHJ usually performs faster than the other join algorithms when the broadcast side is + * small. However, broadcasting tables is a network-intensive operation and it could cause + * OOM or perform badly in some cases, especially when the build/broadcast side is big. + * + * - Shuffle hash join: + * Only supported for equi-joins, while the join keys do not need to be sortable. + * Supported for all join types except full outer joins. + * + * - Shuffle sort merge join (SMJ): + * Only supported for equi-joins and the join keys have to be sortable. + * Supported for all join types. + * + * - Broadcast nested loop join (BNLJ): + * Supports both equi-joins and non-equi-joins. + * Supports all the join types, but the implementation is optimized for: + * 1) broadcasting the left side in a right outer join; + * 2) broadcasting the right side in a left outer, left semi, left anti or existence join; + * 3) broadcasting either side in an inner-like join. + * For other cases, we need to scan the data multiple times, which can be rather slow. + * + * - Shuffle-and-replicate nested loop join (a.k.a. cartesian product join): + * Supports both equi-joins and non-equi-joins. + * Supports only inner like joins. + */ + object JoinSelection extends Strategy with PredicateHelper { + + /** + * Matches a plan whose output should be small enough to be used in broadcast join. + */ + private def canBroadcast(plan: LogicalPlan): Boolean = { + plan.stats.sizeInBytes >= 0 && plan.stats.sizeInBytes <= conf.autoBroadcastJoinThreshold + } + + /** + * Matches a plan whose single partition should be small enough to build a hash table. + * + * Note: this assume that the number of partition is fixed, requires additional work if it's + * dynamic. 
+ */ + private def canBuildLocalHashMap(plan: LogicalPlan): Boolean = { + plan.stats.sizeInBytes < conf.autoBroadcastJoinThreshold * conf.numShufflePartitions + } + + /** + * Returns whether plan a is much smaller (3X) than plan b. + * + * The cost to build hash map is higher than sorting, we should only build hash map on a table + * that is much smaller than other one. Since we does not have the statistic for number of rows, + * use the size of bytes here as estimation. + */ + private def muchSmaller(a: LogicalPlan, b: LogicalPlan): Boolean = { + a.stats.sizeInBytes * 3 <= b.stats.sizeInBytes + } + + private def canBuildRight(joinType: JoinType): Boolean = joinType match { + case _: InnerLike | LeftOuter | LeftSemi | LeftAnti | _: ExistenceJoin => true + case _ => false + } + + private def canBuildLeft(joinType: JoinType): Boolean = joinType match { + case _: InnerLike | RightOuter => true + case _ => false + } + + private def getBuildSide( + wantToBuildLeft: Boolean, + wantToBuildRight: Boolean, + left: LogicalPlan, + right: LogicalPlan): Option[BuildSide] = { + if (wantToBuildLeft && wantToBuildRight) { + // returns the smaller side base on its estimated physical size, if we want to build the + // both sides. + Some(getSmallerSide(left, right)) + } else if (wantToBuildLeft) { + Some(BuildLeft) + } else if (wantToBuildRight) { + Some(BuildRight) + } else { + None + } + } + + private def getSmallerSide(left: LogicalPlan, right: LogicalPlan) = { + if (right.stats.sizeInBytes <= left.stats.sizeInBytes) BuildRight else BuildLeft + } + + private def hintToBroadcastLeft(hint: JoinHint): Boolean = { + hint.leftHint.exists(_.strategy.contains(BROADCAST)) + } + + private def hintToBroadcastRight(hint: JoinHint): Boolean = { + hint.rightHint.exists(_.strategy.contains(BROADCAST)) + } + + private def hintToShuffleHashLeft(hint: JoinHint): Boolean = { + hint.leftHint.exists(_.strategy.contains(SHUFFLE_HASH)) + } + + private def hintToShuffleHashRight(hint: JoinHint): Boolean = { + hint.rightHint.exists(_.strategy.contains(SHUFFLE_HASH)) + } + + private def hintToSortMergeJoin(hint: JoinHint): Boolean = { + hint.leftHint.exists(_.strategy.contains(SHUFFLE_MERGE)) || + hint.rightHint.exists(_.strategy.contains(SHUFFLE_MERGE)) + } + + private def hintToShuffleReplicateNL(hint: JoinHint): Boolean = { + hint.leftHint.exists(_.strategy.contains(SHUFFLE_REPLICATE_NL)) || + hint.rightHint.exists(_.strategy.contains(SHUFFLE_REPLICATE_NL)) + } + + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + + // If it is an equi-join, we first look at the join hints w.r.t. the following order: + // 1. broadcast hint: pick broadcast hash join if the join type is supported. If both sides + // have the broadcast hints, choose the smaller side (based on stats) to broadcast. + // 2. sort merge hint: pick sort merge join if join keys are sortable. + // 3. shuffle hash hint: We pick shuffle hash join if the join type is supported. If both + // sides have the shuffle hash hints, choose the smaller side (based on stats) as the + // build side. + // 4. shuffle replicate NL hint: pick cartesian product if join type is inner like. + // + // If there is no hint or the hints are not applicable, we follow these rules one by one: + // 1. Pick broadcast hash join if one side is small enough to broadcast, and the join type + // is supported. If both sides are small, choose the smaller side (based on stats) + // to broadcast. + // 2. 
Pick shuffle hash join if one side is small enough to build local hash map, and is + // much smaller than the other side, and `spark.sql.join.preferSortMergeJoin` is false. + // 3. Pick sort merge join if the join keys are sortable. + // 4. Pick cartesian product if join type is inner like. + // 5. Pick broadcast nested loop join as the final solution. It may OOM but we don't have + // other choice. + case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right, hint) => + def createBroadcastHashJoin(buildLeft: Boolean, buildRight: Boolean) = { + val wantToBuildLeft = canBuildLeft(joinType) && buildLeft + val wantToBuildRight = canBuildRight(joinType) && buildRight + getBuildSide(wantToBuildLeft, wantToBuildRight, left, right).map { buildSide => + Seq(joins.BroadcastHashJoinExec( + leftKeys, + rightKeys, + joinType, + buildSide, + condition, + planLater(left), + planLater(right))) + } + } + + def createShuffleHashJoin(buildLeft: Boolean, buildRight: Boolean) = { + val wantToBuildLeft = canBuildLeft(joinType) && buildLeft + val wantToBuildRight = canBuildRight(joinType) && buildRight + getBuildSide(wantToBuildLeft, wantToBuildRight, left, right).map { buildSide => + Seq(joins.ShuffledHashJoinExec( + leftKeys, + rightKeys, + joinType, + buildSide, + condition, + planLater(left), + planLater(right))) + } + } + + def createSortMergeJoin() = { + if (RowOrdering.isOrderable(leftKeys)) { + Some(Seq(joins.SortMergeJoinExec( + leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)))) + } else { + None + } + } + + def createCartesianProduct() = { + if (joinType.isInstanceOf[InnerLike]) { + Some(Seq(joins.CartesianProductExec(planLater(left), planLater(right), condition))) + } else { + None + } + } + + def createJoinWithoutHint() = { + createBroadcastHashJoin( + canBroadcast(left) && !hint.leftHint.exists(_.strategy.contains(NO_BROADCAST_HASH)), + canBroadcast(right) && !hint.rightHint.exists(_.strategy.contains(NO_BROADCAST_HASH))) + .orElse { + if (!conf.preferSortMergeJoin) { + createShuffleHashJoin( + canBuildLocalHashMap(left) && muchSmaller(left, right), + canBuildLocalHashMap(right) && muchSmaller(right, left)) + } else { + None + } + } + .orElse(createSortMergeJoin()) + .orElse(createCartesianProduct()) + .getOrElse { + // This join could be very slow or OOM + val buildSide = getSmallerSide(left, right) + Seq(joins.BroadcastNestedLoopJoinExec( + planLater(left), planLater(right), buildSide, joinType, condition)) + } + } + + createBroadcastHashJoin(hintToBroadcastLeft(hint), hintToBroadcastRight(hint)) + .orElse { if (hintToSortMergeJoin(hint)) createSortMergeJoin() else None } + .orElse(createShuffleHashJoin(hintToShuffleHashLeft(hint), hintToShuffleHashRight(hint))) + .orElse { if (hintToShuffleReplicateNL(hint)) createCartesianProduct() else None } + .getOrElse(createJoinWithoutHint()) + + // If it is not an equi-join, we first look at the join hints w.r.t. the following order: + // 1. broadcast hint: pick broadcast nested loop join. If both sides have the broadcast + // hints, choose the smaller side (based on stats) to broadcast for inner and full joins, + // choose the left side for right join, and choose right side for left join. + // 2. shuffle replicate NL hint: pick cartesian product if join type is inner like. + // + // If there is no hint or the hints are not applicable, we follow these rules one by one: + // 1. Pick broadcast nested loop join if one side is small enough to broadcast. 
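// A small, self-contained sketch of how the hint and size rules above play out in practice,
// assuming a local SparkSession; object and column names are illustrative, and the size
// threshold referenced is the default spark.sql.autoBroadcastJoinThreshold.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.broadcast

object JoinSelectionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("join-selection-sketch").getOrCreate()
    import spark.implicits._

    val big   = spark.range(1000000L).toDF("id")
    val small = Seq((1L, "a"), (2L, "b")).toDF("id", "tag")

    // With the small side under the broadcast threshold, createJoinWithoutHint picks
    // a broadcast hash join.
    big.join(small, "id").explain()

    // Disable size-based broadcasting and hint SHUFFLE_MERGE: the sort merge branch wins
    // because the join keys are sortable.
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1L)
    big.join(small.hint("SHUFFLE_MERGE"), "id").explain()

    // An explicit broadcast hint restores the broadcast hash join regardless of statistics.
    big.join(broadcast(small), "id").explain()

    spark.stop()
  }
}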
If only left + // side is broadcast-able and it's left join, or only right side is broadcast-able and + // it's right join, we skip this rule. If both sides are small, broadcasts the smaller + // side for inner and full joins, broadcasts the left side for right join, and broadcasts + // right side for left join. + // 2. Pick cartesian product if join type is inner like. + // 3. Pick broadcast nested loop join as the final solution. It may OOM but we don't have + // other choice. It broadcasts the smaller side for inner and full joins, broadcasts the + // left side for right join, and broadcasts right side for left join. + case logical.Join(left, right, joinType, condition, hint) => + val desiredBuildSide = if (joinType.isInstanceOf[InnerLike] || joinType == FullOuter) { + getSmallerSide(left, right) + } else { + // For perf reasons, `BroadcastNestedLoopJoinExec` prefers to broadcast left side if + // it's a right join, and broadcast right side if it's a left join. + // TODO: revisit it. If left side is much smaller than the right side, it may be better + // to broadcast the left side even if it's a left join. + if (canBuildLeft(joinType)) BuildLeft else BuildRight + } + + def createBroadcastNLJoin(buildLeft: Boolean, buildRight: Boolean) = { + val maybeBuildSide = if (buildLeft && buildRight) { + Some(desiredBuildSide) + } else if (buildLeft) { + Some(BuildLeft) + } else if (buildRight) { + Some(BuildRight) + } else { + None + } + + maybeBuildSide.map { buildSide => + Seq(joins.BroadcastNestedLoopJoinExec( + planLater(left), planLater(right), buildSide, joinType, condition)) + } + } + + def createCartesianProduct() = { + if (joinType.isInstanceOf[InnerLike]) { + Some(Seq(joins.CartesianProductExec(planLater(left), planLater(right), condition))) + } else { + None + } + } + + def createJoinWithoutHint() = { + createBroadcastNLJoin(canBroadcast(left), canBroadcast(right)) + .orElse(createCartesianProduct()) + .getOrElse { + // This join could be very slow or OOM + Seq(joins.BroadcastNestedLoopJoinExec( + planLater(left), planLater(right), desiredBuildSide, joinType, condition)) + } + } + + createBroadcastNLJoin(hintToBroadcastLeft(hint), hintToBroadcastRight(hint)) + .orElse { if (hintToShuffleReplicateNL(hint)) createCartesianProduct() else None } + .getOrElse(createJoinWithoutHint()) + + + // --- Cases where this strategy does not apply --------------------------------------------- + case _ => Nil + } + } + + /** + * Used to plan streaming aggregation queries that are computed incrementally as part of a + * [[StreamingQuery]]. 
Currently this rule is injected into the planner + * on-demand, only when planning in a [[org.apache.spark.sql.execution.streaming.StreamExecution]] + */ + object StatefulAggregationStrategy extends Strategy { + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case _ if !plan.isStreaming => Nil + + case EventTimeWatermark(columnName, delay, child) => + EventTimeWatermarkExec(columnName, delay, planLater(child)) :: Nil + + case PhysicalAggregation( + namedGroupingExpressions, aggregateExpressions, rewrittenResultExpressions, child) => + + if (aggregateExpressions.exists(PythonUDF.isGroupedAggPandasUDF)) { + throw new AnalysisException( + "Streaming aggregation doesn't support group aggregate pandas UDF") + } + + val stateVersion = conf.getConf(SQLConf.STREAMING_AGGREGATION_STATE_FORMAT_VERSION) + + // Ideally this should be done in `NormalizeFloatingNumbers`, but we do it here because + // `groupingExpressions` is not extracted during logical phase. + val normalizedGroupingExpressions = namedGroupingExpressions.map { e => + NormalizeFloatingNumbers.normalize(e) match { + case n: NamedExpression => n + case other => Alias(other, e.name)(exprId = e.exprId) + } + } + + AggUtils.planStreamingAggregation( + normalizedGroupingExpressions, + aggregateExpressions.map(expr => expr.asInstanceOf[AggregateExpression]), + rewrittenResultExpressions, + stateVersion, + planLater(child)) + + case _ => Nil + } + } + + /** + * Used to plan the streaming deduplicate operator. + */ + object StreamingDeduplicationStrategy extends Strategy { + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case Deduplicate(keys, child) if child.isStreaming => + StreamingDeduplicateExec(keys, planLater(child)) :: Nil + + case _ => Nil + } + } + + /** + * Used to plan the streaming global limit operator for streams in append mode. + * We need to check for either a direct Limit or a Limit wrapped in a ReturnAnswer operator, + * following the example of the SpecialLimits Strategy above. + */ + case class StreamingGlobalLimitStrategy(outputMode: OutputMode) extends Strategy { + + private def generatesStreamingAppends(plan: LogicalPlan): Boolean = { + + /** Ensures that this plan does not have a streaming aggregate in it. */ + def hasNoStreamingAgg: Boolean = { + plan.collectFirst { case a: Aggregate if a.isStreaming => a }.isEmpty + } + + // The following cases of limits on a streaming plan has to be executed with a stateful + // streaming plan. + // 1. When the query is in append mode (that is, all logical plan operate on appended data). + // 2. When the plan does not contain any streaming aggregate (that is, plan has only + // operators that operate on appended data). This must be executed with a stateful + // streaming plan even if the query is in complete mode because of a later streaming + // aggregation (e.g., `streamingDf.limit(5).groupBy().count()`). 
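// A minimal streaming-aggregation sketch showing where StatefulAggregationStrategy applies,
// assuming a local SparkSession and the built-in rate source; names, rates and timeouts are
// illustrative. The resulting physical plan wraps the aggregation in state store operators.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object StreamingAggregationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("streaming-agg-sketch").getOrCreate()

    val counts = spark.readStream
      .format("rate").option("rowsPerSecond", "5").load()
      .groupBy("value").count()

    // Complete output mode: every trigger re-emits the full aggregated state.
    val query = counts.writeStream
      .outputMode("complete")
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds"))
      .start()

    query.awaitTermination(15000) // run briefly for the sketch, then shut down
    query.stop()
    spark.stop()
  }
}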
+ plan.isStreaming && ( + outputMode == InternalOutputModes.Append || + outputMode == InternalOutputModes.Complete && hasNoStreamingAgg) + } + + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ReturnAnswer(Limit(IntegerLiteral(limit), child)) if generatesStreamingAppends(child) => + StreamingGlobalLimitExec(limit, StreamingLocalLimitExec(limit, planLater(child))) :: Nil + + case Limit(IntegerLiteral(limit), child) if generatesStreamingAppends(child) => + StreamingGlobalLimitExec(limit, StreamingLocalLimitExec(limit, planLater(child))) :: Nil + + case _ => Nil + } + } + + object StreamingJoinStrategy extends Strategy { + override def apply(plan: LogicalPlan): Seq[SparkPlan] = { + plan match { + case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right, _) + if left.isStreaming && right.isStreaming => + + val stateVersion = conf.getConf(SQLConf.STREAMING_JOIN_STATE_FORMAT_VERSION) + new StreamingSymmetricHashJoinExec(leftKeys, rightKeys, joinType, condition, + stateVersion, planLater(left), planLater(right)) :: Nil + + case Join(left, right, _, _, _) if left.isStreaming && right.isStreaming => + throw new AnalysisException( + "Stream-stream join without equality predicate is not supported", plan = Some(plan)) + + case _ => Nil + } + } + } + + /** + * Used to plan the aggregate operator for expressions based on the AggregateFunction2 interface. + */ + object Aggregation extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case PhysicalAggregation(groupingExpressions, aggExpressions, resultExpressions, child) + if aggExpressions.forall(expr => expr.isInstanceOf[AggregateExpression]) => + val aggregateExpressions = aggExpressions.map(expr => + expr.asInstanceOf[AggregateExpression]) + + val (functionsWithDistinct, functionsWithoutDistinct) = + aggregateExpressions.partition(_.isDistinct) + if (functionsWithDistinct.map(_.aggregateFunction.children.toSet).distinct.length > 1) { + // This is a sanity check. We should not reach here when we have multiple distinct + // column sets. Our `RewriteDistinctAggregates` should take care this case. + sys.error("You hit a query analyzer bug. Please report your query to " + + "Spark user mailing list.") + } + + // Ideally this should be done in `NormalizeFloatingNumbers`, but we do it here because + // `groupingExpressions` is not extracted during logical phase. 
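// A small sketch of how the batch Aggregation strategy above splits work between
// AggUtils.planAggregateWithoutDistinct and planAggregateWithOneDistinct, assuming a local
// SparkSession; data and names are illustrative.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{countDistinct, sum}

object AggregationPlanningSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("agg-planning-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", 1), ("a", 2), ("b", 2)).toDF("k", "v")

    // No distinct aggregate: a partial/final pair of hash aggregates.
    df.groupBy($"k").agg(sum($"v")).explain()

    // One distinct column set: planAggregateWithOneDistinct adds extra aggregate stages
    // around the distinct column.
    df.groupBy($"k").agg(countDistinct($"v"), sum($"v")).explain()

    spark.stop()
  }
}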
+ val normalizedGroupingExpressions = groupingExpressions.map { e => + NormalizeFloatingNumbers.normalize(e) match { + case n: NamedExpression => n + case other => Alias(other, e.name)(exprId = e.exprId) + } + } + + val aggregateOperator = + if (functionsWithDistinct.isEmpty) { + AggUtils.planAggregateWithoutDistinct( + normalizedGroupingExpressions, + aggregateExpressions, + resultExpressions, + planLater(child)) + } else { + AggUtils.planAggregateWithOneDistinct( + normalizedGroupingExpressions, + functionsWithDistinct, + functionsWithoutDistinct, + resultExpressions, + planLater(child)) + } + + aggregateOperator + + case PhysicalAggregation(groupingExpressions, aggExpressions, resultExpressions, child) + if aggExpressions.forall(expr => expr.isInstanceOf[PythonUDF]) => + val udfExpressions = aggExpressions.map(expr => expr.asInstanceOf[PythonUDF]) + + Seq(execution.python.AggregateInPandasExec( + groupingExpressions, + udfExpressions, + resultExpressions, + planLater(child))) + + case PhysicalAggregation(_, _, _, _) => + // If cannot match the two cases above, then it's an error + throw new AnalysisException( + "Cannot use a mixture of aggregate function and group aggregate pandas UDF") + + case _ => Nil + } + } + + object Window extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case PhysicalWindow( + WindowFunctionType.SQL, windowExprs, partitionSpec, orderSpec, child) => + execution.window.WindowExec( + windowExprs, partitionSpec, orderSpec, planLater(child)) :: Nil + + case PhysicalWindow( + WindowFunctionType.Python, windowExprs, partitionSpec, orderSpec, child) => + execution.python.WindowInPandasExec( + windowExprs, partitionSpec, orderSpec, planLater(child)) :: Nil + + case _ => Nil + } + } + + protected lazy val singleRowRdd = session.sparkContext.parallelize(Seq(InternalRow()), 1) + + object InMemoryScans extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case PhysicalOperation(projectList, filters, mem: InMemoryRelation) => + val condition = filters.reduceLeftOption(And) + val selectivity = if (condition.nonEmpty) { + FilterEstimation(Filter(condition.get, mem)).calculateFilterSelectivity(condition.get) + } else { + None + } + pruneFilterProject( + projectList, + filters, + identity[Seq[Expression]], // All filters still need to be evaluated. + InMemoryTableScanExec(_, filters, mem), + selectivity) :: Nil + case _ => Nil + } + } + + /** + * This strategy is just for explaining `Dataset/DataFrame` created by `spark.readStream`. + * It won't affect the execution, because `StreamingRelation` will be replaced with + * `StreamingExecutionRelation` in `StreamingQueryManager` and `StreamingExecutionRelation` will + * be replaced with the real relation using the `Source` in `StreamExecution`. + */ + object StreamingRelationStrategy extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case s: StreamingRelation => + StreamingRelationExec(s.sourceName, s.output) :: Nil + case s: StreamingExecutionRelation => + StreamingRelationExec(s.toString, s.output) :: Nil + case s: StreamingRelationV2 => + StreamingRelationExec(s.sourceName, s.output) :: Nil + case _ => Nil + } + } + + /** + * Strategy to convert [[FlatMapGroupsWithState]] logical operator to physical operator + * in streaming plans. Conversion for batch plans is handled by [[BasicOperators]]. 
+ */ + object FlatMapGroupsWithStateStrategy extends Strategy { + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case FlatMapGroupsWithState( + func, keyDeser, valueDeser, groupAttr, dataAttr, outputAttr, stateEnc, outputMode, _, + timeout, child) => + val stateVersion = conf.getConf(SQLConf.FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION) + val execPlan = FlatMapGroupsWithStateExec( + func, keyDeser, valueDeser, groupAttr, dataAttr, outputAttr, None, stateEnc, stateVersion, + outputMode, timeout, batchTimestampMs = None, eventTimeWatermark = None, planLater(child)) + execPlan :: Nil + case _ => + Nil + } + } + + /** + * Strategy to convert EvalPython logical operator to physical operator. + */ + object PythonEvals extends Strategy { + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ArrowEvalPython(udfs, output, child, evalType) => + ArrowEvalPythonExec(udfs, output, planLater(child), evalType) :: Nil + case BatchEvalPython(udfs, output, child) => + BatchEvalPythonExec(udfs, output, planLater(child)) :: Nil + case _ => + Nil + } + } + + object BasicOperators extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case d: DataWritingCommand => DataWritingCommandExec(d, planLater(d.query)) :: Nil + case r: RunnableCommand => ExecutedCommandExec(r) :: Nil + + case MemoryPlan(sink, output) => + val encoder = RowEncoder(StructType.fromAttributes(output)) + val toRow = encoder.createSerializer() + LocalTableScanExec(output, sink.allData.map(r => toRow(r).copy())) :: Nil + + case logical.Distinct(child) => + throw new IllegalStateException( + "logical distinct operator should have been replaced by aggregate in the optimizer") + case logical.Intersect(left, right, false) => + throw new IllegalStateException( + "logical intersect operator should have been replaced by semi-join in the optimizer") + case logical.Intersect(left, right, true) => + throw new IllegalStateException( + "logical intersect operator should have been replaced by union, aggregate" + + " and generate operators in the optimizer") + case logical.Except(left, right, false) => + throw new IllegalStateException( + "logical except operator should have been replaced by anti-join in the optimizer") + case logical.Except(left, right, true) => + throw new IllegalStateException( + "logical except (all) operator should have been replaced by union, aggregate" + + " and generate operators in the optimizer") + case logical.ResolvedHint(child, hints) => + throw new IllegalStateException( + "ResolvedHint operator should have been replaced by join hint in the optimizer") + + case logical.DeserializeToObject(deserializer, objAttr, child) => + execution.DeserializeToObjectExec(deserializer, objAttr, planLater(child)) :: Nil + case logical.SerializeFromObject(serializer, child) => + execution.SerializeFromObjectExec(serializer, planLater(child)) :: Nil + case logical.MapPartitions(f, objAttr, child) => + execution.MapPartitionsExec(f, objAttr, planLater(child)) :: Nil + case logical.MapPartitionsInR(f, p, b, is, os, objAttr, child) => + execution.MapPartitionsExec( + execution.r.MapPartitionsRWrapper(f, p, b, is, os), objAttr, planLater(child)) :: Nil + case logical.FlatMapGroupsInR(f, p, b, is, os, key, value, grouping, data, objAttr, child) => + execution.FlatMapGroupsInRExec(f, p, b, is, os, key, value, grouping, + data, objAttr, planLater(child)) :: Nil + case logical.FlatMapGroupsInRWithArrow(f, p, b, is, ot, key, grouping, child) => + 
execution.FlatMapGroupsInRWithArrowExec( + f, p, b, is, ot, key, grouping, planLater(child)) :: Nil + case logical.MapPartitionsInRWithArrow(f, p, b, is, ot, child) => + execution.MapPartitionsInRWithArrowExec( + f, p, b, is, ot, planLater(child)) :: Nil + case logical.FlatMapGroupsInPandas(grouping, func, output, child) => + execution.python.FlatMapGroupsInPandasExec(grouping, func, output, planLater(child)) :: Nil + case logical.FlatMapCoGroupsInPandas(leftGroup, rightGroup, func, output, left, right) => + execution.python.FlatMapCoGroupsInPandasExec( + leftGroup, rightGroup, func, output, planLater(left), planLater(right)) :: Nil + case logical.MapInPandas(func, output, child) => + execution.python.MapInPandasExec(func, output, planLater(child)) :: Nil + case logical.MapElements(f, _, _, objAttr, child) => + execution.MapElementsExec(f, objAttr, planLater(child)) :: Nil + case logical.AppendColumns(f, _, _, in, out, child) => + execution.AppendColumnsExec(f, in, out, planLater(child)) :: Nil + case logical.AppendColumnsWithObject(f, childSer, newSer, child) => + execution.AppendColumnsWithObjectExec(f, childSer, newSer, planLater(child)) :: Nil + case logical.MapGroups(f, key, value, grouping, data, objAttr, child) => + execution.MapGroupsExec(f, key, value, grouping, data, objAttr, planLater(child)) :: Nil + case logical.FlatMapGroupsWithState( + f, key, value, grouping, data, output, _, _, _, timeout, child) => + execution.MapGroupsExec( + f, key, value, grouping, data, output, timeout, planLater(child)) :: Nil + case logical.CoGroup(f, key, lObj, rObj, lGroup, rGroup, lAttr, rAttr, oAttr, left, right) => + execution.CoGroupExec( + f, key, lObj, rObj, lGroup, rGroup, lAttr, rAttr, oAttr, + planLater(left), planLater(right)) :: Nil + + case logical.Repartition(numPartitions, shuffle, child) => + if (shuffle) { + ShuffleExchangeExec(RoundRobinPartitioning(numPartitions), + planLater(child), canChangeNumPartitions = false) :: Nil + } else { + execution.CoalesceExec(numPartitions, planLater(child)) :: Nil + } + case logical.Sort(sortExprs, global, child) => + execution.SortExec(sortExprs, global, planLater(child)) :: Nil + case logical.Project(projectList, child) => + execution.ProjectExec(projectList, planLater(child)) :: Nil + case l @ logical.Filter(condition, child) => + val selectivity = FilterEstimation(l).calculateFilterSelectivity(l.condition) + execution.FilterExec(condition, planLater(child), selectivity) :: Nil + case f: logical.TypedFilter => + val condition = f.typedCondition(f.deserializer) + val filter = Filter(condition, f.child) + val selectivity = FilterEstimation(filter).calculateFilterSelectivity(condition) + execution.FilterExec(condition, planLater(f.child), selectivity) :: Nil + case e @ logical.Expand(_, _, child) => + execution.ExpandExec(e.projections, e.output, planLater(child)) :: Nil + case logical.Sample(lb, ub, withReplacement, seed, child) => + execution.SampleExec(lb, ub, withReplacement, seed, planLater(child)) :: Nil + case logical.LocalRelation(output, data, _) => + LocalTableScanExec(output, data) :: Nil + case logical.LocalLimit(IntegerLiteral(limit), child) => + execution.LocalLimitExec(limit, planLater(child)) :: Nil + case logical.GlobalLimit(IntegerLiteral(limit), child) => + execution.GlobalLimitExec(limit, planLater(child)) :: Nil + case logical.Union(unionChildren) => + execution.UnionExec(unionChildren.map(planLater)) :: Nil + case g @ logical.Generate(generator, _, outer, _, _, child) => + execution.GenerateExec( + generator, 
g.requiredChildOutput, outer, + g.qualifiedGeneratorOutput, planLater(child)) :: Nil + case _: logical.OneRowRelation => + execution.RDDScanExec(Nil, singleRowRdd, "OneRowRelation") :: Nil + case r: logical.Range => + execution.RangeExec(r) :: Nil + case r: logical.RepartitionByExpression => + exchange.ShuffleExchangeExec( + r.partitioning, planLater(r.child), canChangeNumPartitions = false) :: Nil + case ExternalRDD(outputObjAttr, rdd) => ExternalRDDScanExec(outputObjAttr, rdd) :: Nil + case r: LogicalRDD => + RDDScanExec(r.output, r.rdd, "ExistingRDD", r.outputPartitioning, r.outputOrdering) :: Nil + case _: UpdateTable => + throw new UnsupportedOperationException(s"UPDATE TABLE is not supported temporarily.") + case _: MergeIntoTable => + throw new UnsupportedOperationException(s"MERGE INTO TABLE is not supported temporarily.") + case logical.CollectMetrics(name, metrics, child) => + execution.CollectMetricsExec(name, metrics, planLater(child)) :: Nil + case _ => Nil + } + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala new file mode 100644 index 00000000..82ede37d --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -0,0 +1,816 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.util.concurrent.{Future => JFuture} +import java.util.concurrent.TimeUnit._ + +import scala.collection.mutable +import scala.concurrent.{ExecutionContext} +import scala.concurrent.duration.Duration + +import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} +import org.apache.spark.rdd.{EmptyRDD, PartitionwiseSampledRDD, RDD} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences +import org.apache.spark.sql.catalyst.expressions.codegen._ +import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.sql.types.{LongType, StructType} +import org.apache.spark.util.{ThreadUtils, Utils} +import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} + +/** Physical plan for Project. 
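+ *
+ * The projection is evaluated against the child's output, either through whole-stage code
+ * generation or, in `doExecute`, by applying an `UnsafeProjection` to every input row.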
*/ +case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) + extends UnaryExecNode with CodegenSupport with AliasAwareOutputPartitioning { + + override def output: Seq[Attribute] = projectList.map(_.toAttribute) + + override def inputRDDs(): Seq[RDD[InternalRow]] = { + child.asInstanceOf[CodegenSupport].inputRDDs() + } + + protected override def doProduce(ctx: CodegenContext): String = { + child.asInstanceOf[CodegenSupport].produce(ctx, this) + } + + override def usedInputs: AttributeSet = { + // only the attributes those are used at least twice should be evaluated before this plan, + // otherwise we could defer the evaluation until output attribute is actually used. + val usedExprIds = projectList.flatMap(_.collect { + case a: Attribute => a.exprId + }) + val usedMoreThanOnce = usedExprIds.groupBy(id => id).filter(_._2.size > 1).keySet + references.filter(a => usedMoreThanOnce.contains(a.exprId)) + } + + override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { + val exprs = bindReferences[Expression](projectList, child.output) + val resultVars = exprs.map(_.genCode(ctx)) + // Evaluation of non-deterministic expressions can't be deferred. + val nonDeterministicAttrs = projectList.filterNot(_.deterministic).map(_.toAttribute) + s""" + |${evaluateRequiredVariables(output, resultVars, AttributeSet(nonDeterministicAttrs))} + |${consume(ctx, resultVars)} + """.stripMargin + } + + protected override def doExecute(): RDD[InternalRow] = { + child.execute().mapPartitionsWithIndexInternal { (index, iter) => + val project = UnsafeProjection.create(projectList, child.output) + project.initialize(index) + iter.map(project) + } + } + + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override protected def outputExpressions: Seq[NamedExpression] = projectList + + override def verboseStringWithOperatorId(): String = { + s""" + |$formattedNodeName + |${ExplainUtils.generateFieldString("Output", projectList)} + |${ExplainUtils.generateFieldString("Input", child.output)} + |""".stripMargin + } +} + +/** Physical plan for Filter. */ +case class FilterExec(condition: Expression, child: SparkPlan, selectivity: Option[Double] = None) + extends UnaryExecNode with CodegenSupport with PredicateHelper { + + // Split out all the IsNotNulls from condition. + private val (notNullPreds, otherPreds) = splitConjunctivePredicates(condition).partition { + case IsNotNull(a) => isNullIntolerant(a) && a.references.subsetOf(child.outputSet) + case _ => false + } + + // If one expression and its children are null intolerant, it is null intolerant. + private def isNullIntolerant(expr: Expression): Boolean = expr match { + case e: NullIntolerant => e.children.forall(isNullIntolerant) + case _ => false + } + + // The columns that will filtered out by `IsNotNull` could be considered as not nullable. + private val notNullAttributes = notNullPreds.flatMap(_.references).distinct.map(_.exprId) + + // Mark this as empty. We'll evaluate the input during doConsume(). We don't want to evaluate + // all the variables at the beginning to take advantage of short circuiting. 
+ override def usedInputs: AttributeSet = AttributeSet.empty + + override def output: Seq[Attribute] = { + child.output.map { a => + if (a.nullable && notNullAttributes.contains(a.exprId)) { + a.withNullability(false) + } else { + a + } + } + } + + override lazy val metrics = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + + override def inputRDDs(): Seq[RDD[InternalRow]] = { + child.asInstanceOf[CodegenSupport].inputRDDs() + } + + protected override def doProduce(ctx: CodegenContext): String = { + child.asInstanceOf[CodegenSupport].produce(ctx, this) + } + + override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { + val numOutput = metricTerm(ctx, "numOutputRows") + + /** + * Generates code for `c`, using `in` for input attributes and `attrs` for nullability. + */ + def genPredicate(c: Expression, in: Seq[ExprCode], attrs: Seq[Attribute]): String = { + val bound = BindReferences.bindReference(c, attrs) + val evaluated = evaluateRequiredVariables(child.output, in, c.references) + + // Generate the code for the predicate. + val ev = ExpressionCanonicalizer.execute(bound).genCode(ctx) + val nullCheck = if (bound.nullable) { + s"${ev.isNull} || " + } else { + s"" + } + + s""" + |$evaluated + |${ev.code} + |if (${nullCheck}!${ev.value}) continue; + """.stripMargin + } + + // To generate the predicates we will follow this algorithm. + // For each predicate that is not IsNotNull, we will generate them one by one loading attributes + // as necessary. For each of both attributes, if there is an IsNotNull predicate we will + // generate that check *before* the predicate. After all of these predicates, we will generate + // the remaining IsNotNull checks that were not part of other predicates. + // This has the property of not doing redundant IsNotNull checks and taking better advantage of + // short-circuiting, not loading attributes until they are needed. + // This is very perf sensitive. + // TODO: revisit this. We can consider reordering predicates as well. + val generatedIsNotNullChecks = new Array[Boolean](notNullPreds.length) + val extraIsNotNullAttrs = mutable.Set[Attribute]() + val generated = otherPreds.map { c => + val nullChecks = c.references.map { r => + val idx = notNullPreds.indexWhere { n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)} + if (idx != -1 && !generatedIsNotNullChecks(idx)) { + generatedIsNotNullChecks(idx) = true + // Use the child's output. The nullability is what the child produced. + genPredicate(notNullPreds(idx), input, child.output) + } else if (notNullAttributes.contains(r.exprId) && !extraIsNotNullAttrs.contains(r)) { + extraIsNotNullAttrs += r + genPredicate(IsNotNull(r), input, child.output) + } else { + "" + } + }.mkString("\n").trim + + // Here we use *this* operator's output with this output's nullability since we already + // enforced them with the IsNotNull checks above. + s""" + |$nullChecks + |${genPredicate(c, input, output)} + """.stripMargin.trim + }.mkString("\n") + + val nullChecks = notNullPreds.zipWithIndex.map { case (c, idx) => + if (!generatedIsNotNullChecks(idx)) { + genPredicate(c, input, child.output) + } else { + "" + } + }.mkString("\n") + + // Reset the isNull to false for the not-null columns, then the followed operators could + // generate better code (remove dead branches). 
+ val resultVars = input.zipWithIndex.map { case (ev, i) => + if (notNullAttributes.contains(child.output(i).exprId)) { + ev.isNull = FalseLiteral + } + ev + } + + // Note: wrap in "do { } while(false);", so the generated checks can jump out with "continue;" + s""" + |do { + | $generated + | $nullChecks + | $numOutput.add(1); + | ${consume(ctx, resultVars)} + |} while(false); + """.stripMargin + } + + protected override def doExecute(): RDD[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + child.execute().mapPartitionsWithIndexInternal { (index, iter) => + val predicate = Predicate.create(condition, child.output) + predicate.initialize(0) + iter.filter { row => + val r = predicate.eval(row) + if (r) numOutputRows += 1 + r + } + } + } + + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override def verboseStringWithOperatorId(): String = { + s""" + |$formattedNodeName + |${ExplainUtils.generateFieldString("Input", child.output)} + |Condition : ${condition} + |""".stripMargin + } +} + +/** + * Physical plan for sampling the dataset. + * + * @param lowerBound Lower-bound of the sampling probability (usually 0.0) + * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled + * will be ub - lb. + * @param withReplacement Whether to sample with replacement. + * @param seed the random seed + * @param child the SparkPlan + */ +case class SampleExec( + lowerBound: Double, + upperBound: Double, + withReplacement: Boolean, + seed: Long, + child: SparkPlan) extends UnaryExecNode with CodegenSupport { + override def output: Seq[Attribute] = child.output + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override lazy val metrics = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + + protected override def doExecute(): RDD[InternalRow] = { + if (withReplacement) { + // Disable gap sampling since the gap sampling method buffers two rows internally, + // requiring us to copy the row, which is more expensive than the random number generator. + new PartitionwiseSampledRDD[InternalRow, InternalRow]( + child.execute(), + new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false), + preservesPartitioning = true, + seed) + } else { + child.execute().randomSampleWithRange(lowerBound, upperBound, seed) + } + } + + // Mark this as empty. This plan doesn't need to evaluate any inputs and can defer the evaluation + // to the parent operator. 
+ override def usedInputs: AttributeSet = AttributeSet.empty + + override def inputRDDs(): Seq[RDD[InternalRow]] = { + child.asInstanceOf[CodegenSupport].inputRDDs() + } + + protected override def doProduce(ctx: CodegenContext): String = { + child.asInstanceOf[CodegenSupport].produce(ctx, this) + } + + override def needCopyResult: Boolean = { + child.asInstanceOf[CodegenSupport].needCopyResult || withReplacement + } + + override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { + val numOutput = metricTerm(ctx, "numOutputRows") + + if (withReplacement) { + val samplerClass = classOf[PoissonSampler[UnsafeRow]].getName + val initSampler = ctx.freshName("initSampler") + + // Inline mutable state since not many Sample operations in a task + val sampler = ctx.addMutableState(s"$samplerClass", "sampleReplace", + v => { + val initSamplerFuncName = ctx.addNewFunction(initSampler, + s""" + | private void $initSampler() { + | $v = new $samplerClass($upperBound - $lowerBound, false); + | java.util.Random random = new java.util.Random(${seed}L); + | long randomSeed = random.nextLong(); + | int loopCount = 0; + | while (loopCount < partitionIndex) { + | randomSeed = random.nextLong(); + | loopCount += 1; + | } + | $v.setSeed(randomSeed); + | } + """.stripMargin.trim) + s"$initSamplerFuncName();" + }, forceInline = true) + + val samplingCount = ctx.freshName("samplingCount") + s""" + | int $samplingCount = $sampler.sample(); + | while ($samplingCount-- > 0) { + | $numOutput.add(1); + | ${consume(ctx, input)} + | } + """.stripMargin.trim + } else { + val samplerClass = classOf[BernoulliCellSampler[UnsafeRow]].getName + val sampler = ctx.addMutableState(s"$samplerClass", "sampler", + v => s""" + | $v = new $samplerClass($lowerBound, $upperBound, false); + | $v.setSeed(${seed}L + partitionIndex); + """.stripMargin.trim) + + s""" + | if ($sampler.sample() != 0) { + | $numOutput.add(1); + | ${consume(ctx, input)} + | } + """.stripMargin.trim + } + } +} + + +/** + * Physical plan for range (generating a range of 64 bit numbers). 
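+ *
+ * The range is divided into `numSlices` partitions, and the generated code emits values in
+ * fixed-size batches so that metrics can be updated periodically without a per-row penalty.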
+ */ +case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) + extends LeafExecNode with CodegenSupport { + + val start: Long = range.start + val end: Long = range.end + val step: Long = range.step + val numSlices: Int = range.numSlices.getOrElse(sparkContext.defaultParallelism) + val numElements: BigInt = range.numElements + + override val output: Seq[Attribute] = range.output + + override def outputOrdering: Seq[SortOrder] = range.outputOrdering + + override def outputPartitioning: Partitioning = { + if (numElements > 0) { + if (numSlices == 1) { + SinglePartition + } else { + RangePartitioning(outputOrdering, numSlices) + } + } else { + UnknownPartitioning(0) + } + } + + override lazy val metrics = Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + + override def doCanonicalize(): SparkPlan = { + RangeExec(range.canonicalized.asInstanceOf[org.apache.spark.sql.catalyst.plans.logical.Range]) + } + + override def inputRDDs(): Seq[RDD[InternalRow]] = { + val rdd = if (start == end || (start < end ^ 0 < step)) { + new EmptyRDD[InternalRow](sqlContext.sparkContext) + } else { + sqlContext.sparkContext.parallelize(0 until numSlices, numSlices).map(i => InternalRow(i)) + } + rdd :: Nil + } + + protected override def doProduce(ctx: CodegenContext): String = { + val numOutput = metricTerm(ctx, "numOutputRows") + + val initTerm = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "initRange") + val nextIndex = ctx.addMutableState(CodeGenerator.JAVA_LONG, "nextIndex") + + val value = ctx.freshName("value") + val ev = ExprCode.forNonNullValue(JavaCode.variable(value, LongType)) + val BigInt = classOf[java.math.BigInteger].getName + + // Inline mutable state since not many Range operations in a task + val taskContext = ctx.addMutableState("TaskContext", "taskContext", + v => s"$v = TaskContext.get();", forceInline = true) + val inputMetrics = ctx.addMutableState("InputMetrics", "inputMetrics", + v => s"$v = $taskContext.taskMetrics().inputMetrics();", forceInline = true) + + // In order to periodically update the metrics without inflicting performance penalty, this + // operator produces elements in batches. After a batch is complete, the metrics are updated + // and a new batch is started. + // In the implementation below, the code in the inner loop is producing all the values + // within a batch, while the code in the outer loop is setting batch parameters and updating + // the metrics. + + // Once nextIndex == batchEnd, it's time to progress to the next batch. + val batchEnd = ctx.addMutableState(CodeGenerator.JAVA_LONG, "batchEnd") + + // How many values should still be generated by this range operator. + val numElementsTodo = ctx.addMutableState(CodeGenerator.JAVA_LONG, "numElementsTodo") + + // How many values should be generated in the next batch. 
+ val nextBatchTodo = ctx.freshName("nextBatchTodo") + + // The default size of a batch, which must be positive integer + val batchSize = 1000 + + val initRangeFuncName = ctx.addNewFunction("initRange", + s""" + | private void initRange(int idx) { + | $BigInt index = $BigInt.valueOf(idx); + | $BigInt numSlice = $BigInt.valueOf(${numSlices}L); + | $BigInt numElement = $BigInt.valueOf(${numElements.toLong}L); + | $BigInt step = $BigInt.valueOf(${step}L); + | $BigInt start = $BigInt.valueOf(${start}L); + | long partitionEnd; + | + | $BigInt st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); + | if (st.compareTo($BigInt.valueOf(Long.MAX_VALUE)) > 0) { + | $nextIndex = Long.MAX_VALUE; + | } else if (st.compareTo($BigInt.valueOf(Long.MIN_VALUE)) < 0) { + | $nextIndex = Long.MIN_VALUE; + | } else { + | $nextIndex = st.longValue(); + | } + | $batchEnd = $nextIndex; + | + | $BigInt end = index.add($BigInt.ONE).multiply(numElement).divide(numSlice) + | .multiply(step).add(start); + | if (end.compareTo($BigInt.valueOf(Long.MAX_VALUE)) > 0) { + | partitionEnd = Long.MAX_VALUE; + | } else if (end.compareTo($BigInt.valueOf(Long.MIN_VALUE)) < 0) { + | partitionEnd = Long.MIN_VALUE; + | } else { + | partitionEnd = end.longValue(); + | } + | + | $BigInt startToEnd = $BigInt.valueOf(partitionEnd).subtract( + | $BigInt.valueOf($nextIndex)); + | $numElementsTodo = startToEnd.divide(step).longValue(); + | if ($numElementsTodo < 0) { + | $numElementsTodo = 0; + | } else if (startToEnd.remainder(step).compareTo($BigInt.valueOf(0L)) != 0) { + | $numElementsTodo++; + | } + | } + """.stripMargin) + + val localIdx = ctx.freshName("localIdx") + val localEnd = ctx.freshName("localEnd") + val stopCheck = if (parent.needStopCheck) { + s""" + |if (shouldStop()) { + | $nextIndex = $value + ${step}L; + | $numOutput.add($localIdx + 1); + | $inputMetrics.incRecordsRead($localIdx + 1); + | return; + |} + """.stripMargin + } else { + "// shouldStop check is eliminated" + } + val loopCondition = if (limitNotReachedChecks.isEmpty) { + "true" + } else { + limitNotReachedChecks.mkString(" && ") + } + + // An overview of the Range processing. + // + // For each partition, the Range task needs to produce records from partition start(inclusive) + // to end(exclusive). For better performance, we separate the partition range into batches, and + // use 2 loops to produce data. The outer while loop is used to iterate batches, and the inner + // for loop is used to iterate records inside a batch. + // + // `nextIndex` tracks the index of the next record that is going to be consumed, initialized + // with partition start. `batchEnd` tracks the end index of the current batch, initialized + // with `nextIndex`. In the outer loop, we first check if `nextIndex == batchEnd`. If it's true, + // it means the current batch is fully consumed, and we will update `batchEnd` to process the + // next batch. If `batchEnd` reaches partition end, exit the outer loop. Finally we enter the + // inner loop. Note that, when we enter inner loop, `nextIndex` must be different from + // `batchEnd`, otherwise we already exit the outer loop. + // + // The inner loop iterates from 0 to `localEnd`, which is calculated by + // `(batchEnd - nextIndex) / step`. Since `batchEnd` is increased by `nextBatchTodo * step` in + // the outer loop, and initialized with `nextIndex`, so `batchEnd - nextIndex` is always + // divisible by `step`. 
The `nextIndex` is increased by `step` during each iteration, and ends + // up being equal to `batchEnd` when the inner loop finishes. + // + // The inner loop can be interrupted, if the query has produced at least one result row, so that + // we don't buffer too many result rows and waste memory. It's ok to interrupt the inner loop, + // because `nextIndex` will be updated before interrupting. + + s""" + | // initialize Range + | if (!$initTerm) { + | $initTerm = true; + | $initRangeFuncName(partitionIndex); + | } + | + | while ($loopCondition) { + | if ($nextIndex == $batchEnd) { + | long $nextBatchTodo; + | if ($numElementsTodo > ${batchSize}L) { + | $nextBatchTodo = ${batchSize}L; + | $numElementsTodo -= ${batchSize}L; + | } else { + | $nextBatchTodo = $numElementsTodo; + | $numElementsTodo = 0; + | if ($nextBatchTodo == 0) break; + | } + | $batchEnd += $nextBatchTodo * ${step}L; + | } + | + | int $localEnd = (int)(($batchEnd - $nextIndex) / ${step}L); + | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { + | long $value = ((long)$localIdx * ${step}L) + $nextIndex; + | ${consume(ctx, Seq(ev))} + | $stopCheck + | } + | $nextIndex = $batchEnd; + | $numOutput.add($localEnd); + | $inputMetrics.incRecordsRead($localEnd); + | $taskContext.killTaskIfInterrupted(); + | } + """.stripMargin + } + + protected override def doExecute(): RDD[InternalRow] = { + val numOutputRows = longMetric("numOutputRows") + sqlContext + .sparkContext + .parallelize(0 until numSlices, numSlices) + .mapPartitionsWithIndex { (i, _) => + val partitionStart = (i * numElements) / numSlices * step + start + val partitionEnd = (((i + 1) * numElements) / numSlices) * step + start + def getSafeMargin(bi: BigInt): Long = + if (bi.isValidLong) { + bi.toLong + } else if (bi > 0) { + Long.MaxValue + } else { + Long.MinValue + } + val safePartitionStart = getSafeMargin(partitionStart) + val safePartitionEnd = getSafeMargin(partitionEnd) + val rowSize = UnsafeRow.calculateBitSetWidthInBytes(1) + LongType.defaultSize + val unsafeRow = UnsafeRow.createFromByteArray(rowSize, 1) + val taskContext = TaskContext.get() + + val iter = new Iterator[InternalRow] { + private[this] var number: Long = safePartitionStart + private[this] var overflow: Boolean = false + private[this] val inputMetrics = taskContext.taskMetrics().inputMetrics + + override def hasNext = + if (!overflow) { + if (step > 0) { + number < safePartitionEnd + } else { + number > safePartitionEnd + } + } else false + + override def next() = { + val ret = number + number += step + if (number < ret ^ step < 0) { + // we have Long.MaxValue + Long.MaxValue < Long.MaxValue + // and Long.MinValue + Long.MinValue > Long.MinValue, so iff the step causes a step + // back, we are pretty sure that we have an overflow. + overflow = true + } + + numOutputRows += 1 + inputMetrics.incRecordsRead(1) + unsafeRow.setLong(0, ret) + unsafeRow + } + } + new InterruptibleIterator(taskContext, iter) + } + } + + override def simpleString(maxFields: Int): String = { + s"Range ($start, $end, step=$step, splits=$numSlices)" + } +} + +/** + * Physical plan for unioning two plans, without a distinct. This is UNION ALL in SQL. + * + * If we change how this is implemented physically, we'd need to update + * [[org.apache.spark.sql.catalyst.plans.logical.Union.maxRowsPerPartition]]. 
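+ *
+ * The output attributes are reconciled across all children: an attribute is nullable if it
+ * is nullable in any child, and the data types are merged with `StructType.merge`.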
+ */ +case class UnionExec(children: Seq[SparkPlan]) extends SparkPlan { + // updating nullability to make all the children consistent + override def output: Seq[Attribute] = { + children.map(_.output).transpose.map { attrs => + val firstAttr = attrs.head + val nullable = attrs.exists(_.nullable) + val newDt = attrs.map(_.dataType).reduce(StructType.merge) + if (firstAttr.dataType == newDt) { + firstAttr.withNullability(nullable) + } else { + AttributeReference(firstAttr.name, newDt, nullable, firstAttr.metadata)( + firstAttr.exprId, firstAttr.qualifier) + } + } + } + + protected override def doExecute(): RDD[InternalRow] = + sparkContext.union(children.map(_.execute())) +} + +/** + * Physical plan for returning a new RDD that has exactly `numPartitions` partitions. + * Similar to coalesce defined on an [[RDD]], this operation results in a narrow dependency, e.g. + * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of + * the 100 new partitions will claim 10 of the current partitions. If a larger number of partitions + * is requested, it will stay at the current number of partitions. + * + * However, if you're doing a drastic coalesce, e.g. to numPartitions = 1, + * this may result in your computation taking place on fewer nodes than + * you like (e.g. one node in the case of numPartitions = 1). To avoid this, + * you see ShuffleExchange. This will add a shuffle step, but means the + * current upstream partitions will be executed in parallel (per whatever + * the current partitioning is). + */ +case class CoalesceExec(numPartitions: Int, child: SparkPlan) extends UnaryExecNode { + override def output: Seq[Attribute] = child.output + + override def outputPartitioning: Partitioning = { + if (numPartitions == 1) SinglePartition + else UnknownPartitioning(numPartitions) + } + + protected override def doExecute(): RDD[InternalRow] = { + if (numPartitions == 1 && child.execute().getNumPartitions < 1) { + // Make sure we don't output an RDD with 0 partitions, when claiming that we have a + // `SinglePartition`. + new CoalesceExec.EmptyRDDWithPartitions(sparkContext, numPartitions) + } else { + child.execute().coalesce(numPartitions, shuffle = false) + } + } +} + +object CoalesceExec { + /** A simple RDD with no data, but with the given number of partitions. */ + class EmptyRDDWithPartitions( + @transient private val sc: SparkContext, + numPartitions: Int) extends RDD[InternalRow](sc, Nil) { + + override def getPartitions: Array[Partition] = + Array.tabulate(numPartitions)(i => EmptyPartition(i)) + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + Iterator.empty + } + } + + case class EmptyPartition(index: Int) extends Partition +} + +/** + * Parent class for different types of subquery plans + */ +abstract class BaseSubqueryExec extends SparkPlan { + def name: String + def child: SparkPlan + + override def output: Seq[Attribute] = child.output + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def generateTreeString( + depth: Int, + lastChildren: Seq[Boolean], + append: String => Unit, + verbose: Boolean, + prefix: String = "", + addSuffix: Boolean = false, + maxFields: Int, + printNodeId: Boolean): Unit = { + /** + * In the new explain mode `EXPLAIN FORMATTED`, the subqueries are not shown in the + * main plan and are printed separately along with correlation information with + * its parent plan. 
The condition below makes sure that subquery plans are + * excluded from the main plan. + */ + if (!printNodeId) { + super.generateTreeString( + depth, + lastChildren, + append, + verbose, + "", + false, + maxFields, + printNodeId) + } + } +} + +/** + * Physical plan for a subquery. + */ +case class SubqueryExec(name: String, child: SparkPlan) + extends BaseSubqueryExec with UnaryExecNode { + + override lazy val metrics = Map( + "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), + "collectTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to collect")) + + @transient + private lazy val relationFuture: JFuture[Array[InternalRow]] = { + // relationFuture is used in "doExecute". Therefore we can get the execution id correctly here. + val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLExecution.withThreadLocalCaptured[Array[InternalRow]]( + sqlContext.sparkSession, + SubqueryExec.executionContext) { + // This will run in another thread. Set the execution id so that we can connect these jobs + // with the correct execution. + SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) { + val beforeCollect = System.nanoTime() + // Note that we use .executeCollect() because we don't want to convert data to Scala types + val rows: Array[InternalRow] = child.executeCollect() + val beforeBuild = System.nanoTime() + longMetric("collectTime") += NANOSECONDS.toMillis(beforeBuild - beforeCollect) + val dataSize = rows.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum + longMetric("dataSize") += dataSize + + SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq) + rows + } + } + } + + protected override def doCanonicalize(): SparkPlan = { + SubqueryExec("Subquery", child.canonicalized) + } + + protected override def doPrepare(): Unit = { + relationFuture + } + + protected override def doExecute(): RDD[InternalRow] = { + child.execute() + } + + override def executeCollect(): Array[InternalRow] = { + ThreadUtils.awaitResult(relationFuture, Duration.Inf) + } + + override def stringArgs: Iterator[Any] = super.stringArgs ++ Iterator(s"[id=#$id]") +} + +object SubqueryExec { + private[execution] val executionContext = ExecutionContext.fromExecutorService( + ThreadUtils.newDaemonCachedThreadPool("subquery", + SQLConf.get.getConf(StaticSQLConf.SUBQUERY_MAX_THREAD_THRESHOLD))) +} + +/** + * A wrapper for reused [[BaseSubqueryExec]]. 
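+ *
+ * Preparation, execution and collection are all delegated to the wrapped subquery plan, so
+ * an identical subquery is evaluated only once and its result is reused.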
+ */ +case class ReusedSubqueryExec(child: BaseSubqueryExec) + extends BaseSubqueryExec with LeafExecNode { + + override def name: String = child.name + + override def output: Seq[Attribute] = child.output + override def doCanonicalize(): SparkPlan = child.canonicalized + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + override def outputPartitioning: Partitioning = child.outputPartitioning + + protected override def doPrepare(): Unit = child.prepare() + + protected override def doExecute(): RDD[InternalRow] = child.execute() + + override def executeCollect(): Array[InternalRow] = child.executeCollect() +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala new file mode 100644 index 00000000..d1b4a195 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -0,0 +1,715 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.util.Locale + +import scala.collection.mutable + +import org.apache.hadoop.fs.Path + +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, QualifiedTableName} +import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala +import org.apache.spark.sql.catalyst.analysis._ +import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.planning.ScanOperation +import org.apache.spark.sql.catalyst.plans.logical.{Filter => LFilter, InsertIntoDir, InsertIntoStatement, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.FilterEstimation +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} +import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +/** + * Replaces generic operations with specific variants that are designed to work with Spark + * SQL Data Sources. + * + * Note that, this rule must be run after `PreprocessTableCreation` and + * `PreprocessTableInsertion`. 
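+ *
+ * For example, `CreateTable` and `InsertIntoStatement` nodes over data source tables are
+ * rewritten into runnable commands such as `CreateDataSourceTableCommand` and
+ * `InsertIntoHadoopFsRelationCommand`.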
+ */ +case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with CastSupport { + + def resolver: Resolver = conf.resolver + + // Visible for testing. + def convertStaticPartitions( + sourceAttributes: Seq[Attribute], + providedPartitions: Map[String, Option[String]], + targetAttributes: Seq[Attribute], + targetPartitionSchema: StructType): Seq[NamedExpression] = { + + assert(providedPartitions.exists(_._2.isDefined)) + + val staticPartitions = providedPartitions.flatMap { + case (partKey, Some(partValue)) => (partKey, partValue) :: Nil + case (_, None) => Nil + } + + // The sum of the number of static partition columns and columns provided in the SELECT + // clause needs to match the number of columns of the target table. + if (staticPartitions.size + sourceAttributes.size != targetAttributes.size) { + throw new AnalysisException( + s"The data to be inserted needs to have the same number of " + + s"columns as the target table: target table has ${targetAttributes.size} " + + s"column(s) but the inserted data has ${sourceAttributes.size + staticPartitions.size} " + + s"column(s), which contain ${staticPartitions.size} partition column(s) having " + + s"assigned constant values.") + } + + if (providedPartitions.size != targetPartitionSchema.fields.size) { + throw new AnalysisException( + s"The data to be inserted needs to have the same number of " + + s"partition columns as the target table: target table " + + s"has ${targetPartitionSchema.fields.size} partition column(s) but the inserted " + + s"data has ${providedPartitions.size} partition columns specified.") + } + + staticPartitions.foreach { + case (partKey, partValue) => + if (!targetPartitionSchema.fields.exists(field => resolver(field.name, partKey))) { + throw new AnalysisException( + s"$partKey is not a partition column. Partition columns are " + + s"${targetPartitionSchema.fields.map(_.name).mkString("[", ",", "]")}") + } + } + + val partitionList = targetPartitionSchema.fields.map { field => + val potentialSpecs = staticPartitions.filter { + case (partKey, partValue) => resolver(field.name, partKey) + } + if (potentialSpecs.isEmpty) { + None + } else if (potentialSpecs.size == 1) { + val partValue = potentialSpecs.head._2 + conf.storeAssignmentPolicy match { + // SPARK-30844: try our best to follow StoreAssignmentPolicy for static partition + // values but not completely follow because we can't do static type checking due to + // the reason that the parser has erased the type info of static partition values + // and converted them to string. + case StoreAssignmentPolicy.ANSI | StoreAssignmentPolicy.STRICT => + Some(Alias(AnsiCast(Literal(partValue), field.dataType, + Option(conf.sessionLocalTimeZone)), field.name)()) + case _ => + Some(Alias(cast(Literal(partValue), field.dataType), field.name)()) + } + } else { + throw new AnalysisException( + s"Partition column ${field.name} have multiple values specified, " + + s"${potentialSpecs.mkString("[", ", ", "]")}. Please only specify a single value.") + } + } + + // We first drop all leading static partitions using dropWhile and check if there is + // any static partition appear after dynamic partitions. + partitionList.dropWhile(_.isDefined).collectFirst { + case Some(_) => + throw new AnalysisException( + s"The ordering of partition columns is " + + s"${targetPartitionSchema.fields.map(_.name).mkString("[", ",", "]")}. 
" + + "All partition columns having constant values need to appear before other " + + "partition columns that do not have an assigned constant value.") + } + + assert(partitionList.take(staticPartitions.size).forall(_.isDefined)) + val projectList = + sourceAttributes.take(targetAttributes.size - targetPartitionSchema.fields.size) ++ + partitionList.take(staticPartitions.size).map(_.get) ++ + sourceAttributes.takeRight(targetPartitionSchema.fields.size - staticPartitions.size) + + projectList + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case CreateTable(tableDesc, mode, None) if DDLUtils.isDatasourceTable(tableDesc) => + CreateDataSourceTableCommand(tableDesc, ignoreIfExists = mode == SaveMode.Ignore) + + case CreateTable(tableDesc, mode, Some(query)) + if query.resolved && DDLUtils.isDatasourceTable(tableDesc) => + CreateDataSourceTableAsSelectCommand(tableDesc, mode, query, query.output.map(_.name)) + + case InsertIntoStatement(l @ LogicalRelation(_: InsertableRelation, _, _, _), + parts, query, overwrite, false) if parts.isEmpty => + InsertIntoDataSourceCommand(l, query, overwrite) + + case InsertIntoDir(_, storage, provider, query, overwrite) + if provider.isDefined && provider.get.toLowerCase(Locale.ROOT) != DDLUtils.HIVE_PROVIDER => + + val outputPath = new Path(storage.locationUri.get) + if (overwrite) DDLUtils.verifyNotReadPath(query, outputPath) + + InsertIntoDataSourceDirCommand(storage, provider.get, query, overwrite) + + case i @ InsertIntoStatement( + l @ LogicalRelation(t: HadoopFsRelation, _, table, _), parts, query, overwrite, _) => + // If the InsertIntoTable command is for a partitioned HadoopFsRelation and + // the user has specified static partitions, we add a Project operator on top of the query + // to include those constant column values in the query result. + // + // Example: + // Let's say that we have a table "t", which is created by + // CREATE TABLE t (a INT, b INT, c INT) USING parquet PARTITIONED BY (b, c) + // The statement of "INSERT INTO TABLE t PARTITION (b=2, c) SELECT 1, 3" + // will be converted to "INSERT INTO TABLE t PARTITION (b, c) SELECT 1, 2, 3". + // + // Basically, we will put those partition columns having a assigned value back + // to the SELECT clause. The output of the SELECT clause is organized as + // normal_columns static_partitioning_columns dynamic_partitioning_columns. + // static_partitioning_columns are partitioning columns having assigned + // values in the PARTITION clause (e.g. b in the above example). + // dynamic_partitioning_columns are partitioning columns that do not assigned + // values in the PARTITION clause (e.g. c in the above example). 
+ val actualQuery = if (parts.exists(_._2.isDefined)) { + val projectList = convertStaticPartitions( + sourceAttributes = query.output, + providedPartitions = parts, + targetAttributes = l.output, + targetPartitionSchema = t.partitionSchema) + Project(projectList, query) + } else { + query + } + + // Sanity check + if (t.location.rootPaths.size != 1) { + throw new AnalysisException("Can only write data to relations with a single path.") + } + + val outputPath = t.location.rootPaths.head + val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append + + val partitionSchema = actualQuery.resolve( + t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver) + val staticPartitions = parts.filter(_._2.nonEmpty).map { case (k, v) => k -> v.get } + + val insertCommand = InsertIntoHadoopFsRelationCommand( + outputPath, + staticPartitions, + i.ifPartitionNotExists, + partitionSchema, + t.bucketSpec, + t.fileFormat, + t.options, + actualQuery, + mode, + table, + Some(t.location), + actualQuery.output.map(_.name)) + + // For dynamic partition overwrite, we do not delete partition directories ahead. + // We write to staging directories and move to final partition directories after writing + // job is done. So it is ok to have outputPath try to overwrite inputpath. + if (overwrite && !insertCommand.dynamicPartitionOverwrite) { + DDLUtils.verifyNotReadPath(actualQuery, outputPath) + } + insertCommand + } +} + + +/** + * Replaces [[UnresolvedCatalogRelation]] with concrete relation logical plans. + * + * TODO: we should remove the special handling for hive tables after completely making hive as a + * data source. + */ +class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan] { + private def readDataSourceTable(table: CatalogTable): LogicalPlan = { + val qualifiedTableName = QualifiedTableName(table.database, table.identifier.table) + val catalog = sparkSession.sessionState.catalog + catalog.getCachedPlan(qualifiedTableName, () => { + val pathOption = table.storage.locationUri.map("path" -> CatalogUtils.URIToString(_)) + val dataSource = + DataSource( + sparkSession, + // In older version(prior to 2.1) of Spark, the table schema can be empty and should be + // inferred at runtime. We should still support it. + userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema), + partitionColumns = table.partitionColumnNames, + bucketSpec = table.bucketSpec, + className = table.provider.get, + options = table.storage.properties ++ pathOption, + catalogTable = Some(table)) + LogicalRelation(dataSource.resolveRelation(checkFilesExist = false), table) + }) + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta), _, _, _, _) + if DDLUtils.isDatasourceTable(tableMeta) => + i.copy(table = readDataSourceTable(tableMeta)) + + case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta), _, _, _, _) => + i.copy(table = DDLUtils.readHiveTable(tableMeta)) + + case UnresolvedCatalogRelation(tableMeta) if DDLUtils.isDatasourceTable(tableMeta) => + readDataSourceTable(tableMeta) + + case UnresolvedCatalogRelation(tableMeta) => + DDLUtils.readHiveTable(tableMeta) + } +} + + +/** + * A Strategy for planning scans over data sources defined using the sources API. 
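+ *
+ * Column pruning and filter push-down are applied when the relation supports them
+ * (`PrunedScan`, `PrunedFilteredScan`, `CatalystScan`); predicates the source cannot handle
+ * are evaluated in an extra `FilterExec` placed on top of the scan.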
+ */ +case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with CastSupport { + import DataSourceStrategy._ + + def apply(plan: LogicalPlan): Seq[execution.SparkPlan] = plan match { + case ScanOperation(projects, filters, l @ LogicalRelation(t: CatalystScan, _, _, _)) => + pruneFilterProjectRaw( + l, + projects, + filters, + (requestedColumns, allPredicates, _) => + toCatalystRDD(l, requestedColumns, t.buildScan(requestedColumns, allPredicates))) :: Nil + + case ScanOperation(projects, filters, + l @ LogicalRelation(t: PrunedFilteredScan, _, _, _)) => + pruneFilterProject( + l, + projects, + filters, + (a, f) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray, f))) :: Nil + + case ScanOperation(projects, filters, l @ LogicalRelation(t: PrunedScan, _, _, _)) => + pruneFilterProject( + l, + projects, + filters, + (a, _) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray))) :: Nil + + case l @ LogicalRelation(baseRelation: TableScan, _, _, _) => + RowDataSourceScanExec( + l.output, + l.output.indices, + Set.empty, + Set.empty, + toCatalystRDD(l, baseRelation.buildScan()), + baseRelation, + None) :: Nil + + case _ => Nil + } + + // Based on Public API. + private def pruneFilterProject( + relation: LogicalRelation, + projects: Seq[NamedExpression], + filterPredicates: Seq[Expression], + scanBuilder: (Seq[Attribute], Array[Filter]) + => RDD[InternalRow]) = { + pruneFilterProjectRaw( + relation, + projects, + filterPredicates, + (requestedColumns, _, pushedFilters) => { + scanBuilder(requestedColumns, pushedFilters.toArray) + }) + } + + // Based on Catalyst expressions. The `scanBuilder` function accepts three arguments: + // + // 1. A `Seq[Attribute]`, containing all required column attributes. Used to handle relation + // traits that support column pruning (e.g. `PrunedScan` and `PrunedFilteredScan`). + // + // 2. A `Seq[Expression]`, containing all gathered Catalyst filter expressions, only used for + // `CatalystScan`. + // + // 3. A `Seq[Filter]`, containing all data source `Filter`s that are converted from (possibly a + // subset of) Catalyst filter expressions and can be handled by `relation`. Used to handle + // relation traits (`CatalystScan` excluded) that support filter push-down (e.g. + // `PrunedFilteredScan` and `HadoopFsRelation`). + // + // Note that 2 and 3 shouldn't be used together. + private def pruneFilterProjectRaw( + relation: LogicalRelation, + projects: Seq[NamedExpression], + filterPredicates: Seq[Expression], + scanBuilder: (Seq[Attribute], Seq[Expression], + Seq[Filter]) => RDD[InternalRow]): SparkPlan = { + + val projectSet = AttributeSet(projects.flatMap(_.references)) + val filterSet = AttributeSet(filterPredicates.flatMap(_.references)) + + val candidatePredicates = filterPredicates.map { _ transform { + case a: AttributeReference => relation.attributeMap(a) // Match original case of attributes. + }} + + val (unhandledPredicates, pushedFilters, handledFilters) = + selectFilters(relation.relation, candidatePredicates) + + // Combines all Catalyst filter `Expression`s that are either not convertible to data source + // `Filter`s or cannot be handled by `relation`. 
+ val filterCondition = unhandledPredicates.reduceLeftOption(expressions.And) + + if (projects.map(_.toAttribute) == projects && + projectSet.size == projects.size && + filterSet.subsetOf(projectSet)) { + // When it is possible to just use column pruning to get the right projection and + // when the columns of this projection are enough to evaluate all filter conditions, + // just do a scan followed by a filter, with no extra project. + val requestedColumns = projects + // Safe due to if above. + .asInstanceOf[Seq[Attribute]] + // Match original case of attributes. + .map(relation.attributeMap) + + val scan = RowDataSourceScanExec( + relation.output, + requestedColumns.map(relation.output.indexOf), + pushedFilters.toSet, + handledFilters, + scanBuilder(requestedColumns, candidatePredicates, pushedFilters), + relation.relation, + relation.catalogTable.map(_.identifier)) + filterCondition.map{ x => + val selectivity = FilterEstimation(LFilter(x, relation)).calculateFilterSelectivity(x) + execution.FilterExec(x, scan, selectivity) + }.getOrElse(scan) + } else { + // A set of column attributes that are only referenced by pushed down filters. We can + // eliminate them from requested columns. + val handledSet = { + val handledPredicates = filterPredicates.filterNot(unhandledPredicates.contains) + val unhandledSet = AttributeSet(unhandledPredicates.flatMap(_.references)) + AttributeSet(handledPredicates.flatMap(_.references)) -- + (projectSet ++ unhandledSet).map(relation.attributeMap) + } + // Don't request columns that are only referenced by pushed filters. + val requestedColumns = + (projectSet ++ filterSet -- handledSet).map(relation.attributeMap).toSeq + + val scan = RowDataSourceScanExec( + relation.output, + requestedColumns.map(relation.output.indexOf), + pushedFilters.toSet, + handledFilters, + scanBuilder(requestedColumns, candidatePredicates, pushedFilters), + relation.relation, + relation.catalogTable.map(_.identifier)) + execution.ProjectExec( + projects, filterCondition.map{x => + val selectivity = FilterEstimation(LFilter(x, relation)).calculateFilterSelectivity(x) + execution.FilterExec(x, scan, selectivity) + }.getOrElse(scan)) + } + } + + /** + * Convert RDD of Row into RDD of InternalRow with objects in catalyst types + */ + private[this] def toCatalystRDD( + relation: LogicalRelation, + output: Seq[Attribute], + rdd: RDD[Row]): RDD[InternalRow] = { + DataSourceStrategy.toCatalystRDD(relation.relation, output, rdd) + } + + /** + * Convert RDD of Row into RDD of InternalRow with objects in catalyst types + */ + private[this] def toCatalystRDD(relation: LogicalRelation, rdd: RDD[Row]): RDD[InternalRow] = { + toCatalystRDD(relation, relation.output, rdd) + } +} + +object DataSourceStrategy { + /** + * The attribute name may differ from the one in the schema if the query analyzer + * is case insensitive. We should change attribute names to match the ones in the schema, + * so we do not need to worry about case sensitivity anymore. 
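+ *
+ * For example, a predicate referencing `COL1` is rewritten to use `col1` when that is the
+ * spelling used in the relation's schema.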
+ */ + protected[sql] def normalizeExprs( + exprs: Seq[Expression], + attributes: Seq[AttributeReference]): Seq[Expression] = { + exprs.map { e => + e transform { + case a: AttributeReference => + a.withName(attributes.find(_.semanticEquals(a)).getOrElse(a).name) + } + } + } + + private def translateLeafNodeFilter( + predicate: Expression, + pushableColumn: PushableColumnBase): + Option[Filter] = predicate match { + case expressions.EqualTo(pushableColumn(name), Literal(v, t)) => + Some(sources.EqualTo(name, convertToScala(v, t))) + case expressions.EqualTo(Literal(v, t), pushableColumn(name)) => + Some(sources.EqualTo(name, convertToScala(v, t))) + + case expressions.EqualNullSafe(pushableColumn(name), Literal(v, t)) => + Some(sources.EqualNullSafe(name, convertToScala(v, t))) + case expressions.EqualNullSafe(Literal(v, t), pushableColumn(name)) => + Some(sources.EqualNullSafe(name, convertToScala(v, t))) + + case expressions.GreaterThan(pushableColumn(name), Literal(v, t)) => + Some(sources.GreaterThan(name, convertToScala(v, t))) + case expressions.GreaterThan(Literal(v, t), pushableColumn(name)) => + Some(sources.LessThan(name, convertToScala(v, t))) + + case expressions.LessThan(pushableColumn(name), Literal(v, t)) => + Some(sources.LessThan(name, convertToScala(v, t))) + case expressions.LessThan(Literal(v, t), pushableColumn(name)) => + Some(sources.GreaterThan(name, convertToScala(v, t))) + + case expressions.GreaterThanOrEqual(pushableColumn(name), Literal(v, t)) => + Some(sources.GreaterThanOrEqual(name, convertToScala(v, t))) + case expressions.GreaterThanOrEqual(Literal(v, t), pushableColumn(name)) => + Some(sources.LessThanOrEqual(name, convertToScala(v, t))) + + case expressions.LessThanOrEqual(pushableColumn(name), Literal(v, t)) => + Some(sources.LessThanOrEqual(name, convertToScala(v, t))) + case expressions.LessThanOrEqual(Literal(v, t), pushableColumn(name)) => + Some(sources.GreaterThanOrEqual(name, convertToScala(v, t))) + + case expressions.InSet(e @ pushableColumn(name), set) => + val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) + Some(sources.In(name, set.toArray.map(toScala))) + + // Because we only convert In to InSet in Optimizer when there are more than certain + // items. So it is possible we still get an In expression here that needs to be pushed + // down. + case expressions.In(e @ pushableColumn(name), list) if list.forall(_.isInstanceOf[Literal]) => + val hSet = list.map(_.eval(EmptyRow)) + val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) + Some(sources.In(name, hSet.toArray.map(toScala))) + + case expressions.IsNull(pushableColumn(name)) => + Some(sources.IsNull(name)) + case expressions.IsNotNull(pushableColumn(name)) => + Some(sources.IsNotNull(name)) + case expressions.StartsWith(pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(sources.StringStartsWith(name, v.toString)) + + case expressions.EndsWith(pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(sources.StringEndsWith(name, v.toString)) + + case expressions.Contains(pushableColumn(name), Literal(v: UTF8String, StringType)) => + Some(sources.StringContains(name, v.toString)) + + case expressions.Literal(true, BooleanType) => + Some(sources.AlwaysTrue) + + case expressions.Literal(false, BooleanType) => + Some(sources.AlwaysFalse) + + case _ => None + } + + /** + * Tries to translate a Catalyst [[Expression]] into data source [[Filter]]. 
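+ * The translation is delegated to `translateFilterWithMapping` without recording the
+ * mapping from translated filters back to their source expressions.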
+ * + * @return a `Some[Filter]` if the input [[Expression]] is convertible, otherwise a `None`. + */ + protected[sql] def translateFilter( + predicate: Expression, + supportNestedPredicatePushdown: Boolean): + Option[Filter] = { + translateFilterWithMapping(predicate, None, supportNestedPredicatePushdown) + } + + /** + * Tries to translate a Catalyst [[Expression]] into data source [[Filter]]. + * + * @param predicate The input [[Expression]] to be translated as [[Filter]] + * @param translatedFilterToExpr An optional map from leaf node filter expressions to its + * translated [[Filter]]. The map is used for rebuilding + * [[Expression]] from [[Filter]]. + * @param nestedPredicatePushdownEnabled Whether nested predicate pushdown is enabled. + * @return a `Some[Filter]` if the input [[Expression]] is convertible, otherwise a `None`. + */ + protected[sql] def translateFilterWithMapping( + predicate: Expression, + translatedFilterToExpr: + Option[mutable.HashMap[sources.Filter, Expression]], + nestedPredicatePushdownEnabled: Boolean): Option[Filter] = { + predicate match { + case expressions.And(left, right) => + // See SPARK-12218 for detailed discussion + // It is not safe to just convert one side if we do not understand the + // other side. Here is an example used to explain the reason. + // Let's say we have (a = 2 AND trim(b) = 'blah') OR (c > 0) + // and we do not understand how to convert trim(b) = 'blah'. + // If we only convert a = 2, we will end up with + // (a = 2) OR (c > 0), which will generate wrong results. + // Pushing one leg of AND down is only safe to do at the top level. + // You can see ParquetFilters' createFilter for more details. + for { + leftFilter <- translateFilterWithMapping( + left, translatedFilterToExpr, nestedPredicatePushdownEnabled) + rightFilter <- translateFilterWithMapping( + right, translatedFilterToExpr, nestedPredicatePushdownEnabled) + } yield sources.And(leftFilter, rightFilter) + + case expressions.Or(left, right) => + for { + leftFilter <- translateFilterWithMapping( + left, translatedFilterToExpr, nestedPredicatePushdownEnabled) + rightFilter <- translateFilterWithMapping( + right, translatedFilterToExpr, nestedPredicatePushdownEnabled) + } yield sources.Or(leftFilter, rightFilter) + + case expressions.Not(child) => + translateFilterWithMapping(child, translatedFilterToExpr, nestedPredicatePushdownEnabled) + .map(sources.Not) + + case other => + val filter = translateLeafNodeFilter(other, PushableColumn(nestedPredicatePushdownEnabled)) + if (filter.isDefined && translatedFilterToExpr.isDefined) { + translatedFilterToExpr.get(filter.get) = predicate + } + filter + } + } + + protected[sql] def rebuildExpressionFromFilter( + filter: Filter, + translatedFilterToExpr: + mutable.HashMap[sources.Filter, Expression]): + Expression = { + filter match { + case sources.And(left, right) => + expressions.And(rebuildExpressionFromFilter(left, translatedFilterToExpr), + rebuildExpressionFromFilter(right, translatedFilterToExpr)) + case sources.Or(left, right) => + expressions.Or(rebuildExpressionFromFilter(left, translatedFilterToExpr), + rebuildExpressionFromFilter(right, translatedFilterToExpr)) + case sources.Not(pred) => + expressions.Not(rebuildExpressionFromFilter(pred, translatedFilterToExpr)) + case other => + translatedFilterToExpr.getOrElse(other, + throw new AnalysisException( + s"Fail to rebuild expression: missing key $filter in `translatedFilterToExpr`")) + } + } + + /** + * Selects Catalyst predicate [[Expression]]s which are convertible into 
data source [[Filter]]s + * and can be handled by `relation`. + * + * @return A triplet of `Seq[Expression]`, `Seq[Filter]`, and `Seq[Filter]` . The first element + * contains all Catalyst predicate [[Expression]]s that are either not convertible or + * cannot be handled by `relation`. The second element contains all converted data source + * [[Filter]]s that will be pushed down to the data source. The third element contains + * all [[Filter]]s that are completely filtered at the DataSource. + */ + protected[sql] def selectFilters( + relation: BaseRelation, + predicates: Seq[Expression]): (Seq[Expression], + Seq[Filter], Set[Filter]) = { + + // For conciseness, all Catalyst filter expressions of type `expressions.Expression` below are + // called `predicate`s, while all data source filters of type `sources.Filter` are simply called + // `filter`s. + + // A map from original Catalyst expressions to corresponding translated data source filters. + // If a predicate is not in this map, it means it cannot be pushed down. + val supportNestedPredicatePushdown = DataSourceUtils.supportNestedPredicatePushdown(relation) + val translatedMap: Map[Expression, Filter] = predicates.flatMap { p => + translateFilter(p, supportNestedPredicatePushdown).map(f => p -> f) + }.toMap + + val pushedFilters: Seq[Filter] = translatedMap.values.toSeq + + // Catalyst predicate expressions that cannot be converted to data source filters. + val nonconvertiblePredicates = predicates.filterNot(translatedMap.contains) + + // Data source filters that cannot be handled by `relation`. An unhandled filter means + // the data source cannot guarantee the rows returned can pass the filter. + // As a result we must return it so Spark can plan an extra filter operator. + val unhandledFilters = relation.unhandledFilters(translatedMap.values.toArray).toSet + val unhandledPredicates = translatedMap.filter { case (p, f) => + unhandledFilters.contains(f) + }.keys + val handledFilters = pushedFilters.toSet -- unhandledFilters + + (nonconvertiblePredicates ++ unhandledPredicates, pushedFilters, handledFilters) + } + + /** + * Convert RDD of Row into RDD of InternalRow with objects in catalyst types + */ + private[sql] def toCatalystRDD( + relation: BaseRelation, + output: Seq[Attribute], + rdd: RDD[Row]): RDD[InternalRow] = { + if (relation.needConversion) { + val toRow = RowEncoder(StructType.fromAttributes(output)).createSerializer() + rdd.mapPartitions { iterator => + iterator.map(toRow) + } + } else { + rdd.asInstanceOf[RDD[InternalRow]] + } + } +} + +/** + * Find the column name of an expression that can be pushed down. 
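+ *
+ * When nested predicate push-down is enabled, nested struct field accesses are matched as
+ * well and the column is returned as a quoted multi-part name.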
+ */ +abstract class PushableColumnBase { + val nestedPredicatePushdownEnabled: Boolean + + def unapply(e: Expression): Option[String] = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper + def helper(e: Expression): Option[Seq[String]] = e match { + case a: Attribute => + if (nestedPredicatePushdownEnabled || !a.name.contains(".")) { + Some(Seq(a.name)) + } else { + None + } + case s: GetStructField if nestedPredicatePushdownEnabled => + helper(s.child).map(_ :+ s.childSchema(s.ordinal).name) + case _ => None + } + helper(e).map(_.quoted) + } +} + +object PushableColumn { + def apply(nestedPredicatePushdownEnabled: Boolean): PushableColumnBase = { + if (nestedPredicatePushdownEnabled) { + PushableColumnAndNestedColumn + } else { + PushableColumnWithoutNestedColumn + } + } +} + +object PushableColumnAndNestedColumn extends PushableColumnBase { + override val nestedPredicatePushdownEnabled = true +} + +object PushableColumnWithoutNestedColumn extends PushableColumnBase { + override val nestedPredicatePushdownEnabled = false +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala new file mode 100644 index 00000000..7f689202 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.Partition +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.InputPartition + +/** + * A collection of file blocks that should be read as a single task + * (possibly from multiple partitioned directories). + */ +case class FilePartition(index: Int, files: Array[PartitionedFile], var sdi: String = "") + extends Partition with InputPartition { + override def preferredLocations(): Array[String] = { + // Computes total number of bytes can be retrieved from each host. 
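+ // Illustrative note (not from the original source): every non-"localhost" replica location
+ // of a file is credited with that file's full length. For example, a 100-byte file located
+ // on hosts h1 and h2 adds 100 bytes to both totals, and the three hosts with the largest
+ // totals are reported as preferred locations below.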
+ val hostToNumBytes = mutable.HashMap.empty[String, Long] + files.foreach { file => + file.locations.filter(_ != "localhost").foreach { host => + hostToNumBytes(host) = hostToNumBytes.getOrElse(host, 0L) + file.length + } + } + + // Takes the first 3 hosts with the most data to be retrieved + hostToNumBytes.toSeq.sortBy { + case (host, numBytes) => numBytes + }.reverse.take(3).map { + case (host, numBytes) => host + }.toArray + } +} + +object FilePartition extends Logging { + + def getFilePartitions( + sparkSession: SparkSession, + partitionedFiles: Seq[PartitionedFile], + maxSplitBytes: Long): Seq[FilePartition] = { + val partitions = new ArrayBuffer[FilePartition] + val currentFiles = new ArrayBuffer[PartitionedFile] + var currentSize = 0L + + /** Close the current partition and move to the next. */ + def closePartition(): Unit = { + if (currentFiles.nonEmpty) { + // Copy to a new Array. + val newPartition = FilePartition(partitions.size, currentFiles.toArray) + partitions += newPartition + } + currentFiles.clear() + currentSize = 0 + } + + val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes + // Assign files to partitions using "Next Fit Decreasing" + partitionedFiles.foreach { file => + if (currentSize + file.length > maxSplitBytes) { + closePartition() + } + // Add the given file to the current partition. + currentSize += file.length + openCostInBytes + currentFiles += file + } + closePartition() + partitions + } + + def maxSplitBytes( + sparkSession: SparkSession, + selectedPartitions: Seq[PartitionDirectory]): Long = { + val defaultMaxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes + val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes + val defaultParallelism = sparkSession.sparkContext.defaultParallelism + val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum + val bytesPerCore = totalBytes / defaultParallelism + + Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore)) + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDDPushDown.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDDPushDown.scala new file mode 100644 index 00000000..1b39cf2c --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDDPushDown.scala @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources + +import java.util + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.parquet.io.ParquetDecodingException + +import org.apache.spark.{Partition => RDDPartition, SparkUpgradeException, TaskContext} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.rdd.{InputFileBlockHolder, RDD} +import org.apache.spark.sql.{DataIoAdapter, NdpUtils, OmniDataProperties, PageCandidate, PageToColumnar, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.execution.QueryExecutionException +import org.apache.spark.sql.execution.ndp.{NdpConf, PushDownInfo} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + + +/** + * An RDD that scans a list of file partitions. + */ +class FileScanRDDPushDown( + @transient private val sparkSession: SparkSession, + @transient val filePartitions: Seq[FilePartition], + requiredSchema: StructType, + output: Seq[Attribute], + dataSchema: StructType, + pushDownOperators: PushDownInfo, + partitionColumns: Seq[Attribute], + isColumnVector: Boolean, + fileFormat: FileFormat) + extends RDD[InternalRow](sparkSession.sparkContext, Nil) { + + var columnOffset = -1 + var filterOutput : Seq[Attribute] = Seq() + val sdiPort = NdpConf.getNdpSdiPort(sparkSession) + val grpcSslEnabled = NdpConf.getNdpGrpcSslEnabled(sparkSession) + val grpcCertPath = NdpConf.getNdpGrpcClientCertFilePath(sparkSession) + val grpcKeyPath = NdpConf.getNdpGrpcClientPrivateKeyFilePath(sparkSession) + val grpcCaPath = NdpConf.getNdpGrpcTrustCaFilePath(sparkSession) + val grpcPkiDir = NdpConf.getNdpPkiDir(sparkSession) + if (pushDownOperators.filterExecutions != null && pushDownOperators.filterExecutions.size > 0) { + columnOffset = NdpUtils.getColumnOffset(dataSchema, + pushDownOperators.filterExecutions(0).output) + filterOutput = pushDownOperators.filterExecutions(0).output + } else if (pushDownOperators.aggExecutions != null && pushDownOperators.aggExecutions.size > 0) { + columnOffset = NdpUtils.getColumnOffsetByAggExeInfo(dataSchema, + pushDownOperators.aggExecutions) + } else { + columnOffset = NdpUtils.getColumnOffset(dataSchema, output) + filterOutput = output + } + val fpuMap = pushDownOperators.fpuHosts + val filterExecution = pushDownOperators.filterExecutions + val aggExecution = pushDownOperators.aggExecutions + val limitExecution = pushDownOperators.limitExecution + var sqlAggExpressions : Seq[Expression] = Seq() + var sqlGroupExpressions : Seq[Expression] = Seq() + var expressionMaps: scala.collection.mutable.Map[String, Seq[Expression]] = + scala.collection.mutable.Map[String, Seq[Expression]]() + var aggMaps: scala.collection.mutable.Map[String, + scala.collection.mutable.Map[String, Seq[Expression]]] = + scala.collection.mutable.Map[String, scala.collection.mutable.Map[String, Seq[Expression]]]() + var projectId = 0 + val expressions: util.ArrayList[Object] = new util.ArrayList[Object]() + + override def compute(split: RDDPartition, context: TaskContext): Iterator[InternalRow] = { + val pageToColumnarClass = new PageToColumnar(requiredSchema, output) + + val iterator = new Iterator[Object] with AutoCloseable { + private val inputMetrics = context.taskMetrics().inputMetrics + private val existingBytesRead = inputMetrics.bytesRead + private val getBytesReadCallback = + 
SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() + private def incTaskInputMetricsBytesRead(): Unit = { + inputMetrics.setBytesRead(existingBytesRead + getBytesReadCallback()) + } + + private[this] val files = split.asInstanceOf[FilePartition].files.toIterator + private[this] var currentFile: PartitionedFile = null + private[this] var currentIterator: Iterator[Object] = null + private[this] val sdiHosts = split.asInstanceOf[FilePartition].sdi + val dataIoClass = new DataIoAdapter() + + def hasNext: Boolean = { + // Kill the task in case it has been marked as killed. This logic is from + // InterruptibleIterator, but we inline it here instead of wrapping the iterator in order + // to avoid performance overhead. + context.killTaskIfInterrupted() + val hasNext = currentIterator != null && currentIterator.hasNext + if (hasNext) { + hasNext + } else { + val tmp: util.ArrayList[Object] = new util.ArrayList[Object]() + var hasnextIterator = false + try { + hasnextIterator = dataIoClass.hasnextIterator(tmp, pageToColumnarClass, + currentFile, isColumnVector) + } catch { + case e : Exception => + throw e + } + val ret = if (hasnextIterator && tmp.size() > 0) { + currentIterator = tmp.asScala.iterator + hasnextIterator + } else { + nextIterator() + } + ret + } + } + def next(): Object = { + val nextElement = currentIterator.next() + // TODO: we should have a better separation of row based and batch based scan, so that we + // don't need to run this `if` for every record. + if (nextElement.isInstanceOf[ColumnarBatch]) { + incTaskInputMetricsBytesRead() + inputMetrics.incRecordsRead(nextElement.asInstanceOf[ColumnarBatch].numRows()) + } else { + // too costly to update every record + if (inputMetrics.recordsRead % + SparkHadoopUtil.UPDATE_INPUT_METRICS_INTERVAL_RECORDS == 0) { + incTaskInputMetricsBytesRead() + } + inputMetrics.incRecordsRead(1) + } + nextElement + } + + /** Advances to the next file. Returns true if a new non-empty iterator is available. */ + private def nextIterator(): Boolean = { + if (files.hasNext) { + currentFile = files.next() + logInfo(s"Reading File $currentFile") + InputFileBlockHolder.set(currentFile.filePath, currentFile.start, currentFile.length) + val pageCandidate = new PageCandidate(currentFile.filePath, currentFile.start, + currentFile.length, columnOffset, sdiHosts, fileFormat.toString, sdiPort) + val omniDataProperties = new OmniDataProperties(grpcSslEnabled, + grpcCertPath, grpcKeyPath, grpcCaPath, grpcPkiDir) + val dataIoPage = dataIoClass.getPageIterator(pageCandidate, output, + partitionColumns, filterOutput, pushDownOperators, omniDataProperties) + currentIterator = pageToColumnarClass.transPageToColumnar(dataIoPage, + isColumnVector).asScala.iterator + try { + hasNext + } catch { + case e: SchemaColumnConvertNotSupportedException => + val message = "Parquet column cannot be converted in " + + s"file ${currentFile.filePath}. Column: ${e.getColumn}, " + + s"Expected: ${e.getLogicalType}, Found: ${e.getPhysicalType}" + throw new QueryExecutionException(message, e) + case e: ParquetDecodingException => + if (e.getCause.isInstanceOf[SparkUpgradeException]) { + throw e.getCause + } else if (e.getMessage.contains("Can not read value at")) { + val message = "Encounter error while reading parquet files. " + + "One possible cause: Parquet column cannot be converted in the " + + "corresponding files. 
Details: " + throw new QueryExecutionException(message, e) + } + throw e + } + } else { + currentFile = null + InputFileBlockHolder.unset() + false + } + } + + override def close(): Unit = { + incTaskInputMetricsBytesRead() + InputFileBlockHolder.unset() + } + } + + // Register an on-task-completion callback to close the input stream. + context.addTaskCompletionListener[Unit](_ => iterator.close()) + + iterator.asInstanceOf[Iterator[InternalRow]] // This is an erasure hack. + } + + override protected def getPartitions: Array[RDDPartition] = { + filePartitions.map { partitionFile => { + val retHost = mutable.HashMap.empty[String, Long] + partitionFile.files.foreach { partitionMap => { + partitionMap.locations.filter(_ != "localhost").foreach { + sdiKey => { + retHost(sdiKey) = retHost.getOrElse(sdiKey, 0L) + partitionMap.length + sdiKey + }} + }} + val datanode = retHost.toSeq.sortWith((x, y) => x._2 > y._2).toIterator + var mapNum = 0 + while (datanode.hasNext && mapNum < 3) { + val datanodeStr = datanode.next()._1 + if (fpuMap.contains(datanodeStr)) { + val partitioned = fpuMap(datanodeStr) + if (!"".equalsIgnoreCase(partitionFile.sdi)) { + partitionFile.sdi ++= "," + } + partitionFile.sdi ++= partitioned + } + mapNum = mapNum + 1 + } + partitionFile.sdi + }}.toArray + filePartitions.toArray + } + + override protected def getPreferredLocations(split: RDDPartition): Seq[String] = { + split.asInstanceOf[FilePartition].preferredLocations() + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala new file mode 100644 index 00000000..c05df643 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import org.apache.spark.internal.Logging +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.planning.ScanOperation +import org.apache.spark.sql.catalyst.plans.logical.{Filter => LFilter, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.FilterEstimation +import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan} +import org.apache.spark.util.collection.BitSet + +/** + * A strategy for planning scans over collections of files that might be partitioned or bucketed + * by user specified columns. 
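+ * Compared to stock Spark, this copy additionally attaches an estimated filter selectivity
+ * (computed via `FilterEstimation`) to the post-scan `FilterExec`, which the NDP pushdown rule
+ * appears to use when deciding whether a filter is worth offloading.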
+ * + * At a high level planning occurs in several phases: + * - Split filters by when they need to be evaluated. + * - Prune the schema of the data requested based on any projections present. Today this pruning + * is only done on top level columns, but formats should support pruning of nested columns as + * well. + * - Construct a reader function by passing filters and the schema into the FileFormat. + * - Using a partition pruning predicates, enumerate the list of files that should be read. + * - Split the files into tasks and construct a FileScanRDD. + * - Add any projection or filters that must be evaluated after the scan. + * + * Files are assigned into tasks using the following algorithm: + * - If the table is bucketed, group files by bucket id into the correct number of partitions. + * - If the table is not bucketed or bucketing is turned off: + * - If any file is larger than the threshold, split it into pieces based on that threshold + * - Sort the files by decreasing file size. + * - Assign the ordered files to buckets using the following algorithm. If the current partition + * is under the threshold with the addition of the next file, add it. If not, open a new bucket + * and add it. Proceed to the next file. + */ +object FileSourceStrategy extends Strategy with Logging { + + // should prune buckets iff num buckets is greater than 1 and there is only one bucket column + private def shouldPruneBuckets(bucketSpec: Option[BucketSpec]): Boolean = { + bucketSpec match { + case Some(spec) => spec.bucketColumnNames.length == 1 && spec.numBuckets > 1 + case None => false + } + } + + private def getExpressionBuckets( + expr: Expression, + bucketColumnName: String, + numBuckets: Int): BitSet = { + + def getBucketNumber(attr: Attribute, v: Any): Int = { + BucketingUtils.getBucketIdFromValue(attr, numBuckets, v) + } + + def getBucketSetFromIterable(attr: Attribute, iter: Iterable[Any]): BitSet = { + val matchedBuckets = new BitSet(numBuckets) + iter + .map(v => getBucketNumber(attr, v)) + .foreach(bucketNum => matchedBuckets.set(bucketNum)) + matchedBuckets + } + + def getBucketSetFromValue(attr: Attribute, v: Any): BitSet = { + val matchedBuckets = new BitSet(numBuckets) + matchedBuckets.set(getBucketNumber(attr, v)) + matchedBuckets + } + + expr match { + case expressions.Equality(a: Attribute, Literal(v, _)) if a.name == bucketColumnName => + getBucketSetFromValue(a, v) + case expressions.In(a: Attribute, list) + if list.forall(_.isInstanceOf[Literal]) && a.name == bucketColumnName => + getBucketSetFromIterable(a, list.map(e => e.eval(EmptyRow))) + case expressions.InSet(a: Attribute, hset) + if hset.forall(_.isInstanceOf[Literal]) && a.name == bucketColumnName => + getBucketSetFromIterable(a, hset.map(e => expressions.Literal(e).eval(EmptyRow))) + case expressions.IsNull(a: Attribute) if a.name == bucketColumnName => + getBucketSetFromValue(a, null) + case expressions.And(left, right) => + getExpressionBuckets(left, bucketColumnName, numBuckets) & + getExpressionBuckets(right, bucketColumnName, numBuckets) + case expressions.Or(left, right) => + getExpressionBuckets(left, bucketColumnName, numBuckets) | + getExpressionBuckets(right, bucketColumnName, numBuckets) + case _ => + val matchedBuckets = new BitSet(numBuckets) + matchedBuckets.setUntil(numBuckets) + matchedBuckets + } + } + + private def genBucketSet( + normalizedFilters: Seq[Expression], + bucketSpec: BucketSpec): Option[BitSet] = { + if (normalizedFilters.isEmpty) { + return None + } + + val bucketColumnName = 
bucketSpec.bucketColumnNames.head + val numBuckets = bucketSpec.numBuckets + + val normalizedFiltersAndExpr = normalizedFilters + .reduce(expressions.And) + val matchedBuckets = getExpressionBuckets(normalizedFiltersAndExpr, bucketColumnName, + numBuckets) + + val numBucketsSelected = matchedBuckets.cardinality() + + logInfo { + s"Pruned ${numBuckets - numBucketsSelected} out of $numBuckets buckets." + } + + // None means all the buckets need to be scanned + if (numBucketsSelected == numBuckets) { + None + } else { + Some(matchedBuckets) + } + } + + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ScanOperation(projects, filters, + l @ LogicalRelation(fsRelation: HadoopFsRelation, _, table, _)) => + // Filters on this relation fall into four categories based on where we can use them to avoid + // reading unneeded data: + // - partition keys only - used to prune directories to read + // - bucket keys only - optionally used to prune files to read + // - keys stored in the data only - optionally used to skip groups of data in files + // - filters that need to be evaluated again after the scan + val filterSet = ExpressionSet(filters) + + val normalizedFilters = DataSourceStrategy.normalizeExprs( + filters.filter(_.deterministic), l.output) + + val partitionColumns = + l.resolve( + fsRelation.partitionSchema, fsRelation.sparkSession.sessionState.analyzer.resolver) + val partitionSet = AttributeSet(partitionColumns) + val partitionKeyFilters = + ExpressionSet(normalizedFilters + .filter(_.references.subsetOf(partitionSet))) + + logInfo(s"Pruning directories with: ${partitionKeyFilters.mkString(",")}") + + // subquery expressions are filtered out because they can't be used to prune buckets or pushed + // down as data filters, yet they would be executed + val normalizedFiltersWithoutSubqueries = + normalizedFilters.filterNot(SubqueryExpression.hasSubquery) + + val bucketSpec: Option[BucketSpec] = fsRelation.bucketSpec + val bucketSet = if (shouldPruneBuckets(bucketSpec)) { + genBucketSet(normalizedFiltersWithoutSubqueries, bucketSpec.get) + } else { + None + } + + val dataColumns = + l.resolve(fsRelation.dataSchema, fsRelation.sparkSession.sessionState.analyzer.resolver) + + // Partition keys are not available in the statistics of the files. + val dataFilters = + normalizedFiltersWithoutSubqueries.filter(_.references.intersect(partitionSet).isEmpty) + val supportNestedPredicatePushdown = + DataSourceUtils.supportNestedPredicatePushdown(fsRelation) + val pushedFilters = dataFilters + .flatMap(DataSourceStrategy.translateFilter(_, supportNestedPredicatePushdown)) + logInfo(s"Pushed Filters: ${pushedFilters.mkString(",")}") + + // Predicates with both partition keys and attributes need to be evaluated after the scan. 
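+ // Illustrative example (column names are hypothetical): with partition column `p` and data
+ // column `c`, a single predicate such as `c > p` references both attribute sets, so it is not
+ // kept as a pure partition-key filter and therefore remains in `afterScanFilters` below.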
+ val afterScanFilters = filterSet -- partitionKeyFilters.filter(_.references.nonEmpty) + logInfo(s"Post-Scan Filters: ${afterScanFilters.mkString(",")}") + + val filterAttributes = AttributeSet(afterScanFilters) + val requiredExpressions: Seq[NamedExpression] = filterAttributes.toSeq ++ projects + val requiredAttributes = AttributeSet(requiredExpressions) + + val readDataColumns = + dataColumns + .filter(requiredAttributes.contains) + .filterNot(partitionColumns.contains) + val outputSchema = readDataColumns.toStructType + logInfo(s"Output Data Schema: ${outputSchema.simpleString(5)}") + + val outputAttributes = readDataColumns ++ partitionColumns + + val scan = + FileSourceScanExec( + fsRelation, + outputAttributes, + outputSchema, + partitionKeyFilters.toSeq, + bucketSet, + dataFilters, + table.map(_.identifier), + partitionColumns) + + val afterScanFilter = afterScanFilters.toSeq.reduceOption(expressions.And) + val selectivity = if (afterScanFilter.nonEmpty) { + FilterEstimation(LFilter(afterScanFilter.get, l)) + .calculateFilterSelectivity(afterScanFilter.get) + } else { + None + } + val withFilter = afterScanFilter.map(execution.FilterExec(_, scan, selectivity)) + .getOrElse(scan) + val withProjections = if (projects == withFilter.output) { + withFilter + } else { + execution.ProjectExec(projects, withFilter) + } + + withProjections :: Nil + + case _ => Nil + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala new file mode 100644 index 00000000..5ffab94d --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -0,0 +1,310 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.v2 + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{AnalysisException, SparkSession, Strategy} +import org.apache.spark.sql.catalyst.analysis.{ResolvedNamespace, ResolvedTable} +import org.apache.spark.sql.catalyst.expressions.{And, Expression, NamedExpression, PredicateHelper, SubqueryExpression} +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.plans.logical.{Filter => LFilter} +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.FilterEstimation +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, StagingTableCatalog, SupportsNamespaces, TableCapability, TableCatalog, TableChange} +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream} +import org.apache.spark.sql.execution.{FilterExec, LeafExecNode, ProjectExec, RowDataSourceScanExec, SparkPlan} +import org.apache.spark.sql.execution.datasources.DataSourceStrategy +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} +import org.apache.spark.sql.sources.{BaseRelation, TableScan} +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +class DataSourceV2Strategy(session: SparkSession) extends Strategy with PredicateHelper { + + import DataSourceV2Implicits._ + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + private def withProjectAndFilter( + project: Seq[NamedExpression], + filters: Seq[Expression], + scan: LeafExecNode, + needsUnsafeConversion: Boolean, + selectivity: Option[Double]): SparkPlan = { + val filterCondition = filters.reduceLeftOption(And) + val withFilter = filterCondition.map(FilterExec(_, scan, selectivity)).getOrElse(scan) + + if (withFilter.output != project || needsUnsafeConversion) { + ProjectExec(project, withFilter) + } else { + withFilter + } + } + + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case PhysicalOperation(project, filters, + relation @ DataSourceV2ScanRelation(_, V1ScanWrapper(scan, translated, pushed), output)) => + val v1Relation = scan.toV1TableScan[BaseRelation with TableScan](session.sqlContext) + if (v1Relation.schema != scan.readSchema()) { + throw new IllegalArgumentException( + "The fallback v1 relation reports inconsistent schema:\n" + + "Schema of v2 scan: " + scan.readSchema() + "\n" + + "Schema of v1 relation: " + v1Relation.schema) + } + val rdd = v1Relation.buildScan() + val unsafeRowRDD = DataSourceStrategy.toCatalystRDD(v1Relation, output, rdd) + val originalOutputNames = relation.table.schema().map(_.name) + val requiredColumnsIndex = output.map(_.name).map(originalOutputNames.indexOf) + val dsScan = RowDataSourceScanExec( + output, + requiredColumnsIndex, + translated.toSet, + pushed.toSet, + unsafeRowRDD, + v1Relation, + tableIdentifier = None) + val condition = filters.reduceLeftOption(And) + val selectivity = if (condition.nonEmpty) { + FilterEstimation(LFilter(condition.get, relation)).calculateFilterSelectivity(condition.get) + } else { + None + } + withProjectAndFilter(project, filters, dsScan, + needsUnsafeConversion = false, selectivity) :: Nil + + case PhysicalOperation(project, filters, relation: DataSourceV2ScanRelation) => + // projection and filters were already pushed down in the optimizer. 
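+ // (in stock Spark 3.0 that pushdown is performed by the optimizer's V2ScanRelationPushDown
+ // rule; the rule name is mentioned here for orientation only)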
+ // this uses PhysicalOperation to get the projection and ensure that if the batch scan does + // not support columnar, a projection is added to convert the rows to UnsafeRow. + val batchExec = BatchScanExec(relation.output, relation.scan) + val condition = filters.reduceLeftOption(And) + val selectivity = if (condition.nonEmpty) { + FilterEstimation(LFilter(condition.get, relation)).calculateFilterSelectivity(condition.get) + } else { + None + } + withProjectAndFilter(project, filters, batchExec, + !batchExec.supportsColumnar, selectivity) :: Nil + + case r: StreamingDataSourceV2Relation if r.startOffset.isDefined && r.endOffset.isDefined => + val microBatchStream = r.stream.asInstanceOf[MicroBatchStream] + val scanExec = MicroBatchScanExec( + r.output, r.scan, microBatchStream, r.startOffset.get, r.endOffset.get) + + val withProjection = if (scanExec.supportsColumnar) { + scanExec + } else { + // Add a Project here to make sure we produce unsafe rows. + ProjectExec(r.output, scanExec) + } + + withProjection :: Nil + + case r: StreamingDataSourceV2Relation if r.startOffset.isDefined && r.endOffset.isEmpty => + val continuousStream = r.stream.asInstanceOf[ContinuousStream] + val scanExec = ContinuousScanExec(r.output, r.scan, continuousStream, r.startOffset.get) + + val withProjection = if (scanExec.supportsColumnar) { + scanExec + } else { + // Add a Project here to make sure we produce unsafe rows. + ProjectExec(r.output, scanExec) + } + + withProjection :: Nil + + case WriteToDataSourceV2(writer, query) => + WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil + + case CreateV2Table(catalog, ident, schema, parts, props, ifNotExists) => + val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) + CreateTableExec(catalog, ident, schema, parts, propsWithOwner, ifNotExists) :: Nil + + case CreateTableAsSelect(catalog, ident, parts, query, props, options, ifNotExists) => + val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) + val writeOptions = new CaseInsensitiveStringMap(options.asJava) + catalog match { + case staging: StagingTableCatalog => + AtomicCreateTableAsSelectExec(staging, ident, parts, query, planLater(query), + propsWithOwner, writeOptions, ifNotExists) :: Nil + case _ => + CreateTableAsSelectExec(catalog, ident, parts, query, planLater(query), + propsWithOwner, writeOptions, ifNotExists) :: Nil + } + + case RefreshTable(catalog, ident) => + RefreshTableExec(catalog, ident) :: Nil + + case ReplaceTable(catalog, ident, schema, parts, props, orCreate) => + val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) + catalog match { + case staging: StagingTableCatalog => + AtomicReplaceTableExec( + staging, ident, schema, parts, propsWithOwner, orCreate = orCreate) :: Nil + case _ => + ReplaceTableExec( + catalog, ident, schema, parts, propsWithOwner, orCreate = orCreate) :: Nil + } + + case ReplaceTableAsSelect(catalog, ident, parts, query, props, options, orCreate) => + val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) + val writeOptions = new CaseInsensitiveStringMap(options.asJava) + catalog match { + case staging: StagingTableCatalog => + AtomicReplaceTableAsSelectExec( + staging, + ident, + parts, + query, + planLater(query), + propsWithOwner, + writeOptions, + orCreate = orCreate) :: Nil + case _ => + ReplaceTableAsSelectExec( + catalog, + ident, + parts, + query, + planLater(query), + propsWithOwner, + writeOptions, + orCreate = orCreate) :: Nil + } + + case AppendData(r: DataSourceV2Relation, query, writeOptions, _) => + 
r.table.asWritable match { + case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) => + AppendDataExecV1(v1, writeOptions.asOptions, query) :: Nil + case v2 => + AppendDataExec(v2, writeOptions.asOptions, planLater(query)) :: Nil + } + + case OverwriteByExpression(r: DataSourceV2Relation, deleteExpr, query, writeOptions, _) => + // fail if any filter cannot be converted. correctness depends on removing all matching data. + val filters = splitConjunctivePredicates(deleteExpr).map { + filter => DataSourceStrategy.translateFilter(deleteExpr, + supportNestedPredicatePushdown = true).getOrElse( + throw new AnalysisException(s"Cannot translate expression to source filter: $filter")) + }.toArray + r.table.asWritable match { + case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) => + OverwriteByExpressionExecV1(v1, filters, writeOptions.asOptions, query) :: Nil + case v2 => + OverwriteByExpressionExec(v2, filters, writeOptions.asOptions, planLater(query)) :: Nil + } + + case OverwritePartitionsDynamic(r: DataSourceV2Relation, query, writeOptions, _) => + OverwritePartitionsDynamicExec( + r.table.asWritable, writeOptions.asOptions, planLater(query)) :: Nil + + case DeleteFromTable(relation, condition) => + relation match { + case DataSourceV2ScanRelation(table, _, output) => + if (condition.exists(SubqueryExpression.hasSubquery)) { + throw new AnalysisException( + s"Delete by condition with subquery is not supported: $condition") + } + // fail if any filter cannot be converted. + // correctness depends on removing all matching data. + val filters = DataSourceStrategy.normalizeExprs(condition.toSeq, output) + .flatMap(splitConjunctivePredicates(_).map { + f => DataSourceStrategy.translateFilter(f, true).getOrElse( + throw new AnalysisException(s"Exec update failed:" + + s" cannot translate expression to source filter: $f")) + }).toArray + DeleteFromTableExec(table.asDeletable, filters) :: Nil + case _ => + throw new AnalysisException("DELETE is only supported with v2 tables.") + } + + case WriteToContinuousDataSource(writer, query) => + WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil + + case Repartition(1, false, child) => + val isContinuous = child.find { + case r: StreamingDataSourceV2Relation => r.stream.isInstanceOf[ContinuousStream] + case _ => false + }.isDefined + + if (isContinuous) { + ContinuousCoalesceExec(1, planLater(child)) :: Nil + } else { + Nil + } + + case desc @ DescribeNamespace(ResolvedNamespace(catalog, ns), extended) => + DescribeNamespaceExec(desc.output, catalog.asNamespaceCatalog, ns, extended) :: Nil + + case desc @ DescribeRelation(r: ResolvedTable, partitionSpec, isExtended) => + if (partitionSpec.nonEmpty) { + throw new AnalysisException("DESCRIBE does not support partition for v2 tables.") + } + DescribeTableExec(desc.output, r.table, isExtended) :: Nil + + case DropTable(catalog, ident, ifExists) => + DropTableExec(catalog, ident, ifExists) :: Nil + + case AlterTable(catalog, ident, _, changes) => + AlterTableExec(catalog, ident, changes) :: Nil + + case RenameTable(catalog, oldIdent, newIdent) => + RenameTableExec(catalog, oldIdent, newIdent) :: Nil + + case AlterNamespaceSetProperties(ResolvedNamespace(catalog, ns), properties) => + AlterNamespaceSetPropertiesExec(catalog.asNamespaceCatalog, ns, properties) :: Nil + + case AlterNamespaceSetLocation(ResolvedNamespace(catalog, ns), location) => + AlterNamespaceSetPropertiesExec( + catalog.asNamespaceCatalog, + ns, + Map(SupportsNamespaces.PROP_LOCATION -> location)) :: Nil + + case 
CommentOnNamespace(ResolvedNamespace(catalog, ns), comment) => + AlterNamespaceSetPropertiesExec( + catalog.asNamespaceCatalog, + ns, + Map(SupportsNamespaces.PROP_COMMENT -> comment)) :: Nil + + case CommentOnTable(ResolvedTable(catalog, identifier, _), comment) => + val changes = TableChange.setProperty(TableCatalog.PROP_COMMENT, comment) + AlterTableExec(catalog, identifier, Seq(changes)) :: Nil + + case CreateNamespace(catalog, namespace, ifNotExists, properties) => + CreateNamespaceExec(catalog, namespace, ifNotExists, properties) :: Nil + + case DropNamespace(ResolvedNamespace(catalog, ns), ifExists, cascade) => + DropNamespaceExec(catalog, ns, ifExists, cascade) :: Nil + + case r @ ShowNamespaces(ResolvedNamespace(catalog, ns), pattern) => + ShowNamespacesExec(r.output, catalog.asNamespaceCatalog, ns, pattern) :: Nil + + case r @ ShowTables(ResolvedNamespace(catalog, ns), pattern) => + ShowTablesExec(r.output, catalog.asTableCatalog, ns, pattern) :: Nil + + case SetCatalogAndNamespace(catalogManager, catalogName, ns) => + SetCatalogAndNamespaceExec(catalogManager, catalogName, ns) :: Nil + + case r: ShowCurrentNamespace => + ShowCurrentNamespaceExec(r.output, r.catalogManager) :: Nil + + case r @ ShowTableProperties(rt: ResolvedTable, propertyKey) => + ShowTablePropertiesExec(r.output, rt.table, propertyKey) :: Nil + + case _ => Nil + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpPushDown.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpPushDown.scala new file mode 100644 index 00000000..dcf47471 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpPushDown.scala @@ -0,0 +1,436 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.ndp + +import java.util.Locale + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{PushDownManager, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, NamedExpression, PredicateHelper, UserDefinedExpression} +import org.apache.spark.sql.catalyst.expressions.aggregate.{Partial, PartialMerge} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, GlobalLimitExec, LeafExecNode, LocalLimitExec, ProjectExec, SparkPlan} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, InsertAdaptiveSparkPlan} +import org.apache.spark.sql.execution.aggregate.{BaseAggregateExec, HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} +import org.apache.spark.sql.execution.datasources.HadoopFsRelation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.DataSourceRegister + +case class NdpPushDown(sparkSession: SparkSession) + extends Rule[SparkPlan] with PredicateHelper { + private val pushDownEnabled = NdpConf.getNdpEnabled(sparkSession) + private var fpuHosts: scala.collection.Map[String, String] = _ + + // filter performance blackList: like, startswith, endswith, contains + private val filterWhiteList = Set("or", "and", "not", "equalto", "isnotnull", "lessthan", + "greaterthan", "greaterthanorequal", "lessthanorequal", "in", "literal", "isnull", + "attributereference") + private val attrWhiteList = Set("long", "integer", "byte", "short", "float", "double", + "boolean", "date") + private val sparkUdfWhiteList = Set("substr", "substring", "length", "upper", "lower", "cast", + "replace", "getarrayitem") + private val udfWhitelistConf = NdpConf.getNdpUdfWhitelist(sparkSession) + private val customUdfWhiteList = if (udfWhitelistConf.nonEmpty) { + udfWhitelistConf.map(_.split(",")).get.toSet + } else { + Set.empty + } + private val udfWhiteList = sparkUdfWhiteList ++ customUdfWhiteList + private val aggFuncWhiteList = Set("max", "min", "count", "avg", "sum") + private val aggExpressionWhiteList = + Set("multiply", "add", "subtract", "divide", "remainder", "literal", "attributereference") + private val selectivityThreshold = NdpConf.getNdpFilterSelectivity(sparkSession) + private val tableSizeThreshold = NdpConf.getNdpTableSizeThreshold(sparkSession) + private val filterSelectivityEnable = NdpConf.getNdpFilterSelectivityEnable(sparkSession) + private val tableFileFormatWhiteList = Set("orc", "parquet") + private val parquetSchemaMergingEnabled = NdpConf.getParquetMergeSchema(sparkSession) + private val timeOut = NdpConf.getNdpZookeeperTimeout(sparkSession) + private val parentPath = NdpConf.getNdpZookeeperPath(sparkSession) + private val zkAddress = NdpConf.getNdpZookeeperAddress(sparkSession) + private val aliveOmniDataServerNum = NdpConf.getNdpAliveOmnidata(sparkSession) + + override def apply(plan: SparkPlan): SparkPlan = { + if (pushDownEnabled && shouldPushDown(plan) && shouldPushDown()) { + pushDownOperator(plan) + } else { + plan + } + } + + def shouldPushDown(plan: SparkPlan): Boolean = { + var isPush = false + val p = plan.transformUp { + case a: AdaptiveSparkPlanExec => + if (shouldPushDown(a.initialPlan)) { + isPush = true + } + plan + case s: FileSourceScanExec => + if (s.metadata.get("Location").toString.contains("[hdfs")) { + isPush = true + } + plan + } + if (parquetSchemaMergingEnabled) { + isPush = false + } 
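+ // Note: pushdown is disabled outright when spark.sql.parquet.mergeSchema is true (see
+ // NdpConf.getParquetMergeSchema below). A minimal, illustrative way to keep this rule active
+ // (the values are examples, not defaults taken from this patch):
+ //   --conf spark.sql.ndp.enabled=true
+ //   --conf spark.sql.parquet.mergeSchema=false
+ //   --conf spark.sql.ndp.zookeeper.address=host1:2181,host2:2181
+ // In addition, shouldPushDown() appears to require ZooKeeper to report at least one alive
+ // OmniData server before any operator is offloaded.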
+ isPush + } + + def shouldPushDown(): Boolean = { + val pushDownManagerClass = new PushDownManager() + fpuHosts = pushDownManagerClass.getZookeeperData(timeOut, parentPath, + zkAddress, aliveOmniDataServerNum) + fpuHosts.nonEmpty + } + + def shouldPushDown(relation: HadoopFsRelation): Boolean = { + val isSupportFormat = relation.fileFormat match { + case source: DataSourceRegister => + tableFileFormatWhiteList.contains(source.shortName().toLowerCase(Locale.ROOT)) + case _ => false + } + isSupportFormat && relation.sizeInBytes > tableSizeThreshold.toLong + } + + def shouldPushDown(f: FilterExec, scan: NdpSupport): Boolean = { + scan.filterExeInfos.isEmpty && + f.subqueries.isEmpty && + f.output.forall(x => attrWhiteList.contains(x.dataType.typeName) + || supportedHiveStringType(x)) + } + + private def supportedHiveStringType(attr: Attribute): Boolean = { + if (attr.dataType.typeName.equals("string")) { + !attr.metadata.contains("HIVE_TYPE_STRING") || + attr.metadata.getString("HIVE_TYPE_STRING").startsWith("varchar") + } else { + false + } + } + + def shouldPushDown(projectList: Seq[NamedExpression], s: NdpScanWrapper): Boolean = { + s.scan.isPushDown && projectList.forall(_.isInstanceOf[AttributeReference]) + } + + def shouldPushDown(agg: BaseAggregateExec, scan: NdpSupport): Boolean = { + scan.aggExeInfos.isEmpty && + agg.output.forall(x => attrWhiteList.contains(x.dataType.typeName)) && + agg.aggregateExpressions.forall{ e => + aggFuncWhiteList.contains(e.aggregateFunction.prettyName) && + (e.mode.equals(PartialMerge) || e.mode.equals(Partial)) && + !e.isDistinct && + e.aggregateFunction.children.forall { g => + aggExpressionWhiteList.contains(g.prettyName) + } + } + } + + def shouldPushDown(scan: NdpSupport): Boolean = { + scan.limitExeInfo.isEmpty + } + + def filterSelectivityEnabled: Boolean = { + filterSelectivityEnable && + sparkSession.conf.get(SQLConf.CBO_ENABLED) && + sparkSession.conf.get(SQLConf.PLAN_STATS_ENABLED) + } + + def replaceWrapper(plan: SparkPlan): SparkPlan = { + plan.transform { + case s: NdpScanWrapper => + if (s.scan.isPushDown) { + s.scan match { + case f: FileSourceScanExec => + val scan = f.copy(output = s.scanOutput) + scan.pushDown(s.scan) + scan.fpuHosts(fpuHosts) + logInfo(s"Push down with [${scan.ndpOperators}]") + scan + case _ => throw new UnsupportedOperationException() + } + } else { + s.scan + } + } + } + + def pushDownOperator(plan: SparkPlan): SparkPlan = { + val p = plan.transformUp { + case a: AdaptiveSparkPlanExec => + InsertAdaptiveSparkPlan(a.context)(pushDownOperator(a.initialPlan)) + case s: FileSourceScanExec if shouldPushDown(s.relation) => + val filters = s.partitionFilters.filter { x => + filterWhiteList.contains(x.prettyName) || udfWhiteList.contains(x.prettyName) + } + NdpScanWrapper(s, s.output, filters) + case f @ FilterExec(condition, s: NdpScanWrapper, selectivity) if shouldPushDown(f, s.scan) => + if (filterSelectivityEnabled && + selectivity.nonEmpty && + selectivity.get > selectivityThreshold.toDouble) { + logInfo(s"Fail to push down filter, since " + + s"selectivity[${selectivity.get}] > threshold[${selectivityThreshold}] " + + s"for condition[${condition}]") + f + } else { + // TODO: move selectivity info to pushdown-info + if (filterSelectivityEnabled && selectivity.nonEmpty) { + logInfo(s"Selectivity: ${selectivity.get}") + } + // partial pushdown + val (otherFilters, pushDownFilters) = + (splitConjunctivePredicates(condition) ++ s.partitionFilters).partition { x => + x.find { y => + !filterWhiteList.contains(y.prettyName) 
&& + !udfWhiteList.contains(y.prettyName) + }.isDefined + } + if (pushDownFilters.nonEmpty) { + s.scan.pushDownFilter(FilterExeInfo(pushDownFilters.reduce(And), f.output)) + } + s.update(f.output) + if (otherFilters.nonEmpty) { + FilterExec(otherFilters.reduce(And), s) + } else { + s + } + } + case p @ ProjectExec(projectList, s: NdpScanWrapper) if shouldPushDown(projectList, s) => + s.update(p.output) + case agg @ HashAggregateExec(_, _, _, _, _, _, s: NdpScanWrapper) + if shouldPushDown(agg, s.scan) => + val execution = NdpSupport.toAggExecution(agg) + s.scan.pushDownAgg(execution) + s.update(agg.output) + case agg @ HashAggregateExec(_, _, _, _, _, _, ProjectExec(projectList, s: NdpScanWrapper)) + if shouldPushDown(agg, s.scan) && shouldPushDown(projectList, s) => + val execution = NdpSupport.toAggExecution(agg) + s.scan.pushDownAgg(execution) + s.update(agg.output) + case agg @ SortAggregateExec(_, _, _, _, _, _, s: NdpScanWrapper) + if shouldPushDown(agg, s.scan) => + val execution = NdpSupport.toAggExecution(agg) + s.scan.pushDownAgg(execution) + s.update(agg.output) + case agg @ SortAggregateExec(_, _, _, _, _, _, ProjectExec(projectList, s: NdpScanWrapper)) + if shouldPushDown(agg, s.scan) && shouldPushDown(projectList, s) => + val execution = NdpSupport.toAggExecution(agg) + s.scan.pushDownAgg(execution) + s.update(agg.output) + case agg @ ObjectHashAggregateExec(_, _, _, _, _, _, s: NdpScanWrapper) + if shouldPushDown(agg, s.scan) => + val execution = NdpSupport.toAggExecution(agg) + s.scan.pushDownAgg(execution) + s.update(agg.output) + case agg @ ObjectHashAggregateExec(_, _, _, _, _, _, ProjectExec(pl, s: NdpScanWrapper)) + if shouldPushDown(agg, s.scan) && shouldPushDown(pl, s) => + val execution = NdpSupport.toAggExecution(agg) + s.scan.pushDownAgg(execution) + s.update(agg.output) + case l @ GlobalLimitExec(limit, s: NdpScanWrapper) if shouldPushDown(s.scan) => + s.scan.pushDownLimit(LimitExeInfo(limit)) + s.update(l.output) + case l @ LocalLimitExec(limit, s: NdpScanWrapper) if shouldPushDown(s.scan) => + s.scan.pushDownLimit(LimitExeInfo(limit)) + s.update(l.output) + } + replaceWrapper(p) + } + +} + +case class NdpScanWrapper( + scan: NdpSupport, + var scanOutput: Seq[Attribute], + partitionFilters: Seq[Expression]) extends LeafExecNode { + + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + + override def output: Seq[Attribute] = scan.output + + def update(scanOutput: Seq[Attribute]): NdpScanWrapper = { + this.scanOutput = scanOutput + this + } +} + +object NdpConf { + val NDP_ENABLED = "spark.sql.ndp.enabled" + val PARQUET_MERGESCHEMA = "spark.sql.parquet.mergeSchema" + val NDP_FILTER_SELECTIVITY_ENABLE = "spark.sql.ndp.filter.selectivity.enable" + val NDP_TABLE_SIZE_THRESHOLD = "spark.sql.ndp.table.size.threshold" + val NDP_ZOOKEEPER_TIMEOUT = "spark.sql.ndp.zookeeper.timeout" + val NDP_ALIVE_OMNIDATA = "spark.sql.ndp.alive.omnidata" + val NDP_FILTER_SELECTIVITY = "spark.sql.ndp.filter.selectivity" + val NDP_UDF_WHITELIST = "spark.sql.ndp.udf.whitelist" + val NDP_ZOOKEEPER_PATH = "spark.sql.ndp.zookeeper.path" + val NDP_ZOOKEEPER_ADDRESS = "spark.sql.ndp.zookeeper.address" + val NDP_SDI_PORT = "spark.sql.ndp.sdi.port" + val NDP_GRPC_SSL_ENABLED = "spark.sql.ndp.grpc.ssl.enabled" + val NDP_GRPC_CLIENT_CERT_FILE_PATH = "spark.sql.ndp.grpc.client.cert.file.path" + val NDP_GRPC_CLIENT_PRIVATE_KEY_FILE_PATH = "spark.sql.ndp.grpc.client.private.key.file.path" + val NDP_GRPC_TRUST_CA_FILE_PATH = 
"spark.sql.ndp.grpc.trust.ca.file.path" + val NDP_PKI_DIR = "spark.sql.ndp.pki.dir" + + def toBoolean(key: String, value: String, sparkSession: SparkSession): Boolean = { + try { + value.trim.toBoolean + } catch { + case _: IllegalArgumentException => + sparkSession.conf.unset(key) + throw new IllegalArgumentException(s"NdpPushDown: $key should be boolean, but was $value") + } + } + + def toBoolean(key: String, value: String, conf: SQLConf): Boolean = { + try { + value.trim.toBoolean + } catch { + case _: IllegalArgumentException => + conf.unsetConf(key) + throw new IllegalArgumentException(s"NdpPushDown: $key should be boolean, but was $value") + } + } + + def toNumber[T](key: String, value: String, converter: String => T, + configType: String, sparkSession: SparkSession): T = { + try { + converter(value.trim) + } catch { + case _: NumberFormatException => + sparkSession.conf.unset(key) + throw new IllegalArgumentException( + s"NdpPushDown: $key should be $configType, but was $value") + } + } + + def checkLongValue(key: String, value: Long, validator: Long => Boolean, + errorMsg: String, sparkSession: SparkSession) { + if (!validator(value)) { + sparkSession.conf.unset(key) + throw new IllegalArgumentException(errorMsg) + } + } + + def checkDoubleValue(key: String, value: Double, validator: Double => Boolean, + errorMsg: String, sparkSession: SparkSession) { + if (!validator(value)) { + sparkSession.conf.unset(key) + throw new IllegalArgumentException(errorMsg) + } + } + + def getNdpEnabled(sparkSession: SparkSession): Boolean = { + toBoolean(NDP_ENABLED, + sparkSession.conf.getOption(NDP_ENABLED).getOrElse("true"), sparkSession) + } + + def getParquetMergeSchema(sparkSession: SparkSession): Boolean = { + toBoolean(PARQUET_MERGESCHEMA, + sparkSession.conf.getOption(PARQUET_MERGESCHEMA).getOrElse("false"), sparkSession) + } + + def getNdpFilterSelectivityEnable(sparkSession: SparkSession): Boolean = { + toBoolean(NDP_FILTER_SELECTIVITY_ENABLE, + sparkSession.conf.getOption(NDP_FILTER_SELECTIVITY_ENABLE).getOrElse("true"), sparkSession) + } + + def getNdpTableSizeThreshold(sparkSession: SparkSession): Long = { + val result = toNumber(NDP_TABLE_SIZE_THRESHOLD, + sparkSession.conf.getOption(NDP_TABLE_SIZE_THRESHOLD).getOrElse("10240"), + _.toLong, "long", sparkSession) + checkLongValue(NDP_TABLE_SIZE_THRESHOLD, result, _ > 0, + s"The $NDP_TABLE_SIZE_THRESHOLD value must be positive", sparkSession) + result + } + + def getNdpZookeeperTimeout(sparkSession: SparkSession): Int = { + val result = toNumber(NDP_ZOOKEEPER_TIMEOUT, + sparkSession.conf.getOption(NDP_ZOOKEEPER_TIMEOUT).getOrElse("5000"), + _.toInt, "int", sparkSession) + checkLongValue(NDP_ZOOKEEPER_TIMEOUT, result, _ > 0, + s"The $NDP_ZOOKEEPER_TIMEOUT value must be positive", sparkSession) + result + } + + def getNdpAliveOmnidata(sparkSession: SparkSession): Int = { + val result = toNumber(NDP_ALIVE_OMNIDATA, + sparkSession.conf.getOption(NDP_ALIVE_OMNIDATA).getOrElse("0"), + _.toInt, "int", sparkSession) + checkLongValue(NDP_ALIVE_OMNIDATA, result, _ >= 0, + s"The $NDP_ALIVE_OMNIDATA value must be positive", sparkSession) + result + } + + def getNdpFilterSelectivity(sparkSession: SparkSession): Double = { + val result = toNumber(NDP_FILTER_SELECTIVITY, + sparkSession.conf.getOption(NDP_FILTER_SELECTIVITY).getOrElse("0.5"), + _.toDouble, "double", sparkSession) + checkDoubleValue(NDP_FILTER_SELECTIVITY, result, + selectivity => selectivity >= 0.0 && selectivity <= 1.0, + s"The $NDP_FILTER_SELECTIVITY value must be in [0.0, 1.0].", 
sparkSession) + result + } + + def getNdpUdfWhitelist(sparkSession: SparkSession): Option[String] = { + sparkSession.conf.getOption(NDP_UDF_WHITELIST) + } + + def getNdpZookeeperPath(sparkSession: SparkSession): String = { + sparkSession.conf.getOption(NDP_ZOOKEEPER_PATH).getOrElse("/sdi/status") + } + + def getNdpZookeeperAddress(sparkSession: SparkSession): String = { + sparkSession.conf.getOption(NDP_ZOOKEEPER_ADDRESS).getOrElse("") + } + + def getNdpSdiPort(sparkSession: SparkSession): String = { + val result = toNumber(NDP_SDI_PORT, + sparkSession.conf.getOption(NDP_SDI_PORT).getOrElse("9100"), + _.toInt, "int", sparkSession) + checkLongValue(NDP_SDI_PORT, result, _ > 0, + s"The $NDP_SDI_PORT value must be positive", sparkSession) + result.toString + } + + def getNdpGrpcSslEnabled(sparkSession: SparkSession): String = { + toBoolean(NDP_GRPC_SSL_ENABLED, + sparkSession.conf.getOption(NDP_GRPC_SSL_ENABLED).getOrElse("true"), sparkSession).toString + } + + def getNdpGrpcClientCertFilePath(sparkSession: SparkSession): String = { + sparkSession.conf.getOption(NDP_GRPC_CLIENT_CERT_FILE_PATH) + .getOrElse("/opt/conf/client.crt") + } + + def getNdpGrpcClientPrivateKeyFilePath(sparkSession: SparkSession): String = { + sparkSession.conf.getOption(NDP_GRPC_CLIENT_PRIVATE_KEY_FILE_PATH) + .getOrElse("/opt/conf/client.pem") + } + + def getNdpGrpcTrustCaFilePath(sparkSession: SparkSession): String = { + sparkSession.conf.getOption(NDP_GRPC_TRUST_CA_FILE_PATH) + .getOrElse("/opt/conf/ca.crt") + } + + def getNdpPkiDir(sparkSession: SparkSession): String = { + sparkSession.conf.getOption(NDP_PKI_DIR).getOrElse("/opt/conf/") + } +} + diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpSupport.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpSupport.scala new file mode 100644 index 00000000..7b39eb58 --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/execution/ndp/NdpSupport.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.ndp + +import scala.collection.mutable.ListBuffer + +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.aggregate.BaseAggregateExec + +// filter in aggregate could be push down through aggregate, separate filter and aggregate +case class AggExeInfo( + aggregateExpressions: Seq[AggregateFunction], + groupingExpressions: Seq[NamedExpression], + output: Seq[Attribute]) + +// include aggregate filter +case class FilterExeInfo(filter: Expression, output: Seq[Attribute]) + +case class LimitExeInfo(limit: Int) + +case class PushDownInfo( + filterExecutions: Seq[FilterExeInfo], + aggExecutions: Seq[AggExeInfo], + limitExecution: Option[LimitExeInfo], + fpuHosts: scala.collection.Map[String, String]) + +trait NdpSupport extends SparkPlan { + + val filterExeInfos = new ListBuffer[FilterExeInfo]() + val aggExeInfos = new ListBuffer[AggExeInfo]() + var limitExeInfo: Option[LimitExeInfo] = None + var fpuHosts: scala.collection.Map[String, String] = _ + + def pushDownFilter(filter: FilterExeInfo): Unit = { + filterExeInfos += filter + } + + def pushDownAgg(agg: AggExeInfo): Unit = { + aggExeInfos += agg + } + + def pushDownLimit(limit: LimitExeInfo): Unit = { + limitExeInfo = Some(limit) + } + + def pushDown(n: NdpSupport): Unit = { + filterExeInfos ++= n.filterExeInfos + aggExeInfos ++= n.aggExeInfos + limitExeInfo = n.limitExeInfo + } + + def fpuHosts(fpu: scala.collection.Map[String, String]): Unit = { + fpuHosts = fpu + } + + def ndpOperators: PushDownInfo = { + PushDownInfo(filterExeInfos, aggExeInfos, limitExeInfo, fpuHosts) + } + + def isPushDown: Boolean = filterExeInfos.nonEmpty || + aggExeInfos.nonEmpty || + limitExeInfo.nonEmpty +} + +object NdpSupport { + def toAggExecution(agg: BaseAggregateExec): AggExeInfo = { + AggExeInfo(agg.aggregateExpressions.map(_.aggregateFunction), + agg.groupingExpressions, agg.output) + } +} diff --git a/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala new file mode 100644 index 00000000..24f4f5cf --- /dev/null +++ b/omnidata/omnidata-spark-connector/connector/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import java.io.IOException +import java.util.Locale + +import org.apache.hadoop.fs.{FileSystem, Path} + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.planning._ +import org.apache.spark.sql.catalyst.plans.logical.{Filter => LFilter, InsertIntoDir, InsertIntoStatement, LogicalPlan, ScriptTransformation, Statistics} +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.FilterEstimation +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils} +import org.apache.spark.sql.execution.datasources.CreateTable +import org.apache.spark.sql.execution.ndp.NdpConf +import org.apache.spark.sql.execution.ndp.NdpConf.{NDP_ENABLED} +import org.apache.spark.sql.hive.execution._ +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} + + +/** + * Determine the database, serde/format and schema of the Hive serde table, according to the storage + * properties. + */ +class ResolveHiveSerdeTable(session: SparkSession) extends Rule[LogicalPlan] { + private def determineHiveSerde(table: CatalogTable): CatalogTable = { + if (table.storage.serde.nonEmpty) { + table + } else { + if (table.bucketSpec.isDefined) { + throw new AnalysisException("Creating bucketed Hive serde table is not supported yet.") + } + + val defaultStorage = HiveSerDe.getDefaultStorage(session.sessionState.conf) + val options = new HiveOptions(table.storage.properties) + + val fileStorage = if (options.fileFormat.isDefined) { + HiveSerDe.sourceToSerDe(options.fileFormat.get) match { + case Some(s) => + CatalogStorageFormat.empty.copy( + inputFormat = s.inputFormat, + outputFormat = s.outputFormat, + serde = s.serde) + case None => + throw new IllegalArgumentException(s"invalid fileFormat: '${options.fileFormat.get}'") + } + } else if (options.hasInputOutputFormat) { + CatalogStorageFormat.empty.copy( + inputFormat = options.inputFormat, + outputFormat = options.outputFormat) + } else { + CatalogStorageFormat.empty + } + + val rowStorage = if (options.serde.isDefined) { + CatalogStorageFormat.empty.copy(serde = options.serde) + } else { + CatalogStorageFormat.empty + } + + val storage = table.storage.copy( + inputFormat = fileStorage.inputFormat.orElse(defaultStorage.inputFormat), + outputFormat = fileStorage.outputFormat.orElse(defaultStorage.outputFormat), + serde = rowStorage.serde.orElse(fileStorage.serde).orElse(defaultStorage.serde), + properties = options.serdeProperties) + + table.copy(storage = storage) + } + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case c @ CreateTable(t, _, query) if DDLUtils.isHiveTable(t) => + // Finds the database name if the name does not exist. + val dbName = t.identifier.database.getOrElse(session.catalog.currentDatabase) + val table = t.copy(identifier = t.identifier.copy(database = Some(dbName))) + + // Determines the serde/format of Hive tables + val withStorage = determineHiveSerde(table) + + // Infers the schema, if empty, because the schema could be determined by Hive + // serde. + val withSchema = if (query.isEmpty) { + val inferred = HiveUtils.inferSchema(withStorage) + if (inferred.schema.length <= 0) { + throw new AnalysisException("Unable to infer the schema. 
" + + s"The schema specification is required to create the table ${inferred.identifier}.") + } + inferred + } else { + withStorage + } + + c.copy(tableDesc = withSchema) + } +} + +class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { + private def hiveTableWithStats(relation: HiveTableRelation): HiveTableRelation = { + val table = relation.tableMeta + val partitionCols = relation.partitionCols + val conf = session.sessionState.conf + // For partitioned tables, the partition directory may be outside of the table directory. + // Which is expensive to get table size. Please see how we implemented it in the AnalyzeTable. + val sizeInBytes = if (conf.fallBackToHdfsForStatsEnabled && partitionCols.isEmpty) { + try { + val hadoopConf = session.sessionState.newHadoopConf() + val tablePath = new Path(table.location) + val fs: FileSystem = tablePath.getFileSystem(hadoopConf) + fs.getContentSummary(tablePath).getLength + } catch { + case e: IOException => + logWarning("Failed to get table size from HDFS.", e) + conf.defaultSizeInBytes + } + } else { + conf.defaultSizeInBytes + } + + val stats = Some(Statistics(sizeInBytes = BigInt(sizeInBytes))) + relation.copy(tableStats = stats) + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case relation: HiveTableRelation + if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty => + hiveTableWithStats(relation) + + // handles InsertIntoStatement specially as the table in InsertIntoStatement is not added in its + // children, hence not matched directly by previous HiveTableRelation case. + case i @ InsertIntoStatement(relation: HiveTableRelation, _, _, _, _) + if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty => + i.copy(table = hiveTableWithStats(relation)) + } +} + +/** + * Replaces generic operations with specific variants that are designed to work with Hive. + * + * Note that, this rule must be run after `PreprocessTableCreation` and + * `PreprocessTableInsertion`. + */ +object HiveAnalysis extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case InsertIntoStatement(r: HiveTableRelation, partSpec, query, overwrite, ifPartitionNotExists) + if DDLUtils.isHiveTable(r.tableMeta) => + InsertIntoHiveTable(r.tableMeta, partSpec, query, overwrite, + ifPartitionNotExists, query.output.map(_.name)) + + case CreateTable(tableDesc, mode, None) if DDLUtils.isHiveTable(tableDesc) => + CreateTableCommand(tableDesc, ignoreIfExists = mode == SaveMode.Ignore) + + case CreateTable(tableDesc, mode, Some(query)) + if DDLUtils.isHiveTable(tableDesc) && query.resolved => + CreateHiveTableAsSelectCommand(tableDesc, query, query.output.map(_.name), mode) + + case InsertIntoDir(isLocal, storage, provider, child, overwrite) + if DDLUtils.isHiveTable(provider) && child.resolved => + val outputPath = new Path(storage.locationUri.get) + if (overwrite) DDLUtils.verifyNotReadPath(child, outputPath) + + InsertIntoHiveDirCommand(isLocal, storage, child, overwrite, child.output.map(_.name)) + } +} + +/** + * Relation conversion from metastore relations to data source relations for better performance + * + * - When writing to non-partitioned Hive-serde Parquet/Orc tables + * - When scanning Hive-serde Parquet/ORC tables + * + * This rule must be run before all other DDL post-hoc resolution rules, i.e. + * `PreprocessTableCreation`, `PreprocessTableInsertion`, `DataSourceAnalysis` and `HiveAnalysis`. 
+ */ +case class RelationConversions( + conf: SQLConf, + sessionCatalog: HiveSessionCatalog) extends Rule[LogicalPlan] { + private def isInsertConvertible(relation: HiveTableRelation, isPushDown : Boolean): Boolean = { + isInsertConvertible(relation.tableMeta, isPushDown) + } + + private def isRelationConvertible(relation: HiveTableRelation, isPushDown : Boolean): Boolean = { + isRelationConvertible(relation.tableMeta, isPushDown) + } + + private def isConvertible(tableMeta: CatalogTable): Boolean = { + val serde = tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT) + serde.contains("parquet") && SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_PARQUET) || + serde.contains("orc") && SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_ORC) + } + + private def isRelationConvertible(tableMeta: CatalogTable, isPushDown : Boolean): Boolean = { + val serde = tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT) + serde.contains("parquet") && (SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_PARQUET) + || isPushDown) || serde.contains("orc") && + (SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_ORC) || isPushDown) + } + + private def isInsertConvertible(tableMeta: CatalogTable, isPushDown : Boolean): Boolean = { + val serde = tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT) + serde.contains("parquet") && (SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_PARQUET) + && !isPushDown) || serde.contains("orc") && + (SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_ORC) && !isPushDown) + } + + private val metastoreCatalog = sessionCatalog.metastoreCatalog + + override def apply(plan: LogicalPlan): LogicalPlan = { + val isPushDown = NdpConf.toBoolean(NDP_ENABLED, + conf.getConfString("spark.sql.ndp.enabled", "true"), conf) + plan resolveOperators { + // Write path + case InsertIntoStatement( + r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists) + if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && + (!r.isPartitioned || SQLConf.get.getConf(HiveUtils.CONVERT_INSERTING_PARTITIONED_TABLE)) + && isInsertConvertible(r, isPushDown) => + InsertIntoStatement(metastoreCatalog.convert(r), partition, + query, overwrite, ifPartitionNotExists) + + // Read path + case relation: HiveTableRelation + if DDLUtils.isHiveTable(relation.tableMeta) && + isRelationConvertible(relation, isPushDown) => + metastoreCatalog.convert(relation) + + // CTAS + case CreateTable(tableDesc, mode, Some(query)) + if DDLUtils.isHiveTable(tableDesc) && tableDesc.partitionColumnNames.isEmpty && + isConvertible(tableDesc) && SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_CTAS) => + // validation is required to be done here before relation conversion. + DDLUtils.checkDataColNames(tableDesc.copy(schema = query.schema)) + OptimizedCreateHiveTableAsSelectCommand( + tableDesc, query, query.output.map(_.name), mode) + } + } +} + +private[hive] trait HiveStrategies { + // Possibly being too clever with types here... or not clever enough. + self: SparkPlanner => + + val sparkSession: SparkSession + + object Scripts extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ScriptTransformation(input, script, output, child, ioschema) => + val hiveIoSchema = HiveScriptIOSchema(ioschema) + ScriptTransformationExec(input, script, output, planLater(child), hiveIoSchema) :: Nil + case _ => Nil + } + } + + /** + * Retrieves data using a HiveTableScan. Partition pruning predicates are also detected and + * applied. 
+ */ + object HiveTableScans extends Strategy { + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ScanOperation(projectList, predicates, relation: HiveTableRelation) => + // Filter out all predicates that only deal with partition keys, these are given to the + // hive table scan operator to be used for partition pruning. + val partitionKeyIds = AttributeSet(relation.partitionCols) + val (pruningPredicates, otherPredicates) = predicates.partition { predicate => + !predicate.references.isEmpty && + predicate.references.subsetOf(partitionKeyIds) + } + + val condition = otherPredicates.reduceLeftOption(And) + val selectivity = if (condition.nonEmpty) { + FilterEstimation(LFilter(condition.get, relation)) + .calculateFilterSelectivity(condition.get) + } else { + None + } + pruneFilterProject( + projectList, + otherPredicates, + identity[Seq[Expression]], + HiveTableScanExec(_, relation, pruningPredicates)(sparkSession), + selectivity) :: Nil + case _ => + Nil + } + } +} diff --git a/omnidata/omnidata-spark-connector/pom.xml b/omnidata/omnidata-spark-connector/pom.xml new file mode 100644 index 00000000..9d9e3f2b --- /dev/null +++ b/omnidata/omnidata-spark-connector/pom.xml @@ -0,0 +1,19 @@ + + + 4.0.0 + + org.apache.spark + omnidata-spark-connector-root + OmniData - Spark Connector Root + 1.0.0 + pom + + + stub + connector + + + + diff --git a/omnidata/omnidata-spark-connector/stub/client/pom.xml b/omnidata/omnidata-spark-connector/stub/client/pom.xml new file mode 100644 index 00000000..c38685f6 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/client/pom.xml @@ -0,0 +1,32 @@ + + + + com.huawei.boostkit + omniDataStub + 1.0.0 + + + 4.0.0 + boostkit-omnidata-client-stub + jar + + + + com.huawei.boostkit + boostkit-omnidata-core-stub + 1.0.0 + compile + + + org.apache.spark + spark-sql_2.12 + 3.0.1 + compile + + + + + + diff --git a/omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/impl/DataReaderImpl.java b/omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/impl/DataReaderImpl.java new file mode 100644 index 00000000..82926300 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/reader/impl/DataReaderImpl.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.reader.impl; + +import com.huawei.boostkit.omnidata.decode.Deserializer; +import com.huawei.boostkit.omnidata.model.TaskSource; + +import java.util.Properties; + +public class DataReaderImpl { + + public DataReaderImpl(Properties transProperties, TaskSource taskSource, Deserializer deserializer) { + + } + + public boolean isFinished() { + return true; + } + + + public T getNextPageBlocking() { + return null; + } + + public void close() { + } +} + diff --git a/omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/spark/SparkDeserializer.java b/omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/spark/SparkDeserializer.java new file mode 100644 index 00000000..133d20f2 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/client/src/main/java/com/huawei/boostkit/omnidata/spark/SparkDeserializer.java @@ -0,0 +1,22 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. 
+ */ + +package com.huawei.boostkit.omnidata.spark; + +import com.huawei.boostkit.omnidata.decode.Deserializer; +import com.huawei.boostkit.omnidata.type.DecodeType; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; + + +/** + * Deserialize serialized page to spark writableColumnVector array + * + * @since 2021-03-30 + */ +public class SparkDeserializer implements Deserializer { + + public SparkDeserializer(DecodeType[] columnTypes, int[] columnOrders) { + } +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/pom.xml b/omnidata/omnidata-spark-connector/stub/core/pom.xml new file mode 100644 index 00000000..71980905 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/pom.xml @@ -0,0 +1,29 @@ + + + + + com.huawei.boostkit + omniDataStub + 1.0.0 + + + 4.0.0 + jar + 1.0.0 + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + boostkit-omnidata-core-stub + + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java new file mode 100644 index 00000000..b05db640 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/decode/Deserializer.java @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.decode; + +public interface Deserializer { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java new file mode 100644 index 00000000..50955a1d --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniDataException.java @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.huawei.boostkit.omnidata.exception; + +import static com.huawei.boostkit.omnidata.exception.OmniErrorCode.OMNIDATA_GENERIC_ERROR; + +public class OmniDataException extends RuntimeException { + public OmniErrorCode getErrorCode() { + return OMNIDATA_GENERIC_ERROR; + } +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java new file mode 100644 index 00000000..0266fb44 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/exception/OmniErrorCode.java @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.exception; + +public enum OmniErrorCode { + OMNIDATA_GENERIC_ERROR, + OMNIDATA_UNSUPPORTED_OPERATOR, + OMNIDATA_INSUFFICIENT_RESOURCES, + OMNIDATA_INVALID_ARGUMENT, + OMNIDATA_IO_ERROR, + OMNIDATA_NOT_FOUND + + } + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java new file mode 100644 index 00000000..a2ab3fb0 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/AggregationInfo.java @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.huawei.boostkit.omnidata.model; + +import io.prestosql.spi.relation.CallExpression; +import io.prestosql.spi.relation.RowExpression; + +import java.util.List; +import java.util.Map; + +public class AggregationInfo { + + public AggregationInfo(Map aggregations, List groupingKeys) { + } + + public static class AggregateFunction { + + public AggregateFunction(CallExpression callExpression, boolean isDistinct) { + } + } +} diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java new file mode 100644 index 00000000..21b2364e --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Column.java @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model; + + +import io.prestosql.spi.type.Type; + +public class Column { + public Column(int fieldId, String name, Type type, boolean isPartiontionKey, Object partiontionKeyValues) { + } + + public Column(int fieldId, String name, Type type) { + } +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java new file mode 100644 index 00000000..9130f9be --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/Predicate.java @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.huawei.boostkit.omnidata.model; + +import io.prestosql.spi.predicate.Domain; +import io.prestosql.spi.relation.RowExpression; +import io.prestosql.spi.type.Type; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; + +public class Predicate { + public Predicate(List types, List columns, Optional filter, List projections, + Map domainMap, Map bloomFilters, Optional aggregations, + OptionalLong limit) { + } + +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java new file mode 100644 index 00000000..be43032b --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/TaskSource.java @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model; + + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; + +public class TaskSource { + public TaskSource(DataSource dataSource, Predicate predicate, int maxPageSizeInBytes) { + } +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java new file mode 100644 index 00000000..3ed8fba9 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/DataSource.java @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model.datasource; + +public class DataSource { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java new file mode 100644 index 00000000..4ecdd706 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsOrcDataSource.java @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2018-2021. 
Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model.datasource.hdfs; + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; + +public class HdfsOrcDataSource extends DataSource { + public HdfsOrcDataSource(String path, long start, long length, boolean useColumnNames) { + } +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java new file mode 100644 index 00000000..2b74a4d0 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/model/datasource/hdfs/HdfsParquetDataSource.java @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2018-2021. Huawei Technologies Co., Ltd. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.huawei.boostkit.omnidata.model.datasource.hdfs; + +import com.huawei.boostkit.omnidata.model.datasource.DataSource; + +public class HdfsParquetDataSource extends DataSource { + public HdfsParquetDataSource(String path, long start, long length, boolean useColumnNames) { + } +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ArrayDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ArrayDecodeType.java new file mode 100644 index 00000000..aa885f21 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ArrayDecodeType.java @@ -0,0 +1,16 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. 
+ */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Array decode type + * + * @param decode type + * @since 2021-07-31 + */ +public class ArrayDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/BooleanDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/BooleanDecodeType.java new file mode 100644 index 00000000..7d3f3a5a --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/BooleanDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Boolean decode type + * + * @since 2021-07-31 + */ +public class BooleanDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ByteDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ByteDecodeType.java new file mode 100644 index 00000000..fda1464c --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ByteDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Byte decode type + * + * @since 2021-07-31 + */ +public class ByteDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DateDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DateDecodeType.java new file mode 100644 index 00000000..6b636532 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DateDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Date Decode Type + * + * @since 2021-07-31 + */ +public class DateDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DecodeType.java new file mode 100644 index 00000000..ac555565 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Decode java type + * + * @since 2020-07-31 + */ +public interface DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DoubleDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DoubleDecodeType.java new file mode 100644 index 00000000..9043123f --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/DoubleDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. 
+ */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Double decode type + * + * @since 2021-07-31 + */ +public class DoubleDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/FloatDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/FloatDecodeType.java new file mode 100644 index 00000000..e4a318c5 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/FloatDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Float decode type + * + * @since 2020-07-31 + */ +public class FloatDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/IntDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/IntDecodeType.java new file mode 100644 index 00000000..9d5ad0ae --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/IntDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Int decode type + * + * @since 2021-07-31 + */ +public class IntDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongDecodeType.java new file mode 100644 index 00000000..cb320b73 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Long decode type + * + * @since 2021-07-31 + */ +public class LongDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToByteDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToByteDecodeType.java new file mode 100644 index 00000000..443da19c --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToByteDecodeType.java @@ -0,0 +1,13 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + +/** + * Long To Byte decode + * + * @since 2021-08-26 + */ +public class LongToByteDecodeType implements DecodeType { +} diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToFloatDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToFloatDecodeType.java new file mode 100644 index 00000000..608cdefb --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToFloatDecodeType.java @@ -0,0 +1,14 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. 
+ */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Double To Float decode + * + * @since 2021-08-26 + */ +public class LongToFloatDecodeType implements DecodeType { +} diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToIntDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToIntDecodeType.java new file mode 100644 index 00000000..18921dd3 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToIntDecodeType.java @@ -0,0 +1,14 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Long To Int decode + * + * @since 2021-08-26 + */ +public class LongToIntDecodeType implements DecodeType { +} diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToShortDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToShortDecodeType.java new file mode 100644 index 00000000..a221f7f5 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/LongToShortDecodeType.java @@ -0,0 +1,14 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * 功能描述 + * + * @since 2021-08-26 + */ +public class LongToShortDecodeType implements DecodeType { +} diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/MapDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/MapDecodeType.java new file mode 100644 index 00000000..f56d322f --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/MapDecodeType.java @@ -0,0 +1,17 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * MapDecode type + * + * @param k + * @param v + * @since 2021-07-31 + */ +public class MapDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/RowDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/RowDecodeType.java new file mode 100644 index 00000000..14e6c4d1 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/RowDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Row decode type + * + * @since 2020-07-31 + */ +public class RowDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ShortDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ShortDecodeType.java new file mode 100644 index 00000000..9f23af26 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/ShortDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. 
+ */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Short decode type + * + * @since 2021-07-31 + */ +public class ShortDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/VarcharDecodeType.java b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/VarcharDecodeType.java new file mode 100644 index 00000000..d04cfc4e --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/core/src/main/java/com/huawei/boostkit/omnidata/type/VarcharDecodeType.java @@ -0,0 +1,15 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. + */ + +package com.huawei.boostkit.omnidata.type; + + +/** + * Varchar decode type + * + * @since 2020-07-31 + */ +public class VarcharDecodeType implements DecodeType { +} + diff --git a/omnidata/omnidata-spark-connector/stub/pom.xml b/omnidata/omnidata-spark-connector/stub/pom.xml new file mode 100644 index 00000000..42de7772 --- /dev/null +++ b/omnidata/omnidata-spark-connector/stub/pom.xml @@ -0,0 +1,68 @@ + + + + org.apache.spark + omnidata-spark-connector-root + 1.0.0 + + + 4.0.0 + + com.huawei.boostkit + omniDataStub + pom + 1.0.0 + + + core + client + + + + 1.4.0 + + + + + io.hetu.core + presto-spi + ${dep.hetu.version} + + + io.hetu.core + hetu-transport + ${dep.hetu.version} + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.1.2 + + + + true + true + + + + + + + jar + + + + default-jar + none + + + + + +
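The stub modules above only declare the surface of the OmniData client API; the connector sources earlier in this patch (NdpConf, NdpSupport, HiveStrategies) are what actually drive push-down. As an orientation aid only, and not part of the patch itself, the Scala sketch below shows how those pieces are intended to fit together: the NdpConf getters read connector settings from a SparkSession, and FilterExeInfo/LimitExeInfo/PushDownInfo carry the operators that NdpSupport collects. The object name NdpPushDownSketch and the local[*] session are illustrative assumptions; the config key "spark.sql.ndp.enabled" and the case-class signatures come from the patch, and the getters are assumed to live on the NdpConf object that HiveStrategies.scala imports.

    // Illustrative sketch only; not part of this patch.
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GreaterThan, Literal}
    import org.apache.spark.sql.execution.ndp.{FilterExeInfo, LimitExeInfo, NdpConf, PushDownInfo}
    import org.apache.spark.sql.types.IntegerType

    object NdpPushDownSketch {                         // hypothetical driver object
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("ndp-pushdown-sketch")
          .master("local[*]")
          .config("spark.sql.ndp.enabled", "true")     // key read by RelationConversions above
          .getOrCreate()

        // Connector settings resolved through the NdpConf getters added in this patch.
        val zkAddress = NdpConf.getNdpZookeeperAddress(spark)  // "" when unset
        val zkPath    = NdpConf.getNdpZookeeperPath(spark)     // defaults to "/sdi/status"

        // The shapes NdpSupport collects: one filter plus an optional limit, no aggregates.
        val id       = AttributeReference("id", IntegerType)()
        val filter   = FilterExeInfo(GreaterThan(id, Literal(10)), Seq(id))
        val pushDown = PushDownInfo(Seq(filter), Seq.empty, Some(LimitExeInfo(100)), Map.empty)

        println(s"zk=$zkAddress$zkPath pushDown=$pushDown")
        spark.stop()
      }
    }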