# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

| 18 | +""" |
| 19 | +This is a utility function that will consumer the data generated by dbgen from TPC-H and convert |
| 20 | +it into a parquet file with the column names as expected by the TPC-H specification. It assumes |
| 21 | +the data generated resides in a path ../../benchmarks/tpch/data relative to the current file, |
| 22 | +as will be generated by the script provided in this repository. |
| 23 | +""" |

import os
import pyarrow
import datafusion

ctx = datafusion.SessionContext()

all_schemas = {}

all_schemas["customer"] = [
    ("C_CUSTKEY", pyarrow.int32()),
    ("C_NAME", pyarrow.string()),
    ("C_ADDRESS", pyarrow.string()),
    ("C_NATIONKEY", pyarrow.int32()),
    ("C_PHONE", pyarrow.string()),
    ("C_ACCTBAL", pyarrow.float32()),
    ("C_MKTSEGMENT", pyarrow.string()),
    ("C_COMMENT", pyarrow.string()),
]

all_schemas["lineitem"] = [
    ("L_ORDERKEY", pyarrow.int32()),
    ("L_PARTKEY", pyarrow.int32()),
    ("L_SUPPKEY", pyarrow.int32()),
    ("L_LINENUMBER", pyarrow.int32()),
    ("L_QUANTITY", pyarrow.float32()),
    ("L_EXTENDEDPRICE", pyarrow.float32()),
    ("L_DISCOUNT", pyarrow.float32()),
    ("L_TAX", pyarrow.float32()),
    ("L_RETURNFLAG", pyarrow.string()),
    ("L_LINESTATUS", pyarrow.string()),
    ("L_SHIPDATE", pyarrow.date32()),
    ("L_COMMITDATE", pyarrow.date32()),
    ("L_RECEIPTDATE", pyarrow.date32()),
    ("L_SHIPINSTRUCT", pyarrow.string()),
    ("L_SHIPMODE", pyarrow.string()),
    ("L_COMMENT", pyarrow.string()),
]

all_schemas["nation"] = [
    ("N_NATIONKEY", pyarrow.int32()),
    ("N_NAME", pyarrow.string()),
    ("N_REGIONKEY", pyarrow.int32()),
    ("N_COMMENT", pyarrow.string()),
]

all_schemas["orders"] = [
    ("O_ORDERKEY", pyarrow.int32()),
    ("O_CUSTKEY", pyarrow.int32()),
    ("O_ORDERSTATUS", pyarrow.string()),
    ("O_TOTALPRICE", pyarrow.float32()),
    ("O_ORDERDATE", pyarrow.date32()),
    ("O_ORDERPRIORITY", pyarrow.string()),
    ("O_CLERK", pyarrow.string()),
    ("O_SHIPPRIORITY", pyarrow.int32()),
    ("O_COMMENT", pyarrow.string()),
]

all_schemas["part"] = [
    ("P_PARTKEY", pyarrow.int32()),
    ("P_NAME", pyarrow.string()),
    ("P_MFGR", pyarrow.string()),
    ("P_BRAND", pyarrow.string()),
    ("P_TYPE", pyarrow.string()),
    ("P_SIZE", pyarrow.int32()),
    ("P_CONTAINER", pyarrow.string()),
    ("P_RETAILPRICE", pyarrow.float32()),
    ("P_COMMENT", pyarrow.string()),
]

all_schemas["partsupp"] = [
    ("PS_PARTKEY", pyarrow.int32()),
    ("PS_SUPPKEY", pyarrow.int32()),
    ("PS_AVAILQTY", pyarrow.int32()),
    ("PS_SUPPLYCOST", pyarrow.float32()),
    ("PS_COMMENT", pyarrow.string()),
]

all_schemas["region"] = [
    ("R_REGIONKEY", pyarrow.int32()),
    ("R_NAME", pyarrow.string()),
    ("R_COMMENT", pyarrow.string()),
]

all_schemas["supplier"] = [
    ("S_SUPPKEY", pyarrow.int32()),
    ("S_NAME", pyarrow.string()),
    ("S_ADDRESS", pyarrow.string()),
    ("S_NATIONKEY", pyarrow.int32()),
    ("S_PHONE", pyarrow.string()),
    ("S_ACCTBAL", pyarrow.float32()),
    ("S_COMMENT", pyarrow.string()),
]

curr_dir = os.path.dirname(os.path.abspath(__file__))
for filename, curr_schema in all_schemas.items():

    # For convenience, convert the schema column names to lowercase
    curr_schema = [(s[0].lower(), s[1]) for s in curr_schema]

    # Pre-collect the output columns so we can ignore the null field added
    # below to handle the trailing | in each row of the file
    output_cols = [r[0] for r in curr_schema]

    # The trailing | in each row requires an extra field during parsing
    curr_schema.append(("some_null", pyarrow.null()))

    schema = pyarrow.schema(curr_schema)

    source_file = os.path.abspath(
        os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv")
    )
    dest_file = os.path.abspath(os.path.join(curr_dir, f"./data/{filename}.parquet"))

    df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")

    df = df.select_columns(*output_cols)

    df.write_parquet(dest_file, compression="snappy")
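
# Optional sanity check (illustrative; dest_file refers to the last table written):
#
#     ctx.read_parquet(dest_file).limit(5).show()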