Skip to content

Commit 68e47fd

Browse files
committed
Merge all convert_and_insert and getTableStructureFromData v1
1 parent 31d1497 commit 68e47fd

File tree

4 files changed

+294
-56
lines changed

4 files changed

+294
-56
lines changed

src/Processors/Sources/PythonSource.cpp

Lines changed: 62 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
1+
#include <Columns/ColumnDecimal.h>
12
#include <Columns/ColumnString.h>
23
#include <Columns/IColumn.h>
4+
#include <DataTypes/DataTypeDecimalBase.h>
35
#include <DataTypes/DataTypeString.h>
6+
#include <DataTypes/DataTypesDecimal.h>
47
#include <DataTypes/DataTypesNumber.h>
8+
#include <DataTypes/IDataType.h>
59
#include <Processors/Sources/PythonSource.h>
610
#include <Storages/StoragePython.h>
11+
#include <base/Decimal.h>
712
#include <pybind11/gil.h>
813
#include <pybind11/pytypes.h>
914
#include <Common/Exception.h>
1015
#include <Common/logger_useful.h>
16+
#include <base/Decimal_fwd.h>
17+
#include <base/types.h>
1118

1219
namespace DB
1320
{
@@ -18,70 +25,50 @@ PythonSource::PythonSource(std::shared_ptr<PyReader> reader_, const Block & samp
1825
}
1926

2027
template <typename T>
21-
ColumnPtr convert_and_insert(py::object obj)
28+
ColumnPtr convert_and_insert(py::object obj, UInt32 scale = 0)
2229
{
23-
auto column = ColumnVector<T>::create();
24-
// if obj is a list
25-
if (py::isinstance<py::list>(obj))
26-
{
27-
py::list list = obj.cast<py::list>();
28-
for (auto && i : list)
29-
column->insert(i.cast<T>());
30-
// free the list
31-
list.dec_ref();
32-
}
33-
else if (py::isinstance<py::array>(obj)) // if obj is a numpy array
34-
{
35-
py::array array = obj.cast<py::array>();
36-
//chdb: array is a numpy array, so we can directly cast it to a vector?
37-
for (auto && i : array)
38-
column->insert(i.cast<T>());
39-
// free the array, until we implement with zero copy
40-
array.dec_ref();
41-
}
30+
MutableColumnPtr column;
31+
if constexpr (std::is_same_v<T, DateTime64> || std::is_same_v<T, Decimal128> || std::is_same_v<T, Decimal256>)
32+
column = ColumnDecimal<T>::create(0, scale);
33+
else if constexpr (std::is_same_v<T, String>)
34+
column = ColumnString::create();
4235
else
43-
{
44-
throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported type {}", obj.get_type().attr("__name__").cast<std::string>());
45-
}
46-
return column;
47-
}
36+
column = ColumnVector<T>::create();
4837

49-
template <>
50-
ColumnPtr convert_and_insert<String>(py::object obj)
51-
{
52-
auto column = ColumnString::create();
5338
if (py::isinstance<py::list>(obj))
5439
{
5540
py::list list = obj.cast<py::list>();
5641
for (auto && i : list)
57-
column->insert(i.cast<String>());
58-
// free the list
42+
column->insert(i.cast<T>());
5943
list.dec_ref();
6044
}
6145
else if (py::isinstance<py::array>(obj))
6246
{
6347
py::array array = obj.cast<py::array>();
6448
for (auto && i : array)
65-
column->insert(i.cast<String>());
66-
// free the array, until we implement with zero copy
49+
column->insert(i.cast<T>());
6750
array.dec_ref();
6851
}
6952
else
7053
{
71-
throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported type {}", obj.get_type().attr("__name__").cast<std::string>());
54+
throw Exception(
55+
ErrorCodes::BAD_TYPE_OF_FIELD,
56+
"Unsupported type {} for value {}",
57+
obj.get_type().attr("__name__").cast<std::string>(),
58+
py::str(obj).cast<std::string>());
7259
}
7360
return column;
7461
}
7562

7663
Chunk PythonSource::generate()
7764
{
7865
size_t num_rows = 0;
79-
66+
std::vector<py::object> data;
8067
try
8168
{
8269
// GIL is held when called from Python code. Release it to avoid deadlock
8370
py::gil_scoped_release release;
84-
std::vector<py::object> data = reader->read(description.sample_block.getNames(), max_block_size);
71+
data = reader->read(description.sample_block.getNames(), max_block_size);
8572

8673
LOG_DEBUG(logger, "Read {} columns", data.size());
8774
LOG_DEBUG(logger, "Need {} columns", description.sample_block.columns());
@@ -122,31 +109,58 @@ Chunk PythonSource::generate()
122109
num_rows = py::len(data[i]);
123110
const auto & column = data[i];
124111
const auto & type = description.sample_block.getByPosition(i).type;
112+
WhichDataType which(type);
125113

126-
if (type->equals(*std::make_shared<DataTypeUInt8>()))
114+
if (which.isUInt8())
127115
columns[i] = convert_and_insert<UInt8>(column);
128-
else if (type->equals(*std::make_shared<DataTypeUInt16>()))
116+
else if (which.isUInt16())
129117
columns[i] = convert_and_insert<UInt16>(column);
130-
else if (type->equals(*std::make_shared<DataTypeUInt32>()))
118+
else if (which.isUInt32())
131119
columns[i] = convert_and_insert<UInt32>(column);
132-
else if (type->equals(*std::make_shared<DataTypeUInt64>()))
120+
else if (which.isUInt64())
133121
columns[i] = convert_and_insert<UInt64>(column);
134-
else if (type->equals(*std::make_shared<DataTypeInt8>()))
122+
else if (which.isUInt128())
123+
columns[i] = convert_and_insert<UInt128>(column);
124+
else if (which.isUInt256())
125+
columns[i] = convert_and_insert<UInt256>(column);
126+
else if (which.isInt8())
135127
columns[i] = convert_and_insert<Int8>(column);
136-
else if (type->equals(*std::make_shared<DataTypeInt16>()))
128+
else if (which.isInt16())
137129
columns[i] = convert_and_insert<Int16>(column);
138-
else if (type->equals(*std::make_shared<DataTypeInt32>()))
130+
else if (which.isInt32())
139131
columns[i] = convert_and_insert<Int32>(column);
140-
else if (type->equals(*std::make_shared<DataTypeInt64>()))
132+
else if (which.isInt64())
141133
columns[i] = convert_and_insert<Int64>(column);
142-
else if (type->equals(*std::make_shared<DataTypeFloat32>()))
134+
else if (which.isInt128())
135+
columns[i] = convert_and_insert<Int128>(column);
136+
else if (which.isInt256())
137+
columns[i] = convert_and_insert<Int256>(column);
138+
else if (which.isFloat32())
143139
columns[i] = convert_and_insert<Float32>(column);
144-
else if (type->equals(*std::make_shared<DataTypeFloat64>()))
140+
else if (which.isFloat64())
145141
columns[i] = convert_and_insert<Float64>(column);
146-
else if (type->equals(*std::make_shared<DataTypeString>()))
142+
else if (which.isDecimal128())
143+
{
144+
const auto & dtype = typeid_cast<const DataTypeDecimal<Decimal128> *>(type.get());
145+
columns[i] = convert_and_insert<Decimal128>(column, dtype->getScale());
146+
}
147+
else if (which.isDecimal256())
148+
{
149+
const auto & dtype = typeid_cast<const DataTypeDecimal<Decimal256> *>(type.get());
150+
columns[i] = convert_and_insert<Decimal256>(column, dtype->getScale());
151+
}
152+
else if (which.isDateTime())
153+
columns[i] = convert_and_insert<UInt32>(column);
154+
else if (which.isDateTime64())
155+
columns[i] = convert_and_insert<DateTime64>(column);
156+
else if (which.isString())
147157
columns[i] = convert_and_insert<String>(column);
148158
else
149-
throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported type {}", type->getName());
159+
throw Exception(
160+
ErrorCodes::BAD_TYPE_OF_FIELD,
161+
"Unsupported type {} for column {}",
162+
type->getName(),
163+
description.sample_block.getByPosition(i).name);
150164
}
151165
// Set data vector to empty to avoid trigger py::object destructor without GIL
152166
// Note: we have already manually decremented the reference count of the list or array in `convert_and_insert` function

src/Storages/StoragePython.cpp

Lines changed: 132 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,26 @@
11
#include <Columns/IColumn.h>
2+
#include <DataTypes/DataTypeDate.h>
3+
#include <DataTypes/DataTypeDate32.h>
4+
#include <DataTypes/DataTypeDateTime.h>
5+
#include <DataTypes/DataTypeString.h>
6+
#include <DataTypes/DataTypesNumber.h>
27
#include <Functions/FunctionsConversion.h>
38
#include <Interpreters/evaluateConstantExpression.h>
49
#include <Processors/Sources/PythonSource.h>
10+
#include <Storages/ColumnsDescription.h>
511
#include <Storages/IStorage.h>
612
#include <Storages/StorageFactory.h>
713
#include <Storages/StoragePython.h>
14+
#include <base/types.h>
815
#include <pybind11/functional.h>
16+
#include <pybind11/numpy.h>
917
#include <pybind11/pybind11.h>
18+
#include <pybind11/pytypes.h>
1019
#include <pybind11/stl.h>
20+
#include <re2/re2.h>
21+
#include <Poco/Logger.h>
1122
#include <Common/Exception.h>
23+
#include <Common/logger_useful.h>
1224

1325
#include <any>
1426

@@ -22,8 +34,6 @@ extern const int LOGICAL_ERROR;
2234
extern const int BAD_TYPE_OF_FIELD;
2335
}
2436

25-
namespace py = pybind11;
26-
2737

2838
StoragePython::StoragePython(
2939
const StorageID & table_id_,
@@ -66,6 +76,126 @@ Block StoragePython::prepareSampleBlock(const Names & column_names, const Storag
6676
return sample_block;
6777
}
6878

79+
/// Builds a ColumnsDescription from the schema reported by a Python reader.
///
/// `reader->getSchema()` yields (column name, type description) pairs. Each type description is
/// tested against a fixed list of regular expressions, in the order of the `if` chain below, and
/// the first pattern that matches decides the ClickHouse data type of the column.
///
/// @param reader  Python reader to query for its schema; must not be null.
/// @return        Column names and inferred types, in schema order.
/// @throws Exception(LOGICAL_ERROR)  if `reader` is null.
/// @throws Exception(TYPE_MISMATCH)  if a type description matches no pattern, or matches a
///                                   numeric family with an unsupported bit width (e.g. `int24`).
ColumnsDescription StoragePython::getTableStructureFromData(std::shared_ptr<PyReader> reader)
{
    if (!reader)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Python reader not initialized");
    auto schema = reader->getSchema();

    auto * logger = &Poco::Logger::get("StoragePython");
    if (logger->debug())
    {
        LOG_DEBUG(logger, "Schema content:");
        for (const auto & item : schema)
            LOG_DEBUG(logger, "Column: {}, Type: {}", String(item.first), String(item.second));
    }

    // The patterns never change, so compile them once instead of on every call.
    static const RE2 pattern_int(R"(\bint(\d+))");
    static const RE2 pattern_generic_int(R"(\bint\b|<class 'int'>)"); // Matches generic 'int'
    static const RE2 pattern_uint(R"(\buint(\d+))");
    // The family name is a non-capturing group: PartialMatch fills its output arguments from the
    // capturing groups in order, so the bit width must be the first (and only) capture.
    static const RE2 pattern_float(R"(\b(?:float|double)(\d+))");
    static const RE2 pattern_decimal128(R"(decimal128\((\d+),\s*(\d+)\))");
    static const RE2 pattern_decimal256(R"(decimal256\((\d+),\s*(\d+)\))");
    static const RE2 pattern_date32(R"(\bdate32\b)");
    static const RE2 pattern_date64(R"(\bdate64\b)");
    static const RE2 pattern_time32(R"(\btime32\b)");
    // No trailing \b here: ']' is not a word character, so `\]\b` would only match when a word
    // character follows the bracket and would reject a description that ends in "time64[us]".
    static const RE2 pattern_time64_us(R"(\btime64\[us\])");
    static const RE2 pattern_time64_ns(R"(\btime64\[ns\])");
    // NOTE(review): the bare `str` alternative is unanchored and therefore matches any description
    // containing "str" (e.g. "struct"); kept as is because callers may rely on it - confirm.
    static const RE2 pattern_string_binary(
        R"(\bstring\b|<class 'str'>|str|DataType\(string\)|DataType\(binary\)|dtype\[object_\]|dtype\('O'\))");

    NamesAndTypesList names_and_types;

    // Map every (name, type description) pair of the schema to a ClickHouse data type.
    for (const auto & [name, type_str] : schema)
    {
        // Stays null when a numeric family matched but its bit width is unsupported.
        std::shared_ptr<IDataType> data_type;

        std::string bits;
        std::string precision;
        std::string scale;

        if (RE2::PartialMatch(type_str, pattern_int, &bits))
        {
            if (bits == "8")
                data_type = std::make_shared<DataTypeInt8>();
            else if (bits == "16")
                data_type = std::make_shared<DataTypeInt16>();
            else if (bits == "32")
                data_type = std::make_shared<DataTypeInt32>();
            else if (bits == "64")
                data_type = std::make_shared<DataTypeInt64>();
            else if (bits == "128")
                data_type = std::make_shared<DataTypeInt128>();
            else if (bits == "256")
                data_type = std::make_shared<DataTypeInt256>();
        }
        else if (RE2::PartialMatch(type_str, pattern_uint, &bits))
        {
            if (bits == "8")
                data_type = std::make_shared<DataTypeUInt8>();
            else if (bits == "16")
                data_type = std::make_shared<DataTypeUInt16>();
            else if (bits == "32")
                data_type = std::make_shared<DataTypeUInt32>();
            else if (bits == "64")
                data_type = std::make_shared<DataTypeUInt64>();
            else if (bits == "128")
                data_type = std::make_shared<DataTypeUInt128>();
            else if (bits == "256")
                data_type = std::make_shared<DataTypeUInt256>();
        }
        else if (RE2::PartialMatch(type_str, pattern_generic_int))
        {
            data_type = std::make_shared<DataTypeInt64>(); // Default to 64-bit integers for generic 'int'
        }
        else if (RE2::PartialMatch(type_str, pattern_float, &bits))
        {
            if (bits == "32")
                data_type = std::make_shared<DataTypeFloat32>();
            else if (bits == "64")
                data_type = std::make_shared<DataTypeFloat64>();
        }
        else if (RE2::PartialMatch(type_str, pattern_decimal128, &precision, &scale))
        {
            data_type = std::make_shared<DataTypeDecimal128>(std::stoi(precision), std::stoi(scale));
        }
        else if (RE2::PartialMatch(type_str, pattern_decimal256, &precision, &scale))
        {
            data_type = std::make_shared<DataTypeDecimal256>(std::stoi(precision), std::stoi(scale));
        }
        else if (RE2::PartialMatch(type_str, pattern_date32))
        {
            data_type = std::make_shared<DataTypeDate32>();
        }
        else if (RE2::PartialMatch(type_str, pattern_date64))
        {
            data_type = std::make_shared<DataTypeDateTime64>(3); // date64 corresponds to DateTime64(3)
        }
        else if (RE2::PartialMatch(type_str, pattern_time32))
        {
            data_type = std::make_shared<DataTypeDateTime>();
        }
        else if (RE2::PartialMatch(type_str, pattern_time64_us))
        {
            data_type = std::make_shared<DataTypeDateTime64>(6); // time64[us] corresponds to DateTime64(6)
        }
        else if (RE2::PartialMatch(type_str, pattern_time64_ns))
        {
            data_type = std::make_shared<DataTypeDateTime64>(9); // time64[ns] corresponds to DateTime64(9)
        }
        else if (RE2::PartialMatch(type_str, pattern_string_binary))
        {
            data_type = std::make_shared<DataTypeString>();
        }

        // Covers both "no pattern matched" and "family matched with an unsupported width";
        // a null type must never reach the column list.
        if (!data_type)
            throw Exception(ErrorCodes::TYPE_MISMATCH, "Unrecognized data type: {}", type_str);

        names_and_types.push_back({name, data_type});
    }

    return ColumnsDescription(names_and_types);
}
198+
69199
void registerStoragePython(StorageFactory & factory)
70200
{
71201
factory.registerStorage(

0 commit comments

Comments
 (0)